Merge "Remove page level mprotects"
diff --git a/libc/Android.mk b/libc/Android.mk
index 2ec1281..cf3d48c 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -110,6 +110,7 @@
     bionic/clock.cpp \
     bionic/clone.cpp \
     bionic/cmsg_nxthdr.cpp \
+    bionic/connect.cpp \
     bionic/dirent.cpp \
     bionic/dup2.cpp \
     bionic/epoll_create.cpp \
@@ -140,6 +141,8 @@
     bionic/mkdir.cpp \
     bionic/mkfifo.cpp \
     bionic/mknod.cpp \
+    bionic/mntent.cpp \
+    bionic/NetdClientDispatch.cpp \
     bionic/open.cpp \
     bionic/pause.cpp \
     bionic/pipe.cpp \
@@ -256,7 +259,6 @@
 
 libc_upstream_netbsd_src_files := \
     upstream-netbsd/common/lib/libc/hash/sha1/sha1.c \
-    upstream-netbsd/common/lib/libc/inet/inet_addr.c \
     upstream-netbsd/lib/libc/gen/ftw.c \
     upstream-netbsd/lib/libc/gen/nftw.c \
     upstream-netbsd/lib/libc/gen/nice.c \
@@ -265,9 +267,6 @@
     upstream-netbsd/lib/libc/gen/setjmperr.c \
     upstream-netbsd/lib/libc/gen/utime.c \
     upstream-netbsd/lib/libc/gen/utmp.c \
-    upstream-netbsd/lib/libc/inet/inet_ntoa.c \
-    upstream-netbsd/lib/libc/inet/inet_ntop.c \
-    upstream-netbsd/lib/libc/inet/inet_pton.c \
     upstream-netbsd/lib/libc/isc/ev_streams.c \
     upstream-netbsd/lib/libc/isc/ev_timers.c \
     upstream-netbsd/lib/libc/regex/regcomp.c \
@@ -356,6 +355,18 @@
     upstream-openbsd/lib/libc/locale/wcsxfrm.c \
     upstream-openbsd/lib/libc/locale/wctob.c \
     upstream-openbsd/lib/libc/locale/wctomb.c \
+    upstream-openbsd/lib/libc/net/htonl.c \
+    upstream-openbsd/lib/libc/net/htons.c \
+    upstream-openbsd/lib/libc/net/inet_addr.c \
+    upstream-openbsd/lib/libc/net/inet_lnaof.c \
+    upstream-openbsd/lib/libc/net/inet_makeaddr.c \
+    upstream-openbsd/lib/libc/net/inet_netof.c \
+    upstream-openbsd/lib/libc/net/inet_network.c \
+    upstream-openbsd/lib/libc/net/inet_ntoa.c \
+    upstream-openbsd/lib/libc/net/inet_ntop.c \
+    upstream-openbsd/lib/libc/net/inet_pton.c \
+    upstream-openbsd/lib/libc/net/ntohl.c \
+    upstream-openbsd/lib/libc/net/ntohs.c \
     upstream-openbsd/lib/libc/stdio/asprintf.c \
     upstream-openbsd/lib/libc/stdio/clrerr.c \
     upstream-openbsd/lib/libc/stdio/fdopen.c \
@@ -448,8 +459,6 @@
     upstream-openbsd/lib/libc/stdlib/strtoull.c \
     upstream-openbsd/lib/libc/stdlib/strtoumax.c \
     upstream-openbsd/lib/libc/stdlib/system.c \
-    upstream-openbsd/lib/libc/string/stpcpy.c \
-    upstream-openbsd/lib/libc/string/stpncpy.c \
     upstream-openbsd/lib/libc/string/strcasecmp.c \
     upstream-openbsd/lib/libc/string/strcspn.c \
     upstream-openbsd/lib/libc/string/strdup.c \
@@ -727,7 +736,10 @@
 include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := $(libc_bionic_src_files)
-LOCAL_CFLAGS := $(libc_common_cflags) -Werror
+LOCAL_CFLAGS := $(libc_common_cflags) \
+    -Werror \
+    -Wframe-larger-than=2048 \
+
 LOCAL_CONLYFLAGS := $(libc_common_conlyflags)
 LOCAL_CPPFLAGS := $(libc_common_cppflags)
 LOCAL_C_INCLUDES := $(libc_common_c_includes)
@@ -879,6 +891,7 @@
     bionic/debug_stacktrace.cpp \
     bionic/pthread_debug.cpp \
     bionic/libc_init_dynamic.cpp \
+    bionic/NetdClient.cpp \
 
 LOCAL_MODULE := libc
 LOCAL_ADDITIONAL_DEPENDENCIES := $(libc_common_additional_dependencies)
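
The new -Wframe-larger-than=2048 flag makes the compiler warn whenever a
function's stack frame exceeds 2 KiB, which matters in libc code that may run
on small thread stacks. A minimal sketch of what the warning catches (a
hypothetical function, not part of this change):

    /* Compile with: gcc -c -Wframe-larger-than=2048 frame.c */
    #include <string.h>

    int big_frame(const char* s) {
        char buf[4096];  /* frame exceeds 2048 bytes -> compiler warning */
        strncpy(buf, s, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';
        return buf[0];
    }
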
diff --git a/libc/SYSCALLS.TXT b/libc/SYSCALLS.TXT
index 220c713..e9fb575 100644
--- a/libc/SYSCALLS.TXT
+++ b/libc/SYSCALLS.TXT
@@ -61,7 +61,6 @@
 int     tkill(pid_t tid, int sig)  all
 int     tgkill(pid_t tgid, pid_t tid, int sig)  all
 int     __ptrace:ptrace(int request, int pid, void* addr, void* data)  all
-int     __set_thread_area:set_thread_area(void*  user_desc)  mips,mips64,x86
 
 # <sys/resource.h>
 int getrusage(int, struct rusage*)  all
@@ -235,7 +234,7 @@
 int           socket(int, int, int)              arm,arm64,mips,mips64,x86_64
 int           socketpair(int, int, int, int*)    arm,arm64,mips,mips64,x86_64
 int           bind(int, struct sockaddr*, int)  arm,arm64,mips,mips64,x86_64
-int           connect(int, struct sockaddr*, socklen_t)   arm,arm64,mips,mips64,x86_64
+int           __connect:connect(int, struct sockaddr*, socklen_t)   arm,arm64,mips,mips64,x86_64
 int           listen(int, int)                   arm,arm64,mips,mips64,x86_64
 int           accept(int, struct sockaddr*, socklen_t*)  arm,arm64,mips,mips64,x86_64
 int           accept4(int, struct sockaddr*, socklen_t*, int)  arm,arm64,mips,mips64,x86_64
@@ -254,7 +253,7 @@
 # sockets for x86. These are done as an "indexed" call to socketcall syscall.
 int           socket:socketcall:1(int, int, int) x86
 int           bind:socketcall:2(int, struct sockaddr*, int)  x86
-int           connect:socketcall:3(int, struct sockaddr*, socklen_t)   x86
+int           __connect:socketcall:3(int, struct sockaddr*, socklen_t)   x86
 int           listen:socketcall:4(int, int)                   x86
 int           accept:socketcall:5(int, struct sockaddr*, socklen_t*)  x86
 int           getsockname:socketcall:6(int, struct sockaddr*, socklen_t*)  x86
@@ -331,3 +330,7 @@
 
 # MIPS-specific
 int     _flush_cache:cacheflush(char* addr, const int nbytes, const int op) mips,mips64
+int     __set_tls:set_thread_area(void*) mips,mips64
+
+# x86-specific
+int     __set_thread_area:set_thread_area(void*) x86
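
Renaming the connect stubs to __connect frees the public connect symbol for a
C wrapper that can route through a dispatch table (the new bionic/connect.cpp
and bionic/NetdClientDispatch.cpp listed in Android.mk above). A minimal
sketch of that arrangement; the struct layout and variable name are
assumptions, since this diff does not show those files:

    /* Sketch only; __netdClientDispatch and its layout are assumed. */
    #include <sys/socket.h>

    int __connect(int, const struct sockaddr*, socklen_t);  /* raw syscall stub */

    struct NetdClientDispatch {
        int (*connect)(int, const struct sockaddr*, socklen_t);
    };

    /* Defaults to the raw stub; a netd client library can install a hook. */
    struct NetdClientDispatch __netdClientDispatch = { __connect };

    int connect(int fd, const struct sockaddr* addr, socklen_t addr_length) {
        return __netdClientDispatch.connect(fd, addr, addr_length);
    }
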
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index 2dbcb56..06b1675 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -26,6 +26,8 @@
     upstream-freebsd/lib/libc/string/wcsrchr.c \
     upstream-freebsd/lib/libc/string/wmemcmp.c \
     upstream-openbsd/lib/libc/string/bcopy.c \
+    upstream-openbsd/lib/libc/string/stpcpy.c \
+    upstream-openbsd/lib/libc/string/stpncpy.c \
     upstream-openbsd/lib/libc/string/strlcat.c \
     upstream-openbsd/lib/libc/string/strlcpy.c \
     upstream-openbsd/lib/libc/string/strncat.c \
diff --git a/libc/arch-arm/syscalls/connect.S b/libc/arch-arm/syscalls/__connect.S
similarity index 88%
rename from libc/arch-arm/syscalls/connect.S
rename to libc/arch-arm/syscalls/__connect.S
index f7a5188..8cb026c 100644
--- a/libc/arch-arm/syscalls/connect.S
+++ b/libc/arch-arm/syscalls/__connect.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(connect)
+ENTRY(__connect)
     mov     ip, r7
     ldr     r7, =__NR_connect
     swi     #0
@@ -11,4 +11,4 @@
     bxls    lr
     neg     r0, r0
     b       __set_errno
-END(connect)
+END(__connect)
diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 14cc3f4..7a9eb4e 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -15,6 +15,8 @@
     upstream-freebsd/lib/libc/string/wcsrchr.c \
     upstream-freebsd/lib/libc/string/wmemcmp.c \
     upstream-openbsd/lib/libc/string/bcopy.c \
+    upstream-openbsd/lib/libc/string/stpcpy.c \
+    upstream-openbsd/lib/libc/string/stpncpy.c \
     upstream-openbsd/lib/libc/string/strcat.c \
     upstream-openbsd/lib/libc/string/strcpy.c \
     upstream-openbsd/lib/libc/string/strlcat.c \
diff --git a/libc/arch-arm64/syscalls/connect.S b/libc/arch-arm64/syscalls/__connect.S
similarity index 88%
rename from libc/arch-arm64/syscalls/connect.S
rename to libc/arch-arm64/syscalls/__connect.S
index d3cd43d..c46f418 100644
--- a/libc/arch-arm64/syscalls/connect.S
+++ b/libc/arch-arm64/syscalls/__connect.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(connect)
+ENTRY(__connect)
     stp     x29, x30, [sp, #-16]!
     mov     x29,  sp
     str     x8,       [sp, #-16]!
@@ -18,4 +18,5 @@
     b.hi    __set_errno
 
     ret
-END(connect)
+END(__connect)
+.hidden __connect
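
The added .hidden directive keeps the renamed stub out of libc.so's dynamic
symbol table, so other libraries can only reach the interposable connect()
wrapper. The C-level equivalent of the directive (illustrative only):

    /* Hidden visibility: callable inside libc.so, invisible to other DSOs. */
    __attribute__((visibility("hidden")))
    int __connect(int fd, const struct sockaddr* addr, socklen_t addr_length);
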
diff --git a/libc/arch-mips/mips.mk b/libc/arch-mips/mips.mk
index 53fa223..d7d1df4 100644
--- a/libc/arch-mips/mips.mk
+++ b/libc/arch-mips/mips.mk
@@ -27,6 +27,8 @@
     upstream-freebsd/lib/libc/string/wcsrchr.c \
     upstream-freebsd/lib/libc/string/wmemcmp.c \
     upstream-openbsd/lib/libc/string/bcopy.c \
+    upstream-openbsd/lib/libc/string/stpcpy.c \
+    upstream-openbsd/lib/libc/string/stpncpy.c \
     upstream-openbsd/lib/libc/string/strcat.c \
     upstream-openbsd/lib/libc/string/strcmp.c \
     upstream-openbsd/lib/libc/string/strcpy.c \
@@ -61,7 +63,6 @@
     arch-mips/bionic/memcmp16.S \
     arch-mips/bionic/_setjmp.S \
     arch-mips/bionic/setjmp.S \
-    arch-mips/bionic/__set_tls.c \
     arch-mips/bionic/sigsetjmp.S \
     arch-mips/bionic/syscall.S \
     arch-mips/bionic/vfork.S \
diff --git a/libc/arch-mips/syscalls/connect.S b/libc/arch-mips/syscalls/__connect.S
similarity index 89%
rename from libc/arch-mips/syscalls/connect.S
rename to libc/arch-mips/syscalls/__connect.S
index 6f10652..f7ac916 100644
--- a/libc/arch-mips/syscalls/connect.S
+++ b/libc/arch-mips/syscalls/__connect.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(connect)
+ENTRY(__connect)
     .set noreorder
     .cpload t9
     li v0, __NR_connect
@@ -16,4 +16,4 @@
     j t9
     nop
     .set reorder
-END(connect)
+END(__connect)
diff --git a/libc/arch-mips/syscalls/__set_thread_area.S b/libc/arch-mips/syscalls/__set_tls.S
similarity index 84%
rename from libc/arch-mips/syscalls/__set_thread_area.S
rename to libc/arch-mips/syscalls/__set_tls.S
index f83249e..e5b0ca2 100644
--- a/libc/arch-mips/syscalls/__set_thread_area.S
+++ b/libc/arch-mips/syscalls/__set_tls.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(__set_thread_area)
+ENTRY(__set_tls)
     .set noreorder
     .cpload t9
     li v0, __NR_set_thread_area
@@ -16,4 +16,4 @@
     j t9
     nop
     .set reorder
-END(__set_thread_area)
+END(__set_tls)
diff --git a/libc/arch-mips64/bionic/__set_tls.c b/libc/arch-mips64/bionic/__set_tls.c
deleted file mode 100644
index 38e3a50..0000000
--- a/libc/arch-mips64/bionic/__set_tls.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#include <pthread.h>
-
-extern int __set_thread_area(void *u_info);
-
-int __set_tls(void *ptr)
-{
-    return __set_thread_area(ptr);
-}
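
With the new SYSCALLS.TXT entry (__set_tls:set_thread_area above), the
generated assembly stub carries the __set_tls name directly, so this one-line
forwarding wrapper, and the extra call it cost on every thread setup, can be
deleted. Callers are unaffected; a hypothetical one for illustration:

    /* Same prototype as before; only the intermediate hop disappears. */
    extern int __set_tls(void* ptr);

    static int init_thread_tls(void* tls_area) {
        return __set_tls(tls_area);  /* now traps straight into set_thread_area */
    }
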
diff --git a/libc/arch-mips64/mips64.mk b/libc/arch-mips64/mips64.mk
index 75620d8..b6e0209 100644
--- a/libc/arch-mips64/mips64.mk
+++ b/libc/arch-mips64/mips64.mk
@@ -17,6 +17,8 @@
     upstream-freebsd/lib/libc/string/wcsrchr.c \
     upstream-freebsd/lib/libc/string/wmemcmp.c \
     upstream-openbsd/lib/libc/string/bcopy.c \
+    upstream-openbsd/lib/libc/string/stpcpy.c \
+    upstream-openbsd/lib/libc/string/stpncpy.c \
     upstream-openbsd/lib/libc/string/strcat.c \
     upstream-openbsd/lib/libc/string/strcmp.c \
     upstream-openbsd/lib/libc/string/strcpy.c \
@@ -47,7 +49,6 @@
     arch-mips64/bionic/memcmp16.S \
     arch-mips64/bionic/_setjmp.S \
     arch-mips64/bionic/setjmp.S \
-    arch-mips64/bionic/__set_tls.c \
     arch-mips64/bionic/sigsetjmp.S \
     arch-mips64/bionic/syscall.S \
     arch-mips64/bionic/vfork.S \
diff --git a/libc/arch-mips64/syscalls/connect.S b/libc/arch-mips64/syscalls/__connect.S
similarity index 87%
rename from libc/arch-mips64/syscalls/connect.S
rename to libc/arch-mips64/syscalls/__connect.S
index 8fe2d56..b1475fb 100644
--- a/libc/arch-mips64/syscalls/connect.S
+++ b/libc/arch-mips64/syscalls/__connect.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(connect)
+ENTRY(__connect)
     .set push
     .set noreorder
     li v0, __NR_connect
@@ -22,4 +22,5 @@
     j t9
     move ra, t0
     .set pop
-END(connect)
+END(__connect)
+.hidden __connect
diff --git a/libc/arch-mips64/syscalls/__set_thread_area.S b/libc/arch-mips64/syscalls/__set_tls.S
similarity index 82%
rename from libc/arch-mips64/syscalls/__set_thread_area.S
rename to libc/arch-mips64/syscalls/__set_tls.S
index c28ee4a..f1c31b4 100644
--- a/libc/arch-mips64/syscalls/__set_thread_area.S
+++ b/libc/arch-mips64/syscalls/__set_tls.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(__set_thread_area)
+ENTRY(__set_tls)
     .set push
     .set noreorder
     li v0, __NR_set_thread_area
@@ -22,5 +22,5 @@
     j t9
     move ra, t0
     .set pop
-END(__set_thread_area)
-.hidden __set_thread_area
+END(__set_tls)
+.hidden __set_tls
diff --git a/libc/arch-x86/atom/atom.mk b/libc/arch-x86/atom/atom.mk
new file mode 100644
index 0000000..bf408b4
--- /dev/null
+++ b/libc/arch-x86/atom/atom.mk
@@ -0,0 +1,34 @@
+libc_bionic_src_files_x86 += \
+    arch-x86/atom/string/sse2-bzero-atom.S \
+    arch-x86/atom/string/sse2-index-atom.S \
+    arch-x86/atom/string/sse2-memchr-atom.S \
+    arch-x86/atom/string/sse2-memrchr-atom.S \
+    arch-x86/atom/string/sse2-memset-atom.S \
+    arch-x86/atom/string/sse2-strchr-atom.S \
+    arch-x86/atom/string/sse2-strlen-atom.S \
+    arch-x86/atom/string/sse2-strnlen-atom.S \
+    arch-x86/atom/string/sse2-strrchr-atom.S \
+    arch-x86/atom/string/sse2-wcschr-atom.S \
+    arch-x86/atom/string/sse2-wcsrchr-atom.S \
+    arch-x86/atom/string/sse2-wcslen-atom.S \
+    arch-x86/atom/string/sse2-wcscmp-atom.S \
+    arch-x86/atom/string/ssse3-bcopy-atom.S \
+    arch-x86/atom/string/ssse3-memcmp-atom.S \
+    arch-x86/atom/string/ssse3-memcmp16-atom.S \
+    arch-x86/atom/string/ssse3-memcpy-atom.S \
+    arch-x86/atom/string/ssse3-memmove-atom.S \
+    arch-x86/atom/string/ssse3-strcat-atom.S \
+    arch-x86/atom/string/ssse3-strcmp-atom.S \
+    arch-x86/atom/string/ssse3-strcpy-atom.S \
+    arch-x86/atom/string/ssse3-strlcat-atom.S \
+    arch-x86/atom/string/ssse3-strlcpy-atom.S \
+    arch-x86/atom/string/ssse3-strncat-atom.S \
+    arch-x86/atom/string/ssse3-strncmp-atom.S \
+    arch-x86/atom/string/ssse3-strncpy-atom.S \
+    arch-x86/atom/string/ssse3-wcscat-atom.S \
+    arch-x86/atom/string/ssse3-wcscpy-atom.S \
+    arch-x86/atom/string/ssse3-wmemcmp-atom.S
+
+libc_bionic_src_files_x86 += \
+    arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+    arch-x86/silvermont/string/sse2-stpncpy-slm.S
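
Splitting the x86 string sources into per-variant makefiles only works if the
build pulls in exactly one of them. A plausible include mechanism, assuming
the usual TARGET_ARCH_VARIANT convention; the top-level x86.mk change is not
shown in this diff:

    # Hypothetical fragment of arch-x86/x86.mk:
    ifeq ($(strip $(TARGET_ARCH_VARIANT)),)
    TARGET_ARCH_VARIANT := generic
    endif
    include $(LOCAL_PATH)/arch-x86/$(TARGET_ARCH_VARIANT)/$(TARGET_ARCH_VARIANT).mk
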
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/atom/string/cache.h
similarity index 89%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/atom/string/cache.h
index 9d0a563..823bb1e 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/atom/string/cache.h
@@ -28,15 +28,9 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
 /* Values are optimized for Atom */
 #define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
 #define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
 
 #define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
 #define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
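
With the #if gone, each variant directory pins its own cache geometry at build
time instead of keying off a __slm__ define. Worked out from the constants
above, the thresholds the copy routines end up using are:

    /* Atom:       SHARED_CACHE_SIZE_HALF = (512*1024)/2  = 256 KiB
       Silvermont: SHARED_CACHE_SIZE_HALF = (1024*1024)/2 = 512 KiB
       Copies of at least SHARED_CACHE_SIZE_HALF bytes take the
       non-temporal-store path in the memcpy/memmove routines below. */
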
diff --git a/libc/arch-x86/string/sse2-bzero-atom.S b/libc/arch-x86/atom/string/sse2-bzero-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-bzero-atom.S
rename to libc/arch-x86/atom/string/sse2-bzero-atom.S
diff --git a/libc/arch-x86/string/sse2-index-atom.S b/libc/arch-x86/atom/string/sse2-index-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-index-atom.S
rename to libc/arch-x86/atom/string/sse2-index-atom.S
diff --git a/libc/arch-x86/string/sse2-memchr-atom.S b/libc/arch-x86/atom/string/sse2-memchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-memchr-atom.S
rename to libc/arch-x86/atom/string/sse2-memchr-atom.S
diff --git a/libc/arch-x86/string/sse2-memrchr-atom.S b/libc/arch-x86/atom/string/sse2-memrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-memrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-memrchr-atom.S
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/atom/string/sse2-memset-atom.S
similarity index 98%
rename from libc/arch-x86/string/sse2-memset-atom.S
rename to libc/arch-x86/atom/string/sse2-memset-atom.S
index a54bf51..b0963a1 100644
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ b/libc/arch-x86/atom/string/sse2-memset-atom.S
@@ -29,7 +29,6 @@
 */
 
 #include "cache.h"
-#undef __i686
 
 #ifndef L
 # define L(label)	.L##label
@@ -107,7 +106,7 @@
    jump table with relative offsets.   */
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
     /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
+    call	__x86.get_pc_thunk.bx;				\
     /* Get the address of the jump table.  */			\
     add		$(TABLE - .), %ebx;				\
     /* Get the entry and convert the relative offset to the	\
@@ -117,12 +116,12 @@
     /* We loaded the jump table and adjusted EDX. Go.  */	\
     jmp		*%ebx
 
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
+	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
+	.globl	__x86.get_pc_thunk.bx
+	.hidden	__x86.get_pc_thunk.bx
 	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
+	.type	__x86.get_pc_thunk.bx,@function
+__x86.get_pc_thunk.bx:
 	movl	(%esp), %ebx
 	ret
 #else
@@ -321,7 +320,7 @@
 	mov	$SHARED_CACHE_SIZE, %ebx
 #else
 # if (defined SHARED || defined __PIC__)
-	call	__i686.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
 # else
@@ -340,7 +339,7 @@
 #else
 # if (defined SHARED || defined __PIC__)
 #  define RESTORE_EBX_STATE
-	call	__i686.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
 # else
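
The rename from __i686.get_pc_thunk.bx to __x86.get_pc_thunk.bx matches the
name newer GCC emits, and it also explains the dropped "#undef __i686" lines:
__i686 is a predefined preprocessor macro when targeting i686, so the old
symbol name was only safe after an #undef. A sketch of the hazard in a .S
file run through the C preprocessor:

    /* With -march=i686 the driver predefines __i686 (to 1), so without an
       #undef the old label would be macro-expanded into garbage such as
       "1.get_pc_thunk.bx". A prefix the preprocessor never defines is safe: */
    call    __x86.get_pc_thunk.bx
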
diff --git a/libc/arch-x86/string/sse2-strchr-atom.S b/libc/arch-x86/atom/string/sse2-strchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strchr-atom.S
rename to libc/arch-x86/atom/string/sse2-strchr-atom.S
diff --git a/libc/arch-x86/string/sse2-strlen-atom.S b/libc/arch-x86/atom/string/sse2-strlen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strlen-atom.S
rename to libc/arch-x86/atom/string/sse2-strlen-atom.S
diff --git a/libc/arch-x86/string/sse2-strnlen-atom.S b/libc/arch-x86/atom/string/sse2-strnlen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strnlen-atom.S
rename to libc/arch-x86/atom/string/sse2-strnlen-atom.S
diff --git a/libc/arch-x86/string/sse2-strrchr-atom.S b/libc/arch-x86/atom/string/sse2-strrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-strrchr-atom.S
diff --git a/libc/arch-x86/string/sse2-wcschr-atom.S b/libc/arch-x86/atom/string/sse2-wcschr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcschr-atom.S
rename to libc/arch-x86/atom/string/sse2-wcschr-atom.S
diff --git a/libc/arch-x86/string/sse2-wcscmp-atom.S b/libc/arch-x86/atom/string/sse2-wcscmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcscmp-atom.S
rename to libc/arch-x86/atom/string/sse2-wcscmp-atom.S
diff --git a/libc/arch-x86/string/sse2-wcslen-atom.S b/libc/arch-x86/atom/string/sse2-wcslen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcslen-atom.S
rename to libc/arch-x86/atom/string/sse2-wcslen-atom.S
diff --git a/libc/arch-x86/string/sse2-wcsrchr-atom.S b/libc/arch-x86/atom/string/sse2-wcsrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcsrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-wcsrchr-atom.S
diff --git a/libc/arch-x86/string/ssse3-bcopy-atom.S b/libc/arch-x86/atom/string/ssse3-bcopy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-bcopy-atom.S
rename to libc/arch-x86/atom/string/ssse3-bcopy-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcmp-atom.S b/libc/arch-x86/atom/string/ssse3-memcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcmp16-atom.S b/libc/arch-x86/atom/string/ssse3-memcmp16-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memcmp16-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcmp16-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S
similarity index 99%
rename from libc/arch-x86/string/ssse3-memcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcpy-atom.S
index 1080a38..ac5ec2d 100644
--- a/libc/arch-x86/string/ssse3-memcpy-atom.S
+++ b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S
@@ -29,7 +29,6 @@
 */
 
 #include "cache.h"
-#undef __i686
 
 #ifndef MEMCPY
 # define MEMCPY	memcpy
@@ -101,9 +100,8 @@
 # define RETURN_END	POP (%ebx); ret
 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
 # define JMPTBL(I, B)	I - B
-# undef __i686
 
-# define SETUP_PIC_REG(x)	call	__i686.get_pc_thunk.x
+# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
 	jump table with relative offsets.  INDEX is a register contains the
diff --git a/libc/arch-x86/string/ssse3-memmove-atom.S b/libc/arch-x86/atom/string/ssse3-memmove-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memmove-atom.S
rename to libc/arch-x86/atom/string/ssse3-memmove-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcat-atom.S b/libc/arch-x86/atom/string/ssse3-strcat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcmp-atom.S b/libc/arch-x86/atom/string/ssse3-strcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcpy-atom.S b/libc/arch-x86/atom/string/ssse3-strcpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-strlcat-atom.S b/libc/arch-x86/atom/string/ssse3-strlcat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strlcat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strlcat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strlcpy-atom.S b/libc/arch-x86/atom/string/ssse3-strlcpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strlcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strlcpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncat-atom.S b/libc/arch-x86/atom/string/ssse3-strncat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncmp-atom.S b/libc/arch-x86/atom/string/ssse3-strncmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncpy-atom.S b/libc/arch-x86/atom/string/ssse3-strncpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-wcscat-atom.S b/libc/arch-x86/atom/string/ssse3-wcscat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wcscat-atom.S
rename to libc/arch-x86/atom/string/ssse3-wcscat-atom.S
diff --git a/libc/arch-x86/string/ssse3-wcscpy-atom.S b/libc/arch-x86/atom/string/ssse3-wcscpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wcscpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-wcscpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-wmemcmp-atom.S b/libc/arch-x86/atom/string/ssse3-wmemcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wmemcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-wmemcmp-atom.S
diff --git a/libc/arch-x86/generic/generic.mk b/libc/arch-x86/generic/generic.mk
new file mode 100644
index 0000000..c8b40ee
--- /dev/null
+++ b/libc/arch-x86/generic/generic.mk
@@ -0,0 +1,55 @@
+libc_bionic_src_files_x86 += \
+    arch-x86/atom/string/sse2-index-atom.S \
+    arch-x86/atom/string/sse2-memchr-atom.S \
+    arch-x86/atom/string/sse2-memrchr-atom.S \
+    arch-x86/atom/string/sse2-strchr-atom.S \
+    arch-x86/atom/string/sse2-strnlen-atom.S \
+    arch-x86/atom/string/sse2-strrchr-atom.S \
+    arch-x86/atom/string/sse2-wcschr-atom.S \
+    arch-x86/atom/string/sse2-wcsrchr-atom.S \
+    arch-x86/atom/string/sse2-wcslen-atom.S \
+    arch-x86/atom/string/sse2-wcscmp-atom.S \
+    arch-x86/silvermont/string/sse2-bcopy-slm.S \
+    arch-x86/silvermont/string/sse2-bzero-slm.S \
+    arch-x86/silvermont/string/sse2-memcpy-slm.S \
+    arch-x86/silvermont/string/sse2-memmove-slm.S \
+    arch-x86/silvermont/string/sse2-memset-slm.S \
+    arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+    arch-x86/silvermont/string/sse2-stpncpy-slm.S \
+    arch-x86/silvermont/string/sse2-strcpy-slm.S \
+    arch-x86/silvermont/string/sse2-strlen-slm.S \
+    arch-x86/silvermont/string/sse2-strncpy-slm.S
+
+ifeq ($(ARCH_X86_HAVE_SSSE3),true)
+libc_bionic_src_files_x86 += \
+    arch-x86/atom/string/ssse3-strncat-atom.S \
+    arch-x86/atom/string/ssse3-strlcat-atom.S \
+    arch-x86/atom/string/ssse3-strlcpy-atom.S \
+    arch-x86/atom/string/ssse3-strcmp-atom.S \
+    arch-x86/atom/string/ssse3-strncmp-atom.S \
+    arch-x86/atom/string/ssse3-strcat-atom.S \
+    arch-x86/atom/string/ssse3-memcmp16-atom.S \
+    arch-x86/atom/string/ssse3-wcscat-atom.S \
+    arch-x86/atom/string/ssse3-wcscpy-atom.S
+else
+libc_bionic_src_files_x86 += \
+    arch-x86/generic/string/strcmp.S \
+    arch-x86/generic/string/strncmp.S \
+    arch-x86/generic/string/strcat.S \
+    bionic/__memcmp16.cpp \
+    upstream-freebsd/lib/libc/string/wcscpy.c \
+    upstream-freebsd/lib/libc/string/wcscat.c \
+    upstream-openbsd/lib/libc/string/strlcat.c \
+    upstream-openbsd/lib/libc/string/strlcpy.c \
+    upstream-openbsd/lib/libc/string/strncat.c
+endif
+
+ifeq ($(ARCH_X86_HAVE_SSE4),true)
+ libc_bionic_src_files_x86 += \
+    arch-x86/silvermont/string/sse4-memcmp-slm.S \
+    arch-x86/silvermont/string/sse4-wmemcmp-slm.S
+else
+libc_bionic_src_files_x86 += \
+    arch-x86/generic/string/memcmp.S \
+    upstream-freebsd/lib/libc/string/wmemcmp.c
+endif
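
The SSSE3 and SSE4 blocks key off board-level feature flags, so a device whose
CPU lacks those extensions silently falls back to the generic assembly and C
implementations. A hypothetical BoardConfig.mk selecting this variant:

    # Hypothetical device configuration:
    TARGET_ARCH := x86
    TARGET_ARCH_VARIANT := generic
    ARCH_X86_HAVE_SSSE3 := true
    ARCH_X86_HAVE_SSE4 := false
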
diff --git a/libc/arch-x86/string/bcopy.S b/libc/arch-x86/generic/string/bcopy.S
similarity index 100%
rename from libc/arch-x86/string/bcopy.S
rename to libc/arch-x86/generic/string/bcopy.S
diff --git a/libc/arch-x86/string/memcmp.S b/libc/arch-x86/generic/string/memcmp.S
similarity index 100%
rename from libc/arch-x86/string/memcmp.S
rename to libc/arch-x86/generic/string/memcmp.S
diff --git a/libc/arch-x86/string/memcpy.S b/libc/arch-x86/generic/string/memcpy.S
similarity index 100%
rename from libc/arch-x86/string/memcpy.S
rename to libc/arch-x86/generic/string/memcpy.S
diff --git a/libc/arch-x86/string/memmove.S b/libc/arch-x86/generic/string/memmove.S
similarity index 100%
rename from libc/arch-x86/string/memmove.S
rename to libc/arch-x86/generic/string/memmove.S
diff --git a/libc/arch-x86/string/strcat.S b/libc/arch-x86/generic/string/strcat.S
similarity index 100%
rename from libc/arch-x86/string/strcat.S
rename to libc/arch-x86/generic/string/strcat.S
diff --git a/libc/arch-x86/string/strcmp.S b/libc/arch-x86/generic/string/strcmp.S
similarity index 100%
rename from libc/arch-x86/string/strcmp.S
rename to libc/arch-x86/generic/string/strcmp.S
diff --git a/libc/arch-x86/string/strncmp.S b/libc/arch-x86/generic/string/strncmp.S
similarity index 100%
rename from libc/arch-x86/string/strncmp.S
rename to libc/arch-x86/generic/string/strncmp.S
diff --git a/libc/arch-x86/string/swab.S b/libc/arch-x86/generic/string/swab.S
similarity index 100%
rename from libc/arch-x86/string/swab.S
rename to libc/arch-x86/generic/string/swab.S
diff --git a/libc/arch-x86/silvermont/silvermont.mk b/libc/arch-x86/silvermont/silvermont.mk
new file mode 100644
index 0000000..b951ad5
--- /dev/null
+++ b/libc/arch-x86/silvermont/silvermont.mk
@@ -0,0 +1,34 @@
+libc_bionic_src_files_x86 += \
+    arch-x86/silvermont/string/sse2-bcopy-slm.S \
+    arch-x86/silvermont/string/sse2-bzero-slm.S \
+    arch-x86/silvermont/string/sse2-memcpy-slm.S \
+    arch-x86/silvermont/string/sse2-memmove-slm.S \
+    arch-x86/silvermont/string/sse2-memset-slm.S \
+    arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+    arch-x86/silvermont/string/sse2-stpncpy-slm.S \
+    arch-x86/silvermont/string/sse2-strcpy-slm.S \
+    arch-x86/silvermont/string/sse2-strlen-slm.S \
+    arch-x86/silvermont/string/sse2-strncpy-slm.S \
+    arch-x86/silvermont/string/sse4-memcmp-slm.S \
+    arch-x86/silvermont/string/sse4-wmemcmp-slm.S
+
+libc_bionic_src_files_x86 += \
+    arch-x86/atom/string/sse2-memchr-atom.S \
+    arch-x86/atom/string/sse2-memrchr-atom.S \
+    arch-x86/atom/string/sse2-strchr-atom.S \
+    arch-x86/atom/string/sse2-strrchr-atom.S \
+    arch-x86/atom/string/sse2-index-atom.S \
+    arch-x86/atom/string/sse2-strnlen-atom.S \
+    arch-x86/atom/string/sse2-wcschr-atom.S \
+    arch-x86/atom/string/sse2-wcsrchr-atom.S \
+    arch-x86/atom/string/sse2-wcslen-atom.S \
+    arch-x86/atom/string/sse2-wcscmp-atom.S \
+    arch-x86/atom/string/ssse3-strncat-atom.S \
+    arch-x86/atom/string/ssse3-strlcat-atom.S \
+    arch-x86/atom/string/ssse3-strlcpy-atom.S \
+    arch-x86/atom/string/ssse3-strcmp-atom.S \
+    arch-x86/atom/string/ssse3-strncmp-atom.S \
+    arch-x86/atom/string/ssse3-strcat-atom.S \
+    arch-x86/atom/string/ssse3-memcmp16-atom.S \
+    arch-x86/atom/string/ssse3-wcscat-atom.S \
+    arch-x86/atom/string/ssse3-wcscpy-atom.S
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/cache.h
similarity index 90%
rename from libc/arch-x86/string/cache.h
rename to libc/arch-x86/silvermont/string/cache.h
index 9d0a563..c342b1c 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/cache.h
@@ -28,15 +28,9 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
 /* Values are optimized for Silvermont */
 #define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
 #define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
 
 #define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
 #define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
index 9d0a563..190d52f 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,7 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
 
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define MEMMOVE	bcopy
+#define USE_AS_BCOPY
+#include "sse2-memmove-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-bzero-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-bzero-slm.S
index 9d0a563..b682ed6 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-bzero-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define USE_AS_BZERO
+#define MEMSET  bzero
+#include "sse2-memset-slm.S"
diff --git a/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S
new file mode 100644
index 0000000..1b305c7
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S
@@ -0,0 +1,308 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCPY
+# define MEMCPY	memcpy
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define DEST		PARMS
+#define SRC		DEST+4
+#define LEN		SRC+4
+
+#define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS		8		/* Preserve EBX.  */
+#define ENTRANCE	PUSH (%ebx);
+#define RETURN_END	POP (%ebx); ret
+#define RETURN		RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+	cmp	%eax, %edx
+	je	L(return)
+
+	cmp	$16, %ecx
+	jbe	L(len_0_16_bytes)
+
+	cmp     $SHARED_CACHE_SIZE_HALF, %ecx
+	jae     L(large_page)
+
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	cmpl    $32, %ecx
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	16(%eax), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	cmpl    $64, %ecx
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	32(%eax), %xmm0
+	movdqu	48(%eax), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+	cmpl    $128, %ecx
+	movdqu	%xmm0, 32(%edx)
+	movdqu	%xmm1, 48(%edx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	jbe	L(return)
+
+/* Now the main loop: we align the address of the destination.  */
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+	addl	%edx, %ecx
+	andl	$-64, %ecx
+
+	subl	%edx, %eax
+
+/* Stop two iterations before the end so the prefetches
+	do not run past the source buffer.  */
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_just_one_iteration)
+
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_last_two_iterations)
+
+
+	.p2align 4
+L(main_loop_cache):
+
+	prefetcht0 128(%ebx, %eax)
+
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	lea	64(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	movdqa	%xmm4, 64(%ebx)
+	movdqa	%xmm5, 80(%ebx)
+	movdqa	%xmm6, 96(%ebx)
+	movdqa	%xmm7, 112(%ebx)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	jmp	L(return)
+
+L(large_page):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+
+	movdqu	64(%eax), %xmm0
+	movdqu	80(%eax), %xmm1
+	movdqu	96(%eax), %xmm2
+	movdqu	112(%eax), %xmm3
+	movdqu	-128(%eax, %ecx), %xmm4
+	movdqu	-112(%eax, %ecx), %xmm5
+	movdqu	-96(%eax, %ecx), %xmm6
+	movdqu	-80(%eax, %ecx), %xmm7
+	movdqu	%xmm0, 64(%edx)
+	movdqu	%xmm1, 80(%edx)
+	movdqu	%xmm2, 96(%edx)
+	movdqu	%xmm3, 112(%edx)
+	movdqu	%xmm4, -128(%edx, %ecx)
+	movdqu	%xmm5, -112(%edx, %ecx)
+	movdqu	%xmm6, -96(%edx, %ecx)
+	movdqu	%xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non-temporal stores. We align
+	the address of the destination.  */
+	leal	128(%edx), %ebx
+	andl	$-128, %ebx
+
+	addl	%edx, %ecx
+	andl	$-128, %ecx
+
+	subl	%edx, %eax
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movntdq	%xmm0, (%ebx)
+	movntdq	%xmm1, 16(%ebx)
+	movntdq	%xmm2, 32(%ebx)
+	movntdq	%xmm3, 48(%ebx)
+	movntdq	%xmm4, 64(%ebx)
+	movntdq	%xmm5, 80(%ebx)
+	movntdq	%xmm6, 96(%ebx)
+	movntdq	%xmm7, 112(%ebx)
+	lea	128(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_large_page)
+	sfence
+	jmp	L(return)
+
+L(len_0_16_bytes):
+	testb	$24, %cl
+	jne	L(len_9_16_bytes)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%eax), %ebx
+	testb	$2, %cl
+	movb	%bl, (%edx)
+	je	L(return)
+	movzwl	-2(%eax,%ecx), %ebx
+	movw	%bx, -2(%edx,%ecx)
+	jmp	L(return)
+
+L(len_9_16_bytes):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(len_5_8_bytes):
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+	movl	-4(%eax,%ecx), %ebx
+	movl	%ebx, -4(%edx,%ecx)
+	jmp	L(return)
+
+L(return):
+	movl	%edx, %eax
+	RETURN
+
+END (MEMCPY)
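
One trick recurs throughout the routine above: for each size class it loads
both ends of the region into registers before storing anything, so two
unaligned stores cover the whole range even when they overlap in the middle.
A C rendering of the 17..32-byte case (a sketch, with memcpy standing in for
the movdqu instructions):

    #include <stdint.h>
    #include <string.h>

    /* Copy 17..32 bytes with exactly two 16-byte loads and two 16-byte
       stores. For n < 32 the stores overlap in the middle, which is
       harmless: both hold bytes that were loaded before any store. */
    static void copy_17_to_32(uint8_t* dst, const uint8_t* src, size_t n) {
        uint8_t head[16], tail[16];
        memcpy(head, src, 16);           /* movdqu (%eax), %xmm0         */
        memcpy(tail, src + n - 16, 16);  /* movdqu -16(%eax,%ecx), %xmm1 */
        memcpy(dst, head, 16);           /* movdqu %xmm0, (%edx)         */
        memcpy(dst + n - 16, tail, 16);  /* movdqu %xmm1, -16(%edx,%ecx) */
    }
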
diff --git a/libc/arch-x86/silvermont/string/sse2-memmove-slm.S b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S
new file mode 100644
index 0000000..79a0a36
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S
@@ -0,0 +1,673 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMMOVE
+# define MEMMOVE	memmove
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC           PARMS
+# define DEST          SRC+4
+# define LEN           DEST+4
+#else
+# define DEST          PARMS
+# define SRC           DEST+4
+# define LEN           SRC+4
+#endif
+
+#define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS		8		/* Preserve EBX.  */
+#define ENTRANCE	PUSH (%ebx);
+#define RETURN_END	POP (%ebx); ret
+#define RETURN		RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMMOVE)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+/* Check whether we should copy backward or forward.  */
+	cmp	%eax, %edx
+	je	L(mm_return)
+	ja	L(mm_len_0_or_more_backward)
+
+/* Now check the length. The [0..16], [0..32], [0..64], and [0..128]
+	ranges are handled as separate cases.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmpl    $32, %ecx
+	jg	L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_32_or_more_forward):
+	cmpl    $64, %ecx
+	jg	L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_64_or_more_forward):
+	cmpl    $128, %ecx
+	jg	L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_128_or_more_forward):
+
+	cmp     $SHARED_CACHE_SIZE_HALF, %ecx
+	jae     L(mm_large_page_forward)
+
+	PUSH (%esi)
+	PUSH (%edi)
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+/* Aligning the address of the destination.  */
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	32(%esi), %xmm2
+	movdqu	48(%esi), %xmm3
+
+	leal	64(%edi), %edx
+	andl	$-64, %edx
+
+	movl	%esi, %eax
+	subl	%edi, %eax
+
+	movdqu	(%edx, %eax), %xmm4
+	movdqu	16(%edx, %eax), %xmm5
+	movdqu	32(%edx, %eax), %xmm6
+	movdqu	48(%edx, %eax), %xmm7
+
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm1, 16(%edi)
+	movdqu	%xmm2, 32(%edi)
+	movdqu	%xmm3, 48(%edi)
+	movdqa	%xmm4, (%edx)
+	movdqa	%xmm5, 16(%edx)
+	movdqa	%xmm6, 32(%edx)
+	movdqa	%xmm7, 48(%edx)
+	addl	$64, %edx
+
+	leal	(%edi, %ecx), %ebx
+	andl	$-64, %ebx
+
+	cmp	%edx, %ebx
+	jbe	L(mm_copy_remaining_forward)
+
+	.p2align 4
+L(mm_main_loop_forward):
+
+	prefetcht0 128(%edx, %eax)
+
+	movdqu	(%edx, %eax), %xmm0
+	movdqu	16(%edx, %eax), %xmm1
+	movdqu	32(%edx, %eax), %xmm2
+	movdqu	48(%edx, %eax), %xmm3
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 16(%edx)
+	movdqa	%xmm2, 32(%edx)
+	movdqa	%xmm3, 48(%edx)
+	leal	64(%edx), %edx
+	cmp	%edx, %ebx
+	ja	L(mm_main_loop_forward)
+
+L(mm_copy_remaining_forward):
+	addl	%edi, %ecx
+	subl	%edx, %ecx
+/* Everything up to %edx in the destination has been copied.
+	%ecx now holds the number of bytes left to copy.
+	Advance %esi to match. */
+	leal	(%edx, %eax), %esi
+
+L(mm_remaining_0_64_bytes_forward):
+	cmp	$32, %ecx
+	ja	L(mm_remaining_33_64_bytes_forward)
+	cmp	$16, %ecx
+	ja	L(mm_remaining_17_32_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(mm_return_pop_all)
+
+	cmpb	$8, %cl
+	ja	L(mm_remaining_9_16_bytes_forward)
+	cmpb	$4, %cl
+	.p2align 4,,5
+	ja	L(mm_remaining_5_8_bytes_forward)
+	cmpb	$2, %cl
+	.p2align 4,,1
+	ja	L(mm_remaining_3_4_bytes_forward)
+	movzbl	-1(%esi,%ecx), %eax
+	movzbl	(%esi), %ebx
+	movb	%al, -1(%edx,%ecx)
+	movb	%bl, (%edx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_33_64_bytes_forward):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	-32(%esi, %ecx), %xmm2
+	movdqu	-16(%esi, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -32(%edx, %ecx)
+	movdqu	%xmm3, -16(%edx, %ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_17_32_bytes_forward):
+	movdqu	(%esi), %xmm0
+	movdqu	-16(%esi, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_3_4_bytes_forward):
+	movzwl	-2(%esi,%ecx), %eax
+	movzwl	(%esi), %ebx
+	movw	%ax, -2(%edx,%ecx)
+	movw	%bx, (%edx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_5_8_bytes_forward):
+	movl	(%esi), %eax
+	movl	-4(%esi,%ecx), %ebx
+	movl	%eax, (%edx)
+	movl	%ebx, -4(%edx,%ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_9_16_bytes_forward):
+	movq	(%esi), %xmm0
+	movq	-8(%esi, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(mm_return_pop_all)
+
+
+L(mm_len_0_16_bytes_forward):
+	testb	$24, %cl
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(mm_return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(mm_return)
+
+L(mm_len_2_4_bytes_forward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(mm_return)
+
+L(mm_len_5_8_bytes_forward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(mm_return)
+
+L(mm_len_9_16_bytes_forward):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(mm_return)
+
+/* The code for copying backwards.  */
+L(mm_len_0_or_more_backward):
+
+/* Now check the length. The [0..16], [0..32], [0..64], and [0..128]
+	ranges are handled as separate cases.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmpl    $32, %ecx
+	jg	L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_32_or_more_backward):
+	cmpl    $64, %ecx
+	jg	L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_64_or_more_backward):
+	cmpl    $128, %ecx
+	jg	L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_128_or_more_backward):
+
+	cmp     $SHARED_CACHE_SIZE_HALF, %ecx
+	jae     L(mm_large_page_backward)
+
+	PUSH (%esi)
+	PUSH (%edi)
+
+/* Aligning the address of the destination. We save the last 64 bytes
+	of the source first so the stores below cannot overwrite them.  */
+	movdqu	-16(%eax, %ecx), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+
+	leal	(%edx, %ecx), %edi
+	andl	$-64, %edi
+
+	movl	%eax, %esi
+	subl	%edx, %esi
+
+	movdqu	-16(%edi, %esi), %xmm4
+	movdqu	-32(%edi, %esi), %xmm5
+	movdqu	-48(%edi, %esi), %xmm6
+	movdqu	-64(%edi, %esi), %xmm7
+
+	movdqu	%xmm0, -16(%edx, %ecx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	movdqa	%xmm4, -16(%edi)
+	movdqa	%xmm5, -32(%edi)
+	movdqa	%xmm6, -48(%edi)
+	movdqa	%xmm7, -64(%edi)
+	leal	-64(%edi), %edi
+
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+/* Compute in %ecx how many bytes are left to copy after
+	the main loop stops.  */
+	movl	%ebx, %ecx
+	subl	%edx, %ecx
+
+	cmp	%edi, %ebx
+	jb	L(mm_main_loop_backward)
+
+	POP (%edi)
+	POP (%esi)
+	jmp	L(mm_len_0_or_more_backward)
+
+	.p2align 4
+L(mm_main_loop_backward):
+
+	prefetcht0 -128(%edi, %esi)
+
+	movdqu	-64(%edi, %esi), %xmm0
+	movdqu	-48(%edi, %esi), %xmm1
+	movdqu	-32(%edi, %esi), %xmm2
+	movdqu	-16(%edi, %esi), %xmm3
+	movdqa	%xmm0, -64(%edi)
+	movdqa	%xmm1, -48(%edi)
+	movdqa	%xmm2, -32(%edi)
+	movdqa	%xmm3, -16(%edi)
+	leal	-64(%edi), %edi
+	cmp	%edi, %ebx
+	jb	L(mm_main_loop_backward)
+	POP (%edi)
+	POP (%esi)
+	jmp	L(mm_len_0_or_more_backward)
+
+/* Copy [0..16] and return.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %cl
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(mm_return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(mm_return)
+
+L(mm_len_3_4_bytes_backward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(mm_return)
+
+L(mm_len_9_16_bytes_backward):
+	PUSH (%esi)
+	movl	-4(%eax,%ecx), %ebx
+	movl	-8(%eax,%ecx), %esi
+	movl	%ebx, -4(%edx,%ecx)
+	movl	%esi, -8(%edx,%ecx)
+	subl	$8, %ecx
+	POP (%esi)
+	jmp	L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+
+L(mm_return):
+	movl	%edx, %eax
+	RETURN
+
+L(mm_return_pop_all):
+	movl	%edi, %eax
+	POP (%edi)
+	POP (%esi)
+	RETURN
+
+/* Big length copy forward part.  */
+
+L(mm_large_page_forward):
+/* Aligning the address of the destination. We save the first 64 bytes
+	of the source first so the stores below cannot overwrite them.  */
+
+	PUSH (%esi)
+	PUSH (%edi)
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	32(%esi), %xmm2
+	movdqu	48(%esi), %xmm3
+
+	leal	64(%edi), %edx
+	andl	$-64, %edx
+
+	movl	%esi, %eax
+	subl	%edi, %eax
+
+	movdqu	(%edx, %eax), %xmm4
+	movdqu	16(%edx, %eax), %xmm5
+	movdqu	32(%edx, %eax), %xmm6
+	movdqu	48(%edx, %eax), %xmm7
+
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm1, 16(%edi)
+	movdqu	%xmm2, 32(%edi)
+	movdqu	%xmm3, 48(%edi)
+	movntdq	%xmm4, (%edx)
+	movntdq	%xmm5, 16(%edx)
+	movntdq	%xmm6, 32(%edx)
+	movntdq	%xmm7, 48(%edx)
+	addl	$64, %edx
+
+	leal	(%edi, %ecx), %ebx
+	andl	$-128, %ebx
+
+	cmp	%edx, %ebx
+	jbe	L(mm_copy_remaining_forward)
+
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%edx, %eax), %xmm0
+	movdqu	16(%edx, %eax), %xmm1
+	movdqu	32(%edx, %eax), %xmm2
+	movdqu	48(%edx, %eax), %xmm3
+	movdqu	64(%edx, %eax), %xmm4
+	movdqu	80(%edx, %eax), %xmm5
+	movdqu	96(%edx, %eax), %xmm6
+	movdqu	112(%edx, %eax), %xmm7
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 16(%edx)
+	movntdq	%xmm2, 32(%edx)
+	movntdq	%xmm3, 48(%edx)
+	movntdq	%xmm4, 64(%edx)
+	movntdq	%xmm5, 80(%edx)
+	movntdq	%xmm6, 96(%edx)
+	movntdq	%xmm7, 112(%edx)
+	leal	128(%edx), %edx
+	cmp	%edx, %ebx
+	ja	L(mm_large_page_loop_forward)
+	sfence
+
+	addl	%edi, %ecx
+	subl	%edx, %ecx
+/* Everything up to %edx in the destination has been copied.
+	%ecx now holds the number of bytes left to copy.
+	Advance %esi to match. */
+	leal	(%edx, %eax), %esi
+
+	cmp	$64, %ecx
+	jb	L(mm_remaining_0_64_bytes_forward)
+
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	32(%esi), %xmm2
+	movdqu	48(%esi), %xmm3
+	movdqu	-64(%esi, %ecx), %xmm4
+	movdqu	-48(%esi, %ecx), %xmm5
+	movdqu	-32(%esi, %ecx), %xmm6
+	movdqu	-16(%esi, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(mm_return_pop_all)
+
+
+/* Big length copy backward part.  */
+L(mm_large_page_backward):
+/* Aligning the address of the destination. We save the last 64 bytes
+	of the source first so the stores below cannot overwrite them.  */
+
+	PUSH (%esi)
+	PUSH (%edi)
+
+	movdqu	-16(%eax, %ecx), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+
+	leal	(%edx, %ecx), %edi
+	andl	$-64, %edi
+
+	movl	%eax, %esi
+	subl	%edx, %esi
+
+	movdqu	-16(%edi, %esi), %xmm4
+	movdqu	-32(%edi, %esi), %xmm5
+	movdqu	-48(%edi, %esi), %xmm6
+	movdqu	-64(%edi, %esi), %xmm7
+
+	movdqu	%xmm0, -16(%edx, %ecx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	movntdq	%xmm4, -16(%edi)
+	movntdq	%xmm5, -32(%edi)
+	movntdq	%xmm6, -48(%edi)
+	movntdq	%xmm7, -64(%edi)
+	leal	-64(%edi), %edi
+
+	leal	128(%edx), %ebx
+	andl	$-64, %ebx
+
+/* Compute in %ecx how many bytes are left to copy after
+	the main loop stops.  */
+	movl	%ebx, %ecx
+	subl	%edx, %ecx
+
+	cmp	%edi, %ebx
+	jae	L(mm_len_0_or_more_backward)
+
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%edi, %esi), %xmm0
+	movdqu	-48(%edi, %esi), %xmm1
+	movdqu	-32(%edi, %esi), %xmm2
+	movdqu	-16(%edi, %esi), %xmm3
+	movntdq	%xmm0, -64(%edi)
+	movntdq	%xmm1, -48(%edi)
+	movntdq	%xmm2, -32(%edi)
+	movntdq	%xmm3, -16(%edi)
+	leal	-64(%edi), %edi
+	cmp	%edi, %ebx
+	jb	L(mm_large_page_loop_backward)
+	POP (%edi)
+	POP (%esi)
+	jmp	L(mm_len_0_or_more_backward)
+
+END (MEMMOVE)
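The L(mm_large_page_forward) path above copies an unaligned 64-byte head, rounds the destination up to a 64-byte boundary, then streams aligned blocks with movntdq and issues sfence before the ordinary tail stores. A rough C sketch of that strategy (hypothetical helper, SSE2 intrinsics; assumes n >= 64 and a forward-safe copy):

#include <emmintrin.h>  /* SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence */
#include <stddef.h>
#include <stdint.h>

static void large_copy_forward(unsigned char *dst, const unsigned char *src, size_t n) {
    /* Unaligned 64-byte head, loaded before anything is stored. */
    for (int i = 0; i < 4; ++i) {
        __m128i v = _mm_loadu_si128((const __m128i *)(src + 16 * i));
        _mm_storeu_si128((__m128i *)(dst + 16 * i), v);
    }
    /* Round the destination up to the next 64-byte boundary. */
    unsigned char *d = (unsigned char *)(((uintptr_t)dst + 64) & ~(uintptr_t)63);
    const unsigned char *s = src + (d - dst);
    size_t left = n - (size_t)(d - dst);
    while (left >= 64) {  /* unaligned loads, aligned streaming stores */
        __m128i a = _mm_loadu_si128((const __m128i *)(s + 0));
        __m128i b = _mm_loadu_si128((const __m128i *)(s + 16));
        __m128i c = _mm_loadu_si128((const __m128i *)(s + 32));
        __m128i e = _mm_loadu_si128((const __m128i *)(s + 48));
        _mm_stream_si128((__m128i *)(d + 0), a);
        _mm_stream_si128((__m128i *)(d + 16), b);
        _mm_stream_si128((__m128i *)(d + 32), c);
        _mm_stream_si128((__m128i *)(d + 48), e);
        d += 64; s += 64; left -= 64;
    }
    _mm_sfence();  /* order streaming stores before the ordinary tail */
    while (left) { *d++ = *s++; --left; }
}

Non-temporal stores bypass the cache, which pays off once the copy exceeds the shared cache size; the fence keeps the streamed data ordered ahead of the tail writes.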
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/silvermont/string/sse2-memset-slm.S
similarity index 80%
copy from libc/arch-x86/string/sse2-memset-atom.S
copy to libc/arch-x86/silvermont/string/sse2-memset-slm.S
index a54bf51..c30bf74 100644
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ b/libc/arch-x86/silvermont/string/sse2-memset-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,10 @@
 */
 
 #include "cache.h"
-#undef __i686
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
 
 #ifndef L
 # define L(label)	.L##label
@@ -61,7 +64,7 @@
 
 #ifndef ENTRY
 # define ENTRY(name)			\
-	.type name,  @function; 	\
+	.type name,  @function;		\
 	.globl name;			\
 	.p2align 4;			\
 name:					\
@@ -107,7 +110,7 @@
    jump table with relative offsets.   */
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
     /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
+    call	__x86.get_pc_thunk.bx;				\
     /* Get the address of the jump table.  */			\
     add		$(TABLE - .), %ebx;				\
     /* Get the entry and convert the relative offset to the	\
@@ -117,12 +120,12 @@
     /* We loaded the jump table and adjusted EDX. Go.  */	\
     jmp		*%ebx
 
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
+	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
+	.globl	__x86.get_pc_thunk.bx
+	.hidden	__x86.get_pc_thunk.bx
 	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
+	.type	__x86.get_pc_thunk.bx,@function
+__x86.get_pc_thunk.bx:
 	movl	(%esp), %ebx
 	ret
 #else
@@ -139,16 +142,18 @@
     jmp		*TABLE(,%ecx,4)
 #endif
 
-#ifndef MEMSET
-# define MEMSET memset
-#endif
-
 	.section .text.sse2,"ax",@progbits
 	ALIGN (4)
 ENTRY (MEMSET)
 	ENTRANCE
 
 	movl	LEN(%esp), %ecx
+	cmp	$0, %ecx
+	ja	L(1byteormore)
+	SETRTNVAL
+	RETURN
+
+L(1byteormore):
 #ifdef USE_AS_BZERO
 	xor	%eax, %eax
 #else
@@ -156,147 +161,62 @@
 	movb	%al, %ah
 	/* Fill the whole EAX with pattern.  */
 	movl	%eax, %edx
-	shl	$16, %eax
+	shl	 $16, %eax
 	or	%edx, %eax
 #endif
 	movl	DEST(%esp), %edx
-	cmp	$32, %ecx
-	jae	L(32bytesormore)
+	cmp	$1, %ecx
+	je	L(1byte)
+	cmp	$16, %ecx
+	jae	L(16bytesormore)
 
-L(write_less32bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+	cmp	$4, %ecx
+	jb	L(4bytesless)
+	movl	%eax, (%edx)
+	movl	%eax, -4(%edx, %ecx)
+	cmp	$8, %ecx
+	jb	L(8bytesless)
+	movl	%eax, 4(%edx)
+	movl	%eax, -8(%edx, %ecx)
+L(8bytesless):
+	SETRTNVAL
+	RETURN
 
+L(4bytesless):
+	movw	%ax, (%edx)
+	movw	%ax, -2(%edx, %ecx)
+	SETRTNVAL
+	RETURN
 
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN (2)
-L(table_less_32bytes):
-	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
-	.popsection
-
-	ALIGN (4)
-L(write_28bytes):
-	movl	%eax, -28(%edx)
-L(write_24bytes):
-	movl	%eax, -24(%edx)
-L(write_20bytes):
-	movl	%eax, -20(%edx)
-L(write_16bytes):
-	movl	%eax, -16(%edx)
-L(write_12bytes):
-	movl	%eax, -12(%edx)
-L(write_8bytes):
-	movl	%eax, -8(%edx)
-L(write_4bytes):
-	movl	%eax, -4(%edx)
-L(write_0bytes):
+L(1byte):
+	movb	%al, (%edx)
 	SETRTNVAL
 	RETURN
 
 	ALIGN (4)
-L(write_29bytes):
-	movl	%eax, -29(%edx)
-L(write_25bytes):
-	movl	%eax, -25(%edx)
-L(write_21bytes):
-	movl	%eax, -21(%edx)
-L(write_17bytes):
-	movl	%eax, -17(%edx)
-L(write_13bytes):
-	movl	%eax, -13(%edx)
-L(write_9bytes):
-	movl	%eax, -9(%edx)
-L(write_5bytes):
-	movl	%eax, -5(%edx)
-L(write_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_30bytes):
-	movl	%eax, -30(%edx)
-L(write_26bytes):
-	movl	%eax, -26(%edx)
-L(write_22bytes):
-	movl	%eax, -22(%edx)
-L(write_18bytes):
-	movl	%eax, -18(%edx)
-L(write_14bytes):
-	movl	%eax, -14(%edx)
-L(write_10bytes):
-	movl	%eax, -10(%edx)
-L(write_6bytes):
-	movl	%eax, -6(%edx)
-L(write_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_31bytes):
-	movl	%eax, -31(%edx)
-L(write_27bytes):
-	movl	%eax, -27(%edx)
-L(write_23bytes):
-	movl	%eax, -23(%edx)
-L(write_19bytes):
-	movl	%eax, -19(%edx)
-L(write_15bytes):
-	movl	%eax, -15(%edx)
-L(write_11bytes):
-	movl	%eax, -11(%edx)
-L(write_7bytes):
-	movl	%eax, -7(%edx)
-L(write_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
-L(32bytesormore):
-	/* Fill xmm0 with the pattern.  */
+L(16bytesormore):
 #ifdef USE_AS_BZERO
 	pxor	%xmm0, %xmm0
 #else
 	movd	%eax, %xmm0
 	pshufd	$0, %xmm0, %xmm0
 #endif
+
+	cmp	$64, %ecx
+	ja	L(64bytesmore)
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm0, -16(%edx, %ecx)
+	cmp	$32, %ecx
+	jbe	L(32bytesless)
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm0, -32(%edx, %ecx)
+L(32bytesless):
+	SETRTNVAL
+	RETURN
+
+L(64bytesmore):
 	testl	$0xf, %edx
 	jz	L(aligned_16)
-/* ECX > 32 and EDX is not 16 byte aligned.  */
 L(not_aligned_16):
 	movdqu	%xmm0, (%edx)
 	movl	%edx, %eax
@@ -321,71 +241,73 @@
 	mov	$SHARED_CACHE_SIZE, %ebx
 #else
 # if (defined SHARED || defined __PIC__)
-	call	__i686.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
+	mov	$__x86_shared_cache_size@GOTOFF(%ebx), %ebx
 # else
 	PUSH (%ebx)
-	mov	__x86_shared_cache_size, %ebx
+	mov	$__x86_shared_cache_size, %ebx
 # endif
 #endif
 	cmp	%ebx, %ecx
 	jae	L(128bytesormore_nt_start)
 
+	POP (%ebx)
 
 #ifdef DATA_CACHE_SIZE
-	POP (%ebx)
-# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
-	cmp	$DATA_CACHE_SIZE, %ecx
+	PUSH (%ebx)
+	mov	$DATA_CACHE_SIZE, %ebx
 #else
 # if (defined SHARED || defined __PIC__)
-#  define RESTORE_EBX_STATE
-	call	__i686.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+	mov	$__x86_data_cache_size@GOTOFF(%ebx), %ebx
 # else
-	POP (%ebx)
-#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
-	cmp	__x86_data_cache_size, %ecx
+	PUSH (%ebx)
+	mov	$__x86_data_cache_size, %ebx
 # endif
 #endif
 
+	cmp	%ebx, %ecx
 	jae	L(128bytes_L2_normal)
 	subl	$128, %ecx
 L(128bytesormore_normal):
 	sub	$128, %ecx
 	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
 	jb	L(128bytesless_normal)
 
 
 	sub	$128, %ecx
 	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
 	jae	L(128bytesormore_normal)
 
 L(128bytesless_normal):
-	add	$128, %ecx
+	lea	128(%ecx), %ecx
+#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+	POP (%ebx)
+#endif
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
 	ALIGN (4)
 L(128bytes_L2_normal):
-	prefetcht0	0x380(%edx)
-	prefetcht0	0x3c0(%edx)
+	prefetchnta	0x380(%edx)
+	prefetchnta	0x3c0(%edx)
 	sub	$128, %ecx
 	movdqa	%xmm0, (%edx)
 	movaps	%xmm0, 0x10(%edx)
@@ -400,28 +322,26 @@
 	jae	L(128bytes_L2_normal)
 
 L(128bytesless_L2_normal):
+#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+	POP (%ebx)
+#endif
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
-	RESTORE_EBX_STATE
 L(128bytesormore_nt_start):
 	sub	%ebx, %ecx
-	mov	%ebx, %eax
-	and	$0x7f, %eax
-	add	%eax, %ecx
-	movd	%xmm0, %eax
 	ALIGN (4)
 L(128bytesormore_shared_cache_loop):
-	prefetcht0	0x3c0(%edx)
-	prefetcht0	0x380(%edx)
+	prefetchnta	0x3c0(%edx)
+	prefetchnta	0x380(%edx)
 	sub	$0x80, %ebx
 	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
 	add	$0x80, %edx
 	cmp	$0x80, %ebx
 	jae	L(128bytesormore_shared_cache_loop)
@@ -443,7 +363,7 @@
 	jae	L(128bytesormore_nt)
 	sfence
 L(shared_cache_loop_end):
-#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
 	POP (%ebx)
 #endif
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
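The rewritten small-size path above drops the 32-entry jump table in favor of a head store plus a mirrored store at dst + n - width, so a few branches with overlapping writes cover every length. A hedged C equivalent (hypothetical small_memset helper; sizes above 64 fall through to the aligned loops):

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *small_memset(void *dstv, int c, size_t n) {
    unsigned char *dst = (unsigned char *)dstv;
    uint32_t pat = (uint32_t)(unsigned char)c * 0x01010101u;  /* replicate byte */
    if (n == 0) return dstv;
    if (n == 1) { dst[0] = (unsigned char)c; return dstv; }
    if (n < 4) {                       /* 2..3: overlapping 2-byte stores */
        uint16_t p16 = (uint16_t)pat;
        memcpy(dst, &p16, 2);
        memcpy(dst + n - 2, &p16, 2);
        return dstv;
    }
    if (n < 16) {                      /* 4..15: overlapping 4-byte stores */
        memcpy(dst, &pat, 4);
        memcpy(dst + n - 4, &pat, 4);
        if (n >= 8) {
            memcpy(dst + 4, &pat, 4);
            memcpy(dst + n - 8, &pat, 4);
        }
        return dstv;
    }
    __m128i v = _mm_set1_epi8((char)c); /* 16..64: overlapping XMM stores */
    _mm_storeu_si128((__m128i *)dst, v);
    _mm_storeu_si128((__m128i *)(dst + n - 16), v);
    if (n > 32) {
        _mm_storeu_si128((__m128i *)(dst + 16), v);
        _mm_storeu_si128((__m128i *)(dst + n - 32), v);
    }
    return dstv;
}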
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
index 9d0a563..5c43fa5
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STPCPY
+#define STRCPY stpcpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
index 9d0a563..af5c0d3 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
@@ -1,42 +1,34 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define USE_AS_STPCPY
+#define STRCPY stpncpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S
new file mode 100755
index 0000000..b5d84b5
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S
@@ -0,0 +1,2157 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
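The copy loops below locate the terminating NUL sixteen bytes at a time with pcmpeqb/pmovmskb and bsf. A minimal intrinsics sketch of that detection step (hypothetical helper; __builtin_ctz stands in for bsf):

#include <emmintrin.h>

/* Returns the NUL offset within a 16-byte block, or -1 if none. */
static int find_nul_16(const char *p) {  /* p assumed 16-byte aligned */
    __m128i block = _mm_load_si128((const __m128i *)p);         /* movdqa  */
    __m128i eq = _mm_cmpeq_epi8(block, _mm_setzero_si128());    /* pcmpeqb */
    int mask = _mm_movemask_epi8(eq);                           /* pmovmskb */
    return mask ? __builtin_ctz(mask) : -1;                     /* bsf */
}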
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)             \
+	.type name, @function;   \
+	.globl name;             \
+	.p2align 4;              \
+name:                            \
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)               \
+	cfi_endproc;             \
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)                  \
+	cfi_adjust_cfa_offset (4);     \
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)                   \
+	cfi_adjust_cfa_offset (-4);    \
+	cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifndef STRCPY
+# define STRCPY  strcpy
+#endif
+
+#ifdef USE_AS_STPNCPY
+# define USE_AS_STRNCPY
+# define USE_AS_STPCPY
+#endif
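This template builds strcpy, stpcpy, strncpy, and stpncpy from one body via the defines above. For reference, the return-value semantics the USE_AS_STPCPY paths implement (plain C sketch, hypothetical name):

#include <string.h>

/* stpcpy copies like strcpy but returns a pointer to the copied NUL
   terminator instead of to dst. */
char *stpcpy_sketch(char *dst, const char *src) {
    size_t n = strlen(src);
    memcpy(dst, src, n + 1);  /* include the terminator */
    return dst + n;
}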
+
+#ifdef USE_AS_STRNCPY
+# define PARMS  16
+# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+# define RETURN  POP(%edi); POP(%esi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+#else
+# define PARMS  12
+# define ENTRANCE PUSH(%esi); PUSH(%edi)
+# define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+#endif
+
+#define STR1  PARMS
+#define STR2  STR1+4
+#define LEN  STR2+4
+
+
+#if (defined SHARED || defined __PIC__)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register that contains
+	the index into the jump table.  SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
+	/* We first load PC into ECX.  */                       \
+	call	__x86.get_pc_thunk.cx;                          \
+	/* Get the address of the jump table.  */               \
+	addl	$(TABLE - .), %ecx;                             \
+	/* Get the entry and convert the relative offset to the \
+	absolute address.  */                                   \
+	addl	(%ecx,INDEX,SCALE), %ecx;                       \
+	/* We loaded the jump table and adjusted ECX. Go.  */   \
+	jmp	*%ecx
+#else
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute offsets.  INDEX is a register that contains the index into
+	the jump table.  SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(,INDEX,SCALE)
+#endif
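In PIC builds the table entries are offsets relative to the table itself (JMPTBL(I, B) = I - B), resolved at run time against the thunk-derived PC, so the read-only table needs no load-time relocations. A GNU C illustration of the same idea using labels-as-values (hypothetical dispatch function):

#include <stdint.h>

/* 'base' plays the role of the PC recovered by __x86.get_pc_thunk.cx. */
int dispatch(int idx) {
    const intptr_t offsets[2] = {
        (intptr_t)((char *)&&case0 - (char *)&&base),  /* JMPTBL(case0, base) */
        (intptr_t)((char *)&&case1 - (char *)&&base),
    };
base:
    goto *(void *)((char *)&&base + offsets[idx]);  /* computed branch */
case0: return 0;
case1: return 1;
}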
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edi
+	mov	STR2(%esp), %esi
+#ifdef USE_AS_STRNCPY
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+#endif
+
+	mov	%esi, %ecx
+#ifndef USE_AS_STPCPY
+	mov	%edi, %eax      /* save result */
+#endif
+	and	$15, %ecx
+	jz	L(SourceStringAlignmentZero)
+
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%esi), %xmm1
+#ifdef USE_AS_STRNCPY
+	add	%ecx, %ebx
+#endif
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+#else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+#endif
+#endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+#else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+#endif
+#endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%edi)
+
+	sub	%ecx, %edi
+	mov	%edi, %edx
+	mov	$16, %ecx
+	and	$15, %edx
+	jz	L(Align16Both)
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(Unalign16Both):
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqu	%xmm3, (%edi, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %edi
+#ifdef USE_AS_STRNCPY
+	lea	64+64(%ebx, %edx), %ebx
+#endif
+L(Unaligned64Loop):
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+#endif
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+	add	$64, %edi
+	add	$64, %esi
+	movdqu	%xmm4, -64(%edi)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%edi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+#endif
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+	movdqu	%xmm6, 32(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+#endif
+	movdqu	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx
+	lea	49(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$48, %esi
+	add	$48, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+	pxor	%xmm0, %xmm0
+	movdqa	(%esi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+#else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+#endif
+#endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	16(%esi), %xmm0
+	movdqu	%xmm1, (%edi)
+	pmovmskb %xmm0, %edx
+
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+#else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+#endif
+#endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	mov	%edi, %edx
+	mov	$16, %ecx
+	and	$15, %edx
+	jnz	L(Unalign16Both)
+
+L(Align16Both):
+	movdqa	(%esi, %ecx), %xmm1
+	movdqa	16(%esi, %ecx), %xmm2
+	movdqa	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm2)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm3
+	movdqa	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm3)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm4
+	movdqa	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm4)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm1
+	movdqa	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm1)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm2
+	movdqa	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm2)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm3
+	movdqa	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm3)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	%xmm3, (%edi, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %edi
+#ifdef USE_AS_STRNCPY
+	lea	64+64(%ebx, %edx), %ebx
+#endif
+L(Aligned64Loop):
+	movdqa	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movdqa	32(%esi), %xmm3
+	movdqa	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(AlignedLeaveCase2OrCase3)
+#endif
+	test	%edx, %edx
+	jnz	L(Aligned64Leave)
+
+L(Aligned64Loop_start):
+	add	$64, %esi
+	add	$64, %edi
+	movaps	%xmm4, -64(%edi)
+	movdqa	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movaps	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movaps	%xmm6, -32(%edi)
+	movdqa	%xmm3, %xmm6
+	movaps	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(AlignedLeaveCase2OrCase3)
+#endif
+	test	%edx, %edx
+	jz	L(Aligned64Loop_start)
+
+L(Aligned64Leave):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16Bytes_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes_32)
+
+	bsf	%ecx, %edx
+	movdqa	%xmm4, (%edi)
+	movdqa	%xmm5, 16(%edi)
+	movdqa	%xmm6, 32(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+#endif
+	movdqa	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx
+	lea	49(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$48, %esi
+	add	$48, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
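The Aligned64Loop/Unaligned64Loop above scan 64 bytes per iteration by folding four blocks together with pminub: an unsigned byte-minimum is zero exactly when some block holds a zero byte in that lane. A hedged intrinsics sketch of that reduction (hypothetical helper):

#include <emmintrin.h>

/* True if any of the 64 bytes at p is zero. */
static int block64_has_nul(const char *p) {  /* p assumed 64-byte aligned */
    __m128i a = _mm_load_si128((const __m128i *)(p + 0));
    __m128i b = _mm_load_si128((const __m128i *)(p + 16));
    __m128i c = _mm_load_si128((const __m128i *)(p + 32));
    __m128i d = _mm_load_si128((const __m128i *)(p + 48));
    /* pminub reduction across the four blocks. */
    __m128i m = _mm_min_epu8(_mm_min_epu8(a, b), _mm_min_epu8(c, d));
    return _mm_movemask_epi8(_mm_cmpeq_epi8(m, _mm_setzero_si128())) != 0;
}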
+
+/*----------------------------------------------------*/
+
+/* Case1 */
+#ifndef USE_AS_STRNCPY
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ecx, %edi
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+#ifdef USE_AS_STRNCPY
+	sub	%ecx, %ebx
+#endif
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %edi
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+#endif
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+#ifdef USE_AS_STRNCPY
+	sub	%ecx, %ebx
+#endif
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16Bytes_0):
+	bsf	%edx, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+#endif
+	movdqa	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16Bytes_16):
+	bsf	%ecx, %edx
+	movdqa	%xmm4, (%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+#endif
+	movdqa	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$16, %esi
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16Bytes_32):
+	bsf	%edx, %edx
+	movdqa	%xmm4, (%edi)
+	movdqa	%xmm5, 16(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+#endif
+	movdqa	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$32, %esi
+	add	$32, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+#endif
+	movdqu	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+#endif
+	movdqu	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$16, %esi
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+#endif
+	movdqu	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$32, %esi
+	add	$32, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+#ifdef USE_AS_STRNCPY
+	.p2align 4
+L(CopyFrom1To16BytesXmm6):
+	movdqa	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm5):
+	movdqa	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm4):
+	movdqa	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm3):
+	movdqa	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm2):
+	movdqa	%xmm2, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm1):
+	movdqa	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %edi
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+#endif
+
+/*-----------------------------------------------------------------*/
+	.p2align 4
+L(Exit0):
+#ifdef USE_AS_STPCPY
+	mov	%edi, %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	%dh, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%edi)
+	movb	%dh, 2(%edi)
+#ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%esi), %ecx
+	movb	%dh, 4(%edi)
+	movl	%ecx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+#ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+#ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%esi), %xmm0
+	movb	%dh, 8(%edi)
+	movlpd	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+#ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+#ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+#ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+#ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%esi), %xmm0
+	xor	%cl, %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$17, %ebx
+	lea	17(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$18, %ebx
+	lea	18(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$19, %ebx
+	lea	19(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$20, %ebx
+	lea	20(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	xor	%dl, %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+#ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$21, %ebx
+	lea	21(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+#ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$22, %ebx
+	lea	22(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$23, %ebx
+	lea	23(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$24, %ebx
+	lea	24(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	xor	%cl, %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$25, %ebx
+	lea	25(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$26, %ebx
+	lea	26(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+#ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$27, %ebx
+	lea	27(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$28, %ebx
+	lea	28(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+#ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$29, %ebx
+	lea	29(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+#ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$30, %ebx
+	lea	30(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$31, %ebx
+	lea	31(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$32, %ebx
+	lea	32(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+#ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(StrncpyExit1):
+	movb	(%esi), %dl
+	movb	%dl, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+#endif
+	RETURN
+	.p2align 4
+L(StrncpyExit3):
+	movw	(%esi), %cx
+	movb	2(%esi), %dl
+	movw	%cx, (%edi)
+	movb	%dl, 2(%edi)
+#ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit5):
+	movl	(%esi), %ecx
+	movb	4(%esi), %dl
+	movl	%ecx, (%edi)
+	movb	%dl, 4(%edi)
+#ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+#ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+#ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit9):
+	movlpd	(%esi), %xmm0
+	movb	8(%esi), %dl
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+#ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+#ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+#ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+#ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%esi), %xmm0
+	movb	16(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movb	20(%esi), %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+#ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+#ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movb	24(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+#ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+#ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+#ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	32(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movb	32(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+	movb	%cl, 32(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movl	%edx, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%edi)
+	movb	%dl, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%edi)
+	movw	%dx, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movlpd	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 5(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 6(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+	movdqu	%xmm2, (%edi, %ecx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):
+	bsf	%edx, %edx
+	add	$15, %ebx
+	add	%ecx, %edi
+#ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+#endif
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+
+	.p2align 4
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit)
+
+	movdqu	%xmm0, (%edi)
+	add	$16, %edi
+
+	mov	%edi, %esi
+	and	$0xf, %esi
+	sub	%esi, %edi
+	add	%esi, %ebx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	movdqa	%xmm0, 32(%edi)
+	movdqa	%xmm0, 48(%edi)
+	add	$64, %edi
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	add	$32, %edi
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+	add	$16, %ebx
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+	.p2align 4
+L(AlignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Aligned64LeaveCase2)
+L(Aligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqa	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqa	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqa	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqa	%xmm7, 48(%edi)
+#ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(Aligned64LeaveCase2):
+	pxor	%xmm0, %xmm0
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm4)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqa	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqa	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqa	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%edi)
+#ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	pxor	%xmm0, %xmm0
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(ExitZero):
+	movl	%edi, %eax
+	RETURN
+#endif
+
+END (STRCPY)
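+
+/* Three jump tables follow, holding entry offsets relative to the
+   table base under PIC (plain addresses otherwise): L(ExitTable)
+   selects a copy tail by remaining length, L(ExitStrncpyTable) the
+   strncpy tails that honor the length limit, and L(FillTable) the
+   zero-padding stubs.  */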
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+#ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+	.int	JMPTBL(L(Exit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+	.p2align 4
+L(FillTable):
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+#endif
diff --git a/libc/arch-x86/silvermont/string/sse2-strlen-slm.S b/libc/arch-x86/silvermont/string/sse2-strlen-slm.S
new file mode 100755
index 0000000..27cc025
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-strlen-slm.S
@@ -0,0 +1,328 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)             \
+	.type name,  @function;  \
+	.globl name;             \
+	.p2align 4;              \
+name:                            \
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)               \
+	cfi_endproc;             \
+	.size name,	.-name
+#endif
+
+#define CFI_PUSH(REG)                   \
+	cfi_adjust_cfa_offset (4);      \
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)                    \
+	cfi_adjust_cfa_offset (-4);     \
+	cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (STRLEN)
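+/* Rough strategy: when the start address is not in the last 15 bytes
+   of a 64-byte block (the 0x30 check), a single unaligned 16-byte
+   load cannot cross into the next page; otherwise an aligned load
+   plus a shifted mask hides the bytes before the string.  The scan
+   then advances 16 bytes at a time until 64-byte alignment, and
+   finally 64 bytes per iteration using pminub.  */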
+	mov	4(%esp), %edx
+	mov	%edx, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%edx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit_less16)
+	mov	%edx, %eax
+	and	$-16, %eax
+	jmp	L(align16_start)
+L(next):
+	mov	%edx, %eax
+	and	$-16, %eax
+	PUSH	(%edi)
+	pcmpeqb	(%eax), %xmm0
+	mov	$-1, %edi
+	sub	%eax, %ecx
+	shl	%cl, %edi
+	pmovmskb %xmm0, %ecx
+	and	%edi, %ecx
+	POP	(%edi)
+	jnz	L(exit_unaligned)
+	pxor	%xmm0, %xmm0
+L(align16_start):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%eax), %xmm0
+	add	$64, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%eax), %xmm0
+	add	$64, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%eax), %xmm0
+	add	$64, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit64)
+
+
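+/* %eax is 16-byte aligned here; keep stepping 16 bytes at a time
+   until it is also 64-byte aligned, so the main loop below can use
+   aligned 64-byte reads.  */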
+	test	$0x3f, %eax
+	jz	L(align64_loop)
+
+	pcmpeqb	80(%eax), %xmm0
+	add	$80, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit)
+
+	test	$0x3f, %eax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%eax), %xmm1
+	add	$16, %eax
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit)
+
+	test	$0x3f, %eax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%eax), %xmm2
+	add	$16, %eax
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit)
+
+	test	$0x3f, %eax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%eax), %xmm3
+	add	$16, %eax
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit)
+
+	add	$16, %eax
+	.p2align 4
+L(align64_loop):
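+	/* %xmm0 is all-zero here (the last pcmpeqb found no NUL).  The
+	   byte-wise minimum of the four 16-byte chunks is zero iff any
+	   byte of the 64-byte block is zero, so a single compare against
+	   %xmm0 tests the whole block.  */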
+	movaps	(%eax), %xmm4
+	pminub	16(%eax), %xmm4
+	movaps	32(%eax), %xmm5
+	pminub	48(%eax), %xmm5
+	add	$64, %eax
+	pminub	%xmm4, %xmm5
+	pcmpeqb	%xmm0, %xmm5
+	pmovmskb %xmm5, %ecx
+	test	%ecx, %ecx
+	jz	L(align64_loop)
+
+
+	pcmpeqb	-64(%eax), %xmm0
+	sub	$80, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$64, %eax
+	ret
+
+	.p2align 4
+L(exit):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	ret
+
+L(exit_less16):
+	bsf	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_unaligned):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit16):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$16, %eax
+	ret
+
+	.p2align 4
+L(exit32):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$32, %eax
+	ret
+
+	.p2align 4
+L(exit48):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$48, %eax
+	ret
+
+	.p2align 4
+L(exit64):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$64, %eax
+	ret
+
+END (STRLEN)
+
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
index 9d0a563..591419f
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
new file mode 100755
index 0000000..b302883
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
@@ -0,0 +1,1277 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state	.cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state	.cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)             \
+	.type name, @function;   \
+	.globl name;             \
+	.p2align 4;              \
+name:                            \
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)               \
+	cfi_endproc;             \
+	.size name, .-name
+#endif
+
+#ifndef MEMCMP
+# define MEMCMP	memcmp
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS	4
+#define BLK1	PARMS
+#define BLK2	BLK1 + 4
+#define LEN	BLK2 + 4
+#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
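+/* RETURN pops %ebx for the taken exit path and then re-records the
+   push in the CFI state, so unwind info stays correct for the code
+   that still runs with %ebx pushed after this point.  */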
+
+
+#if (defined SHARED || defined __PIC__)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register containing the
+	index into the jump table.  SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+/* We first load PC into EBX.  */	\
+	call	__x86.get_pc_thunk.bx;	\
+/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+/* Get the entry and convert the relative offset to the	\
+	absolute address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+/* EBX now holds the absolute entry address. Go.  */	\
+	jmp	*%ebx
+#else
+# define JMPTBL(I, B)	I
+
+/* Branch to a jump-table entry directly.  TABLE is a jump table with
+	absolute addresses.  INDEX is a register containing the index into
+	the jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+#endif
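+
+/* In rough C terms, the PIC variant computes:
+
+	int32_t *t = (int32_t *)TABLE;   // entries hold (label - TABLE)
+	goto *((char *)t + t[INDEX]);    // base + offset = label address
+
+   so the table needs no load-time relocations.  */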
+
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
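+
+/* Example of why this matters: for byte values 0x01 and 0xFF, the
+   unsigned ordering says 0x01 is smaller, while the signed ordering
+   treats 0xFF as -1 and says 0x01 is larger.  The two entry points
+   therefore compute the final sign differently (ja vs. jg below).  */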
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+
+#ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(return0)
+#else
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+#endif
+
+	pxor	%xmm0, %xmm0
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+
+#ifndef USE_AS_WMEMCMP
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+#else
+	jb	L(less8bytes)
+	PUSH	(%ebx)
+#endif
+
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+
+L(nonzero):
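+	/* The flags from the differing cmpb above are still live (pop
+	   and mov do not modify flags), so ja yields +1 when the %eax
+	   byte is above, and -1 otherwise.  */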
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+#endif
+
+	.p2align 4
+L(0bytes):
+	POP	(%ebx)
+	xor	%eax, %eax
+	ret
+
+#ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	je	L(return0)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+	.p2align 4
+L(return0):
+	xor	%eax, %eax
+	ret
+#endif
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+#endif
+	.p2align 4
+L(64bytesormore):
+	PUSH	(%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
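+	/* %xmm0 is zero, so after pxor of the two 16-byte blocks into
+	   %xmm2, ptest sets CF iff %xmm2 is all-zero; jnc therefore
+	   branches exactly when a block differs.  */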
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+#ifdef USE_AS_WMEMCMP
+
+/* Label needed only for filling table_64bytes */
+L(unreal_case):
+/* no code here */
+
+#endif
+	.p2align 4
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+#else
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	cmp	-4(%edx), %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+#endif
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(50bytes):
+	mov	$-50, %ebx
+	movdqu	-50(%eax), %xmm1
+	movdqu	-50(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	mov	$-34, %ebx
+	movdqu	-34(%eax), %xmm1
+	movdqu	-34(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(51bytes):
+	mov	$-51, %ebx
+	movdqu	-51(%eax), %xmm1
+	movdqu	-51(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	mov	$-35, %ebx
+	movdqu	-35(%eax), %xmm1
+	movdqu	-35(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+L(1bytes):
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+#endif
+	.p2align 4
+L(52bytes):
+	movdqu	-52(%eax), %xmm1
+	movdqu	-52(%edx), %xmm2
+	mov	$-52, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%eax), %xmm1
+	movdqu	-36(%edx), %xmm2
+	mov	$-36, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%eax), %xmm1
+	movdqu	-20(%edx), %xmm2
+	mov	$-20, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-4(%edx), %ecx
+#endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(53bytes):
+	movdqu	-53(%eax), %xmm1
+	movdqu	-53(%edx), %xmm2
+	mov	$-53, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	mov	$-37, %ebx
+	movdqu	-37(%eax), %xmm1
+	movdqu	-37(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	mov	$-21, %ebx
+	movdqu	-21(%eax), %xmm1
+	movdqu	-21(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(54bytes):
+	movdqu	-54(%eax), %xmm1
+	movdqu	-54(%edx), %xmm2
+	mov	$-54, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	mov	$-38, %ebx
+	movdqu	-38(%eax), %xmm1
+	movdqu	-38(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	mov	$-22, %ebx
+	movdqu	-22(%eax), %xmm1
+	movdqu	-22(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(55bytes):
+	movdqu	-55(%eax), %xmm1
+	movdqu	-55(%edx), %xmm2
+	mov	$-55, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	mov	$-39, %ebx
+	movdqu	-39(%eax), %xmm1
+	movdqu	-39(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	mov	$-23, %ebx
+	movdqu	-23(%eax), %xmm1
+	movdqu	-23(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+#endif
+	.p2align 4
+L(56bytes):
+	movdqu	-56(%eax), %xmm1
+	movdqu	-56(%edx), %xmm2
+	mov	$-56, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	mov	$-40, %ebx
+	movdqu	-40(%eax), %xmm1
+	movdqu	-40(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	mov	$-24, %ebx
+	movdqu	-24(%eax), %xmm1
+	movdqu	-24(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-8(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-4(%edx), %ecx
+#endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(57bytes):
+	movdqu	-57(%eax), %xmm1
+	movdqu	-57(%edx), %xmm2
+	mov	$-57, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	mov	$-41, %ebx
+	movdqu	-41(%eax), %xmm1
+	movdqu	-41(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	mov	$-25, %ebx
+	movdqu	-25(%eax), %xmm1
+	movdqu	-25(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(58bytes):
+	movdqu	-58(%eax), %xmm1
+	movdqu	-58(%edx), %xmm2
+	mov	$-58, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	mov	$-42, %ebx
+	movdqu	-42(%eax), %xmm1
+	movdqu	-42(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	mov	$-26, %ebx
+	movdqu	-26(%eax), %xmm1
+	movdqu	-26(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(59bytes):
+	movdqu	-59(%eax), %xmm1
+	movdqu	-59(%edx), %xmm2
+	mov	$-59, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	mov	$-43, %ebx
+	movdqu	-43(%eax), %xmm1
+	movdqu	-43(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	mov	$-27, %ebx
+	movdqu	-27(%eax), %xmm1
+	movdqu	-27(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+#endif
+	.p2align 4
+L(60bytes):
+	movdqu	-60(%eax), %xmm1
+	movdqu	-60(%edx), %xmm2
+	mov	$-60, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	mov	$-44, %ebx
+	movdqu	-44(%eax), %xmm1
+	movdqu	-44(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	mov	$-28, %ebx
+	movdqu	-28(%eax), %xmm1
+	movdqu	-28(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-12(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-12(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-8(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-4(%edx), %ecx
+#endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(61bytes):
+	movdqu	-61(%eax), %xmm1
+	movdqu	-61(%edx), %xmm2
+	mov	$-61, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	mov	$-45, %ebx
+	movdqu	-45(%eax), %xmm1
+	movdqu	-45(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	mov	$-29, %ebx
+	movdqu	-29(%eax), %xmm1
+	movdqu	-29(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(62bytes):
+	movdqu	-62(%eax), %xmm1
+	movdqu	-62(%edx), %xmm2
+	mov	$-62, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	mov	$-46, %ebx
+	movdqu	-46(%eax), %xmm1
+	movdqu	-46(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	mov	$-30, %ebx
+	movdqu	-30(%eax), %xmm1
+	movdqu	-30(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(63bytes):
+	movdqu	-63(%eax), %xmm1
+	movdqu	-63(%edx), %xmm2
+	mov	$-63, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	mov	$-47, %ebx
+	movdqu	-47(%eax), %xmm1
+	movdqu	-47(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	mov	$-31, %ebx
+	movdqu	-31(%eax), %xmm1
+	movdqu	-31(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+#endif
+
+	.p2align 4
+L(64bytes):
+	movdqu	-64(%eax), %xmm1
+	movdqu	-64(%edx), %xmm2
+	mov	$-64, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%eax), %xmm1
+	movdqu	-48(%edx), %xmm2
+	mov	$-48, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%eax), %xmm1
+	movdqu	-32(%edx), %xmm2
+	mov	$-32, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-16(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-12(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-12(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-8(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-4(%edx), %ecx
+#endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	mov	(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	mov	4(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	mov	8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	mov	12(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+#else
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	cmp	4(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	cmp	8(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	cmp	12(%edx), %ecx
+
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+#endif
+
+	.p2align 4
+L(find_diff):
+#ifndef USE_AS_WMEMCMP
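+	/* %ecx/%ebx hold the differing 32-bit words from the two inputs
+	   (loaded little-endian).  Comparing progressively wider pieces
+	   leaves flags at L(end) that reflect the first differing byte.  */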
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16, %ecx
+	shr	$16, %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+#else
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(bigger):
+	ret
+#endif
+END (MEMCMP)
+
+	.section .rodata.sse4.2,"a",@progbits
+	.p2align 2
+	.type	L(table_64bytes), @object
+#ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+#else
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+#endif
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
index 9d0a563..2c350bb
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define USE_AS_WMEMCMP
+#define MEMCMP wmemcmp
+#include "sse4-memcmp-slm.S"
diff --git a/libc/arch-x86/syscalls/connect.S b/libc/arch-x86/syscalls/__connect.S
similarity index 94%
rename from libc/arch-x86/syscalls/connect.S
rename to libc/arch-x86/syscalls/__connect.S
index c0d73ca..2f53b33 100644
--- a/libc/arch-x86/syscalls/connect.S
+++ b/libc/arch-x86/syscalls/__connect.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(connect)
+ENTRY(__connect)
     pushl   %ebx
     pushl   %ecx
     .cfi_def_cfa_offset 8
@@ -24,4 +24,4 @@
     popl    %ecx
     popl    %ebx
     ret
-END(connect)
+END(__connect)
diff --git a/libc/arch-x86/x86.mk b/libc/arch-x86/x86.mk
index a1d55f0..34da0ce 100644
--- a/libc/arch-x86/x86.mk
+++ b/libc/arch-x86/x86.mk
@@ -32,60 +32,15 @@
     arch-x86/bionic/syscall.S \
     arch-x86/bionic/vfork.S \
 
-ifeq ($(ARCH_X86_HAVE_SSSE3),true)
-libc_bionic_src_files_x86 += \
-    arch-x86/string/ssse3-memcpy-atom.S \
-    arch-x86/string/ssse3-memmove-atom.S \
-    arch-x86/string/ssse3-bcopy-atom.S \
-    arch-x86/string/ssse3-strncat-atom.S \
-    arch-x86/string/ssse3-strncpy-atom.S \
-    arch-x86/string/ssse3-strlcat-atom.S \
-    arch-x86/string/ssse3-strlcpy-atom.S \
-    arch-x86/string/ssse3-strcmp-atom.S \
-    arch-x86/string/ssse3-strncmp-atom.S \
-    arch-x86/string/ssse3-strcat-atom.S \
-    arch-x86/string/ssse3-strcpy-atom.S \
-    arch-x86/string/ssse3-memcmp-atom.S \
-    arch-x86/string/ssse3-wmemcmp-atom.S \
-    arch-x86/string/ssse3-memcmp16-atom.S \
-    arch-x86/string/ssse3-wcscat-atom.S \
-    arch-x86/string/ssse3-wcscpy-atom.S
-else
-libc_bionic_src_files_x86 += \
-    arch-x86/string/memcpy.S \
-    arch-x86/string/memmove.S \
-    arch-x86/string/bcopy.S \
-    arch-x86/string/strcmp.S \
-    arch-x86/string/strncmp.S \
-    arch-x86/string/strcat.S \
-    arch-x86/string/memcmp.S \
-    bionic/__memcmp16.cpp \
-    upstream-freebsd/lib/libc/string/wcscpy.c \
-    upstream-freebsd/lib/libc/string/wcscat.c \
-    upstream-freebsd/lib/libc/string/wmemcmp.c \
-    upstream-openbsd/lib/libc/string/strcpy.c \
-    upstream-openbsd/lib/libc/string/strlcat.c \
-    upstream-openbsd/lib/libc/string/strlcpy.c \
-    upstream-openbsd/lib/libc/string/strncat.c \
-    upstream-openbsd/lib/libc/string/strncpy.c \
-
+## ARCH variant specific source files
+arch_variant_mk := $(LOCAL_PATH)/arch-x86/$(TARGET_ARCH_VARIANT)/$(TARGET_ARCH_VARIANT).mk
+ifeq ($(wildcard $(arch_variant_mk)),)
+    arch_variant_mk := $(LOCAL_PATH)/arch-x86/generic/generic.mk
 endif
+include $(arch_variant_mk)
+libc_common_additional_dependencies += $(arch_variant_mk)
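+# For example, TARGET_ARCH_VARIANT := silvermont selects the silvermont
+# makefile when it exists; any unknown variant falls back to
+# arch-x86/generic/generic.mk.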
 
-libc_bionic_src_files_x86 += \
-    arch-x86/string/sse2-memset-atom.S \
-    arch-x86/string/sse2-bzero-atom.S \
-    arch-x86/string/sse2-memchr-atom.S \
-    arch-x86/string/sse2-memrchr-atom.S \
-    arch-x86/string/sse2-strchr-atom.S \
-    arch-x86/string/sse2-strrchr-atom.S \
-    arch-x86/string/sse2-index-atom.S \
-    arch-x86/string/sse2-strlen-atom.S \
-    arch-x86/string/sse2-strnlen-atom.S \
-    arch-x86/string/sse2-wcschr-atom.S \
-    arch-x86/string/sse2-wcsrchr-atom.S \
-    arch-x86/string/sse2-wcslen-atom.S \
-    arch-x86/string/sse2-wcscmp-atom.S \
-
+arch_variant_mk :=
 
 libc_crt_target_cflags_x86 := \
     -m32 \
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/cache.h
similarity index 88%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/cache.h
index 9d0a563..38acc6e 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/cache.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,9 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
 /* Values are optimized for Silvermont */
 #define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
 #define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
 
 #define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
 #define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-bcopy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-bcopy-slm.S
index 9d0a563..effab0e 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-bcopy-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define USE_AS_BCOPY
+#define MEMMOVE		bcopy
+#include "sse2-memmove-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-bzero-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-bzero-slm.S
index 9d0a563..446ea5b 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-bzero-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define USE_AS_BZERO_P
+#define MEMSET		bzero
+#include "sse2-memset-slm.S"
diff --git a/libc/arch-x86_64/string/sse2-memcpy-slm.S b/libc/arch-x86_64/string/sse2-memcpy-slm.S
new file mode 100644
index 0000000..4c30fb6
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-memcpy-slm.S
@@ -0,0 +1,299 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCPY
+# define MEMCPY		memcpy
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)		\
+	cfi_adjust_cfa_offset (4);		\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+	cfi_adjust_cfa_offset (-4);		\
+	cfi_restore (REG)
+
+#define PUSH(REG)	push REG;
+#define POP(REG)	pop REG;
+
+#define ENTRANCE	PUSH (%rbx);
+#define RETURN_END	POP (%rbx); ret
+#define RETURN		RETURN_END;
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMCPY)
+	ENTRANCE
+	cmp	%rsi, %rdi
+	je	L(return)
+
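+/* Dispatch on length: at most 16 bytes takes the short path, at least
+	half the shared cache size takes the non-temporal path, and
+	everything in between is copied through the cache.  */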
+	cmp	$16, %rdx
+	jbe	L(len_0_16_bytes)
+
+	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
+	jae	L(large_page)
+
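+/* Load the head and the tail of the region up front and store both:
+	copies of up to 128 bytes finish right here, and for longer ones
+	these stores cover the unaligned edges around the aligned loop.  */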
+	movdqu	(%rsi), %xmm0
+	movdqu	-16(%rsi, %rdx), %xmm1
+	cmp	$32, %rdx
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jbe	L(return)
+
+	movdqu	16(%rsi), %xmm0
+	movdqu	-32(%rsi, %rdx), %xmm1
+	cmp	$64, %rdx
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm1, -32(%rdi, %rdx)
+	jbe	L(return)
+
+	movdqu	32(%rsi), %xmm0
+	movdqu	48(%rsi), %xmm1
+	movdqu	-48(%rsi, %rdx), %xmm2
+	movdqu	-64(%rsi, %rdx), %xmm3
+	cmp	$128, %rdx
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm1, 48(%rdi)
+	movdqu	%xmm2, -48(%rdi, %rdx)
+	movdqu	%xmm3, -64(%rdi, %rdx)
+	jbe	L(return)
+
+/* Now the main loop: we align the address of the destination.  */
+	lea	64(%rdi), %r8
+	and	$-64, %r8
+
+	add	%rdi, %rdx
+	and	$-64, %rdx
+
+	sub	%rdi, %rsi
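+/* %rsi now holds (src - dst), so (%r8, %rsi) addresses the source
+	byte corresponding to the destination pointer %r8.  */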
+
+/* Stop the main loop two iterations early so that its prefetch does
+	not read past the end of the source.  */
+	sub	$64, %rdx
+	cmp	%r8, %rdx
+	je	L(main_loop_just_one_iteration)
+
+	sub	$64, %rdx
+	cmp	%r8, %rdx
+	je	L(main_loop_last_two_iterations)
+
+
+	.p2align 4
+L(main_loop_cache):
+
+	prefetcht0 128(%r8, %rsi)
+
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	lea	64(%r8), %r8
+	cmp	%r8, %rdx
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqu	64(%r8, %rsi), %xmm4
+	movdqu	80(%r8, %rsi), %xmm5
+	movdqu	96(%r8, %rsi), %xmm6
+	movdqu	112(%r8, %rsi), %xmm7
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	movdqa	%xmm4, 64(%r8)
+	movdqa	%xmm5, 80(%r8)
+	movdqa	%xmm6, 96(%r8)
+	movdqa	%xmm7, 112(%r8)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	jmp	L(return)
+
+L(large_page):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+	movdqu	-64(%rsi, %rdx), %xmm4
+	movdqu	-48(%rsi, %rdx), %xmm5
+	movdqu	-32(%rsi, %rdx), %xmm6
+	movdqu	-16(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+
+	movdqu	64(%rsi), %xmm0
+	movdqu	80(%rsi), %xmm1
+	movdqu	96(%rsi), %xmm2
+	movdqu	112(%rsi), %xmm3
+	movdqu	-128(%rsi, %rdx), %xmm4
+	movdqu	-112(%rsi, %rdx), %xmm5
+	movdqu	-96(%rsi, %rdx), %xmm6
+	movdqu	-80(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, 64(%rdi)
+	movdqu	%xmm1, 80(%rdi)
+	movdqu	%xmm2, 96(%rdi)
+	movdqu	%xmm3, 112(%rdi)
+	movdqu	%xmm4, -128(%rdi, %rdx)
+	movdqu	%xmm5, -112(%rdi, %rdx)
+	movdqu	%xmm6, -96(%rdi, %rdx)
+	movdqu	%xmm7, -80(%rdi, %rdx)
+
+/* Now the main loop with non temporal stores. We align
+	the address of the destination.  */
+	lea	128(%rdi), %r8
+	and	$-128, %r8
+
+	add	%rdi, %rdx
+	and	$-128, %rdx
+
+	sub	%rdi, %rsi
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqu	64(%r8, %rsi), %xmm4
+	movdqu	80(%r8, %rsi), %xmm5
+	movdqu	96(%r8, %rsi), %xmm6
+	movdqu	112(%r8, %rsi), %xmm7
+	movntdq	%xmm0, (%r8)
+	movntdq	%xmm1, 16(%r8)
+	movntdq	%xmm2, 32(%r8)
+	movntdq	%xmm3, 48(%r8)
+	movntdq	%xmm4, 64(%r8)
+	movntdq	%xmm5, 80(%r8)
+	movntdq	%xmm6, 96(%r8)
+	movntdq	%xmm7, 112(%r8)
+	lea	128(%r8), %r8
+	cmp	%r8, %rdx
+	jne	L(main_loop_large_page)
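+/* Non-temporal stores are weakly ordered; sfence makes them globally
+	visible before we return.  */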
+	sfence
+	jmp	L(return)
+
+L(len_0_16_bytes):
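+/* The length is 0..16, so test its bits to pick a range: bit 3 or 4
+	set means 8..16 bytes, bit 2 means 4..7, bit 1 means 2..3.  */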
+	testb	$24, %dl
+	jne	L(len_9_16_bytes)
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%rsi), %ebx
+	testb	$2, %dl
+	movb	%bl, (%rdi)
+	je	L(return)
+	movzwl	-2(%rsi,%rdx), %ebx
+	movw	%bx, -2(%rdi,%rdx)
+	jmp	L(return)
+
+L(len_9_16_bytes):
+	movq	(%rsi), %xmm0
+	movq	-8(%rsi, %rdx), %xmm1
+	movq	%xmm0, (%rdi)
+	movq	%xmm1, -8(%rdi, %rdx)
+	jmp	L(return)
+
+L(len_5_8_bytes):
+	movl	(%rsi), %ebx
+	movl	%ebx, (%rdi)
+	movl	-4(%rsi,%rdx), %ebx
+	movl	%ebx, -4(%rdi,%rdx)
+	jmp	L(return)
+
+L(return):
+	mov 	%rdi, %rax
+	RETURN
+
+END (MEMCPY)
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
new file mode 100644
index 0000000..ee8440e
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -0,0 +1,635 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMMOVE
+# define MEMMOVE		memmove
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)		\
+	cfi_adjust_cfa_offset (4);		\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+	cfi_adjust_cfa_offset (-4);		\
+	cfi_restore (REG)
+
+#define PUSH(REG)	push REG;
+#define POP(REG)	pop REG;
+
+#define ENTRANCE	PUSH (%rbx);
+#define RETURN_END	POP (%rbx); ret
+#define RETURN		RETURN_END;
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMMOVE)
+	ENTRANCE
+#ifdef USE_AS_BCOPY
+	xchg	%rsi, %rdi
+#endif
+	mov	%rdi, %rax
+
+/* Check whether we should copy backward or forward.  */
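+/* If the destination starts above the source the regions may overlap
+	at the destination's start, so copy backward; otherwise copying
+	forward is safe.  */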
+	cmp	%rsi, %rdi
+	je	L(mm_return)
+	ja	L(mm_len_0_or_more_backward)
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+	separately.  */
+	cmp	$16, %rdx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmp	$32, %rdx
+	jg	L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	-16(%rsi, %rdx), %xmm1
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_32_or_more_forward):
+	cmp	$64, %rdx
+	jg	L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	-16(%rsi, %rdx), %xmm2
+	movdqu	-32(%rsi, %rdx), %xmm3
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, -16(%rdi, %rdx)
+	movdqu	%xmm3, -32(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_64_or_more_forward):
+	cmp	$128, %rdx
+	jg	L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+	movdqu	-64(%rsi, %rdx), %xmm4
+	movdqu	-48(%rsi, %rdx), %xmm5
+	movdqu	-32(%rsi, %rdx), %xmm6
+	movdqu	-16(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_128_or_more_forward):
+
+	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
+	jae	L(mm_large_page_forward)
+
+	mov	%rsi, %r8  // copy src to r8
+	mov	%rdi, %r9  // copy dst to r9
+
+/* Aligning the address of the destination.  */
+/* Save the first unaligned 64 bytes.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+
+	lea	64(%r9), %rdi
+	and	$-64, %rdi  /* rdi now aligned to next 64 byte boundary */
+
+	sub	%r9, %rsi /* rsi = src - dst = diff */
+
+	movdqu	(%rdi, %rsi), %xmm4
+	movdqu	16(%rdi, %rsi), %xmm5
+	movdqu	32(%rdi, %rsi), %xmm6
+	movdqu	48(%rdi, %rsi), %xmm7
+
+	movdqu	%xmm0, (%r9)
+	movdqu	%xmm1, 16(%r9)
+	movdqu	%xmm2, 32(%r9)
+	movdqu	%xmm3, 48(%r9)
+	movdqa	%xmm4, (%rdi)
+	movdqa	%xmm5, 16(%rdi)
+	movdqa	%xmm6, 32(%rdi)
+	movdqa	%xmm7, 48(%rdi)
+	add	$64, %rdi
+
+	lea	(%r9, %rdx), %rbx
+	and	$-64, %rbx
+
+	cmp	%rdi, %rbx
+	jbe	L(mm_copy_remaining_forward)
+
+	.p2align 4
+L(mm_main_loop_forward):
+
+	prefetcht0 128(%rdi, %rsi)
+
+	movdqu	(%rdi, %rsi), %xmm0
+	movdqu	16(%rdi, %rsi), %xmm1
+	movdqu	32(%rdi, %rsi), %xmm2
+	movdqu	48(%rdi, %rsi), %xmm3
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 16(%rdi)
+	movdqa	%xmm2, 32(%rdi)
+	movdqa	%xmm3, 48(%rdi)
+	lea	64(%rdi), %rdi
+	cmp	%rdi, %rbx
+	ja	L(mm_main_loop_forward)
+
+L(mm_copy_remaining_forward):
+	add	%r9, %rdx
+	sub	%rdi, %rdx
+/* Everything up to %rdi in the destination has been copied; %rdx now
+	holds the number of bytes left, so advance %r8 to the matching
+	source position. */
+	lea	(%rdi, %rsi), %r8
+
+L(mm_remaining_0_64_bytes_forward):
+	cmp	$32, %rdx
+	ja	L(mm_remaining_33_64_bytes_forward)
+	cmp	$16, %rdx
+	ja	L(mm_remaining_17_32_bytes_forward)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(mm_return)
+
+	cmpb	$8, %dl
+	ja	L(mm_remaining_9_16_bytes_forward)
+	cmpb	$4, %dl
+	.p2align 4,,5
+	ja	L(mm_remaining_5_8_bytes_forward)
+	cmpb	$2, %dl
+	.p2align 4,,1
+	ja	L(mm_remaining_3_4_bytes_forward)
+	movzbl	-1(%r8,%rdx), %esi
+	movzbl	(%r8), %ebx
+	movb	%sil, -1(%rdi,%rdx)
+	movb	%bl, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_remaining_33_64_bytes_forward):
+	movdqu	(%r8), %xmm0
+	movdqu	16(%r8), %xmm1
+	movdqu	-32(%r8, %rdx), %xmm2
+	movdqu	-16(%r8, %rdx), %xmm3
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, -32(%rdi, %rdx)
+	movdqu	%xmm3, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_remaining_17_32_bytes_forward):
+	movdqu	(%r8), %xmm0
+	movdqu	-16(%r8, %rdx), %xmm1
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_remaining_3_4_bytes_forward):
+	movzwl	-2(%r8,%rdx), %esi
+	movzwl	(%r8), %ebx
+	movw	%si, -2(%rdi,%rdx)
+	movw	%bx, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_remaining_5_8_bytes_forward):
+	movl	(%r8), %esi
+	movl	-4(%r8,%rdx), %ebx
+	movl	%esi, (%rdi)
+	movl	%ebx, -4(%rdi,%rdx)
+	jmp	L(mm_return)
+
+L(mm_remaining_9_16_bytes_forward):
+	mov	(%r8), %rsi
+	mov	-8(%r8, %rdx), %rbx
+	mov	%rsi, (%rdi)
+	mov	%rbx, -8(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_0_16_bytes_forward):
+	testb	$24, %dl
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(mm_return)
+	testb	$2, %dl
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%rsi,%rdx), %ebx
+	movzbl	(%rsi), %esi
+	movb	%bl, -1(%rdi,%rdx)
+	movb	%sil, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_len_2_4_bytes_forward):
+	movzwl	-2(%rsi,%rdx), %ebx
+	movzwl	(%rsi), %esi
+	movw	%bx, -2(%rdi,%rdx)
+	movw	%si, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_len_5_8_bytes_forward):
+	movl	(%rsi), %ebx
+	movl	-4(%rsi,%rdx), %esi
+	movl	%ebx, (%rdi)
+	movl	%esi, -4(%rdi,%rdx)
+	jmp	L(mm_return)
+
+L(mm_len_9_16_bytes_forward):
+	mov	(%rsi), %rbx
+	mov	-8(%rsi, %rdx), %rsi
+	mov	%rbx, (%rdi)
+	mov	%rsi, -8(%rdi, %rdx)
+	jmp	L(mm_return)
+
+/* The code for copying backwards.  */
+L(mm_len_0_or_more_backward):
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+	separately.  */
+	cmp	$16, %rdx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmp	$32, %rdx
+	jg	L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	-16(%rsi, %rdx), %xmm1
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_32_or_more_backward):
+	cmp	$64, %rdx
+	jg	L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	-16(%rsi, %rdx), %xmm2
+	movdqu	-32(%rsi, %rdx), %xmm3
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, -16(%rdi, %rdx)
+	movdqu	%xmm3, -32(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_64_or_more_backward):
+	cmp	$128, %rdx
+	jg	L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+	movdqu	-64(%rsi, %rdx), %xmm4
+	movdqu	-48(%rsi, %rdx), %xmm5
+	movdqu	-32(%rsi, %rdx), %xmm6
+	movdqu	-16(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+L(mm_len_128_or_more_backward):
+
+	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
+	jae	L(mm_large_page_backward)
+
+/* Aligning the address of the destination. We need to save the last
+	64 bytes of the source before they can be overwritten.  */
+	movdqu	-16(%rsi, %rdx), %xmm0
+	movdqu	-32(%rsi, %rdx), %xmm1
+	movdqu	-48(%rsi, %rdx), %xmm2
+	movdqu	-64(%rsi, %rdx), %xmm3
+
+	lea	(%rdi, %rdx), %r9
+	and	$-64, %r9 /* r9 = aligned dst */
+
+	mov	%rsi, %r8
+	sub	%rdi, %r8 /* r8 = src - dst, diff */
+
+	movdqu	-16(%r9, %r8), %xmm4
+	movdqu	-32(%r9, %r8), %xmm5
+	movdqu	-48(%r9, %r8), %xmm6
+	movdqu	-64(%r9, %r8), %xmm7
+
+	movdqu	%xmm0, -16(%rdi, %rdx)
+	movdqu	%xmm1, -32(%rdi, %rdx)
+	movdqu	%xmm2, -48(%rdi, %rdx)
+	movdqu	%xmm3, -64(%rdi, %rdx)
+	movdqa	%xmm4, -16(%r9)
+	movdqa	%xmm5, -32(%r9)
+	movdqa	%xmm6, -48(%r9)
+	movdqa	%xmm7, -64(%r9)
+	lea	-64(%r9), %r9
+
+	lea	64(%rdi), %rbx
+	and	$-64, %rbx
+
+/* Compute in %rdx how many bytes are left to copy after
+	the main loop stops.  */
+	mov 	%rbx, %rdx
+	sub 	%rdi, %rdx
+
+	cmp	%r9, %rbx
+	jb	L(mm_main_loop_backward)
+	jmp	L(mm_len_0_or_more_backward)
+
+	.p2align 4
+L(mm_main_loop_backward):
+
+	prefetcht0 -128(%r9, %r8)
+
+	movdqu	-64(%r9, %r8), %xmm0
+	movdqu	-48(%r9, %r8), %xmm1
+	movdqu	-32(%r9, %r8), %xmm2
+	movdqu	-16(%r9, %r8), %xmm3
+	movdqa	%xmm0, -64(%r9)
+	movdqa	%xmm1, -48(%r9)
+	movdqa	%xmm2, -32(%r9)
+	movdqa	%xmm3, -16(%r9)
+	lea	-64(%r9), %r9
+	cmp	%r9, %rbx
+	jb	L(mm_main_loop_backward)
+	jmp	L(mm_len_0_or_more_backward)
+
+/* Copy [0..16] and return.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %dl
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %dl
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(mm_return)
+	testb	$2, %dl
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%rsi,%rdx), %ebx
+	movzbl	(%rsi), %ecx
+	movb	%bl, -1(%rdi,%rdx)
+	movb	%cl, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_len_3_4_bytes_backward):
+	movzwl	-2(%rsi,%rdx), %ebx
+	movzwl	(%rsi), %ecx
+	movw	%bx, -2(%rdi,%rdx)
+	movw	%cx, (%rdi)
+	jmp	L(mm_return)
+
+L(mm_len_9_16_bytes_backward):
+	movl	-4(%rsi,%rdx), %ebx
+	movl	-8(%rsi,%rdx), %ecx
+	movl	%ebx, -4(%rdi,%rdx)
+	movl	%ecx, -8(%rdi,%rdx)
+	sub	$8, %rdx
+	jmp	L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+	movl	(%rsi), %ebx
+	movl	-4(%rsi,%rdx), %ecx
+	movl	%ebx, (%rdi)
+	movl	%ecx, -4(%rdi,%rdx)
+
+L(mm_return):
+	RETURN
+
+/* Big length copy forward part.  */
+
+L(mm_large_page_forward):
+/* Aligning the address of the destination. We need to save the first
+	64 bytes of the source before they can be overwritten.  */
+
+	mov	%rsi, %r8
+	mov	%rdi, %r9
+
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+
+	lea	64(%r9), %rdi
+	and	$-64, %rdi      /* rdi = aligned dst */
+
+	sub	%r9, %rsi        /* rsi = diff */
+
+	movdqu	(%rdi, %rsi), %xmm4
+	movdqu	16(%rdi, %rsi), %xmm5
+	movdqu	32(%rdi, %rsi), %xmm6
+	movdqu	48(%rdi, %rsi), %xmm7
+
+	movdqu	%xmm0, (%r9)
+	movdqu	%xmm1, 16(%r9)
+	movdqu	%xmm2, 32(%r9)
+	movdqu	%xmm3, 48(%r9)
+	movntdq	%xmm4, (%rdi)
+	movntdq	%xmm5, 16(%rdi)
+	movntdq	%xmm6, 32(%rdi)
+	movntdq	%xmm7, 48(%rdi)
+	add	$64, %rdi
+
+	lea	(%r9, %rdx), %rbx
+	and	$-128, %rbx
+
+	cmp	%rdi, %rbx
+	jbe	L(mm_copy_remaining_forward)
+
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%rdi, %rsi), %xmm0
+	movdqu	16(%rdi, %rsi), %xmm1
+	movdqu	32(%rdi, %rsi), %xmm2
+	movdqu	48(%rdi, %rsi), %xmm3
+	movdqu	64(%rdi, %rsi), %xmm4
+	movdqu	80(%rdi, %rsi), %xmm5
+	movdqu	96(%rdi, %rsi), %xmm6
+	movdqu	112(%rdi, %rsi), %xmm7
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 16(%rdi)
+	movntdq	%xmm2, 32(%rdi)
+	movntdq	%xmm3, 48(%rdi)
+	movntdq	%xmm4, 64(%rdi)
+	movntdq	%xmm5, 80(%rdi)
+	movntdq	%xmm6, 96(%rdi)
+	movntdq	%xmm7, 112(%rdi)
+	lea 	128(%rdi), %rdi
+	cmp	%rdi, %rbx
+	ja	L(mm_large_page_loop_forward)
+	sfence
+
+	add 	%r9, %rdx
+	sub 	%rdi, %rdx
+/* Everything up to %rdi in the destination has been copied; %rdx now
+	holds the number of bytes left, so advance %r8 to the matching
+	source position. */
+	lea 	(%rdi, %rsi), %r8
+
+	cmp	$64, %rdx
+	jb	L(mm_remaining_0_64_bytes_forward)
+
+	movdqu	(%r8), %xmm0
+	movdqu	16(%r8), %xmm1
+	movdqu	32(%r8), %xmm2
+	movdqu	48(%r8), %xmm3
+	movdqu	-64(%r8, %rdx), %xmm4
+	movdqu	-48(%r8, %rdx), %xmm5
+	movdqu	-32(%r8, %rdx), %xmm6
+	movdqu	-16(%r8, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+	jmp	L(mm_return)
+
+
+/* Big length copy backward part.  */
+L(mm_large_page_backward):
+/* Aligning the address of the destination. We need to save the last
+	64 bytes of the source before they can be overwritten.  */
+
+	movdqu	-16(%rsi, %rdx), %xmm0
+	movdqu	-32(%rsi, %rdx), %xmm1
+	movdqu	-48(%rsi, %rdx), %xmm2
+	movdqu	-64(%rsi, %rdx), %xmm3
+
+	lea	(%rdi, %rdx), %r9
+	and	$-64, %r9
+
+	mov 	%rsi, %r8
+	sub 	%rdi, %r8
+
+	movdqu	-16(%r9, %r8), %xmm4
+	movdqu	-32(%r9, %r8), %xmm5
+	movdqu	-48(%r9, %r8), %xmm6
+	movdqu	-64(%r9, %r8), %xmm7
+
+	movdqu	%xmm0, -16(%rdi, %rdx)
+	movdqu	%xmm1, -32(%rdi, %rdx)
+	movdqu	%xmm2, -48(%rdi, %rdx)
+	movdqu	%xmm3, -64(%rdi, %rdx)
+	movntdq	%xmm4, -16(%r9)
+	movntdq	%xmm5, -32(%r9)
+	movntdq	%xmm6, -48(%r9)
+	movntdq	%xmm7, -64(%r9)
+	lea 	-64(%r9), %r9
+
+	lea 	128(%rdi), %rbx
+	and 	$-64, %rbx
+
+/* Compute in %rdx how many bytes are left to copy after
+	the main loop stops.  */
+	mov 	%rbx, %rdx
+	sub 	%rdi, %rdx
+
+	cmp	%r9, %rbx
+	jae	L(mm_len_0_or_more_backward)
+
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%r9, %r8), %xmm0
+	movdqu	-48(%r9, %r8), %xmm1
+	movdqu	-32(%r9, %r8), %xmm2
+	movdqu	-16(%r9, %r8), %xmm3
+	movntdq	%xmm0, -64(%r9)
+	movntdq	%xmm1, -48(%r9)
+	movntdq	%xmm2, -32(%r9)
+	movntdq	%xmm3, -16(%r9)
+	lea 	-64(%r9), %r9
+	cmp	%r9, %rbx
+	jb	L(mm_large_page_loop_backward)
+	jmp	L(mm_len_0_or_more_backward)
+
+END (MEMMOVE)
diff --git a/libc/arch-x86_64/string/sse2-memset-slm.S b/libc/arch-x86_64/string/sse2-memset-slm.S
new file mode 100644
index 0000000..bfcafae
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-memset-slm.S
@@ -0,0 +1,173 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMSET
+# define MEMSET		memset
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function;	\
+	.globl name;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMSET)
+	movq	%rdi, %rax
+#ifdef USE_AS_BZERO_P
+	mov	%rsi, %rdx
+	xor	%rcx, %rcx
+#else
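+/* Replicate the fill byte into all eight bytes of %rcx: multiplying
+	a byte value by 0x0101010101010101 copies it into every byte.  */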
+	and	$0xff, %rsi
+	mov	$0x0101010101010101, %rcx
+	imul	%rsi, %rcx
+#endif
+	cmpq	$16, %rdx
+	jae	L(16bytesormore)
+	testb	$8, %dl
+	jnz	L(8_15bytes)
+	testb	$4, %dl
+	jnz	L(4_7bytes)
+	testb	$2, %dl
+	jnz	L(2_3bytes)
+	testb	$1, %dl
+	jz	L(return)
+	movb	%cl, (%rdi)
+L(return):
+	ret
+
+L(8_15bytes):
+	movq	%rcx, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
+	ret
+
+L(4_7bytes):
+	movl	%ecx, (%rdi)
+	movl	%ecx, -4(%rdi, %rdx)
+	ret
+
+L(2_3bytes):
+	movw	%cx, (%rdi)
+	movw	%cx, -2(%rdi, %rdx)
+	ret
+
+	ALIGN (4)
+L(16bytesormore):
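+/* Get the fill pattern into %xmm0: pxor for bzero, otherwise pshufd
+	broadcasts the byte-replicated low dword to all four dwords.  */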
+#ifdef USE_AS_BZERO_P
+	pxor	%xmm0, %xmm0
+#else
+	movd	%rcx, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, -16(%rdi, %rdx)
+	cmpq	$32, %rdx
+	jbe	L(32bytesless)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi, %rdx)
+	cmpq	$64, %rdx
+	jbe	L(64bytesless)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, -64(%rdi, %rdx)
+	movdqu	%xmm0, -48(%rdi, %rdx)
+	cmpq	$128, %rdx
+	ja	L(128bytesmore)
+L(32bytesless):
+L(64bytesless):
+	ret
+
+	ALIGN (4)
+L(128bytesmore):
+	leaq	64(%rdi), %rcx
+	andq	$-64, %rcx
+	movq	%rdx, %r8
+	addq	%rdi, %rdx
+	andq	$-64, %rdx
+	cmpq	%rcx, %rdx
+	je	L(return)
+
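+/* Fills much larger than the shared cache would only evict useful
+	data, so they take the non-temporal path below.  */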
+#ifdef SHARED_CACHE_SIZE
+	cmp	$SHARED_CACHE_SIZE, %r8
+#else
+	cmp	__x86_64_shared_cache_size(%rip), %r8
+#endif
+	ja	L(128bytesmore_nt)
+
+	ALIGN (4)
+L(128bytesmore_normal):
+	movdqa	%xmm0, (%rcx)
+	movaps	%xmm0, 0x10(%rcx)
+	movaps	%xmm0, 0x20(%rcx)
+	movaps	%xmm0, 0x30(%rcx)
+	addq	$64, %rcx
+	cmpq	%rcx, %rdx
+	jne	L(128bytesmore_normal)
+	ret
+
+	ALIGN (4)
+L(128bytesmore_nt):
+	movntdq	%xmm0, (%rcx)
+	movntdq	%xmm0, 0x10(%rcx)
+	movntdq	%xmm0, 0x20(%rcx)
+	movntdq	%xmm0, 0x30(%rcx)
+	leaq	64(%rcx), %rcx
+	cmpq	%rcx, %rdx
+	jne	L(128bytesmore_nt)
+	sfence
+	ret
+
+END (MEMSET)
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-stpcpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-stpcpy-slm.S
index 9d0a563..0ad2d44 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-stpcpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STPCPY
+#define STRCPY		stpcpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-stpncpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-stpncpy-slm.S
index 9d0a563..3066685 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-stpncpy-slm.S
@@ -1,42 +1,34 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define USE_AS_STPCPY
+#define STRCPY		stpncpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-strcat-slm.S
similarity index 63%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-strcat-slm.S
index 9d0a563..dd8207f 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-strcat-slm.S
@@ -1,42 +1,87 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef STRCAT
+# define STRCAT		strcat
+#endif
+
+#ifndef L
+# define L(label)		.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc		 .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc		.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+#ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+#endif
+
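+/* strcat is strlen on the destination followed by strcpy at dst+len:
+	RETURN is redefined so that the included strlen code falls through
+	to L(Strcpy) with the length in %rax.  */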
+#define RETURN jmp L(Strcpy)
+#include "sse2-strlen-slm.S"
+
+#undef RETURN
+#define RETURN ret
+
+L(Strcpy):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax	/* save result */
+
+#ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+# define USE_AS_STRNCPY
+#endif
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86_64/string/sse2-strcpy-slm.S b/libc/arch-x86_64/string/sse2-strcpy-slm.S
new file mode 100644
index 0000000..3e146bf
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-strcpy-slm.S
@@ -0,0 +1,1921 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+# ifndef STRCPY
+#  define STRCPY	strcpy
+# endif
+
+# ifndef L
+#  define L(label)	.L##label
+# endif
+
+# ifndef cfi_startproc
+#  define cfi_startproc	.cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+#  define cfi_endproc	.cfi_endproc
+# endif
+
+# ifndef ENTRY
+#  define ENTRY(name)	\
+	.type name, @function;	\
+	.globl name;	\
+	.p2align 4;	\
+name:	\
+	cfi_startproc
+# endif
+
+# ifndef END
+#  define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+# endif
+
+#endif
+
+#define JMPTBL(I, B)	I - B
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	lea	TABLE(%rip), %r11;	\
+	movslq	(%r11, INDEX, SCALE), %rcx;	\
+	lea	(%r11, %rcx), %rcx;	\
+	jmp	*%rcx
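+/* The exit tables store 32-bit offsets relative to the table itself;
+	the macro loads the entry for INDEX, materializes the target
+	address and jumps to it.  */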
+
+#ifndef USE_AS_STRCAT
+
+# define RETURN ret
+
+.text
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8
+	test	%r8, %r8
+	jz	L(ExitZero)
+# endif
+	mov	%rsi, %rcx
+# ifndef USE_AS_STPCPY
+	mov	%rdi, %rax      /* save result */
+# endif
+
+#endif
+	and	$63, %rcx
+	cmp	$32, %rcx
+	jbe	L(SourceStringAlignmentLess32)
+
+	and	$-16, %rsi
+	and	$15, %rcx
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%rsi), %xmm1
+	pmovmskb %xmm1, %rdx
+	shr	%cl, %rdx
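+/* %rsi was rounded down to 16-byte alignment; shift the NUL-search
+	mask right by the misalignment so that bits for bytes before the
+	real start of the string are discarded.  */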
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	mov	$16, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+# else
+	mov	$17, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+# endif
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%rsi), %xmm0
+	pmovmskb %xmm0, %rdx
+#ifdef USE_AS_STRNCPY
+	add	$16, %r10
+	cmp	%r10, %r8
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%rsi, %rcx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%rdi)
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(Unalign16Both):
+	sub	%rcx, %rdi
+#ifdef USE_AS_STRNCPY
+	add	%rcx, %r8
+#endif
+	mov	$16, %rcx
+	movdqa	(%rsi, %rcx), %xmm1
+	movaps	16(%rsi, %rcx), %xmm2
+	movdqu	%xmm1, (%rdi, %rcx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	sub	$48, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%rsi, %rcx), %xmm3
+	movdqu	%xmm2, (%rdi, %rcx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%rsi, %rcx), %xmm4
+	movdqu	%xmm3, (%rdi, %rcx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%rsi, %rcx), %xmm1
+	movdqu	%xmm4, (%rdi, %rcx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%rsi, %rcx), %xmm2
+	movdqu	%xmm1, (%rdi, %rcx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%rsi, %rcx), %xmm3
+	movdqu	%xmm2, (%rdi, %rcx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqu	%xmm3, (%rdi, %rcx)
+	mov	%rsi, %rdx
+	lea	16(%rsi, %rcx), %rsi
+	and	$-0x40, %rsi
+	sub	%rsi, %rdx
+	sub	%rdx, %rdi
+#ifdef USE_AS_STRNCPY
+	lea	128(%r8, %rdx), %r8
+#endif
+L(Unaligned64Loop):
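+/* Scan 64 bytes per iteration: pminub folds the four 16-byte chunks
+	together, so one pcmpeqb against zero detects a NUL anywhere in
+	the block.  */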
+	movaps	(%rsi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%rsi), %xmm5
+	movaps	32(%rsi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rsi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %rdx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+	jnz	L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+	add	$64, %rdi
+	add	$64, %rsi
+	movdqu	%xmm4, -64(%rdi)
+	movaps	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%rdi)
+	movaps	16(%rsi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%rsi), %xmm3
+	movdqu	%xmm6, -32(%rdi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%rdi)
+	movaps	48(%rsi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %rdx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+#endif
+	test	%rdx, %rdx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %rdx
+	pmovmskb %xmm1, %rcx
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%rcx, %rcx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %rdx
+	pmovmskb %xmm1, %rcx
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%rcx, %rdx
+	movdqu	%xmm4, (%rdi)
+	movdqu	%xmm5, 16(%rdi)
+	movdqu	%xmm6, 32(%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	48(%rdi, %rdx), %rax
+# endif
+	movdqu	%xmm7, 48(%rdi)
+	add	$15, %r8
+	sub	%rdx, %r8
+	lea	49(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$48, %rsi
+	add	$48, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLess32):
+	pxor	%xmm0, %xmm0
+	movdqu	(%rsi), %xmm1
+	movdqu	16(%rsi), %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rdx
+
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$16, %r8
+# else
+	cmp	$17, %r8
+# endif
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+#endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	%xmm2, %xmm0
+	movdqu	%xmm1, (%rdi)
+	pmovmskb %xmm0, %rdx
+
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$32, %r8
+# else
+	cmp	$33, %r8
+# endif
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+#endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	and	$15, %rcx
+	and	$-16, %rsi
+
+	jmp	L(Unalign16Both)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+#if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %rsi
+	add	$16, %rdi
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$16, %r8
+#endif
+L(CopyFrom1To16BytesTail1):
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	bsf	%rdx, %rdx
+	add	%rcx, %rsi
+	add	$16, %rdx
+	sub	%rcx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+	movdqu	%xmm4, (%rdi)
+	add	$63, %r8
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%rcx, %rdx
+	movdqu	%xmm4, (%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	16(%rdi, %rdx), %rax
+# endif
+	movdqu	%xmm5, 16(%rdi)
+	add	$47, %r8
+	sub	%rdx, %r8
+	lea	17(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$16, %rsi
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%rdx, %rdx
+	movdqu	%xmm4, (%rdi)
+	movdqu	%xmm5, 16(%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	32(%rdi, %rdx), %rax
+# endif
+	movdqu	%xmm6, 32(%rdi)
+	add	$31, %r8
+	sub	%rdx, %r8
+	lea	33(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$32, %rsi
+	add	$32, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+#ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT 
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+# endif
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	add	$16, %rdx
+	sub	%rcx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32BytesCase2)
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %rdi
+	add	$16, %rsi
+	sub	$16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+#endif
+
+/*------------ End of labels for copying 1-16 and 1-32 bytes ----------*/
+
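+/* L(ExitN) is entered with the NUL at offset N-1 and copies exactly N
+	bytes, terminator included, using the widest moves that fit; the
+	strncpy variants then zero-fill whatever space remains.  */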
+	.p2align 4
+L(Exit1):
+	mov	%dh, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$1, %r8
+	lea	1(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	mov	(%rsi), %dx
+	mov	%dx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$2, %r8
+	lea	2(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+	mov	%dh, 2(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$3, %r8
+	lea	3(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$4, %r8
+	lea	4(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	mov	(%rsi), %ecx
+	mov	%dh, 4(%rdi)
+	mov	%ecx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$5, %r8
+	lea	5(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dx
+	mov	%ecx, (%rdi)
+	mov	%dx, 4(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$6, %r8
+	lea	6(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	mov	(%rsi), %ecx
+	mov	3(%rsi), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, 3(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$7, %r8
+	lea	7(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$8, %r8
+	lea	8(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	mov	(%rsi), %rcx
+	mov	%dh, 8(%rdi)
+	mov	%rcx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$9, %r8
+	lea	9(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dx
+	mov	%rcx, (%rdi)
+	mov	%dx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$10, %r8
+	lea	10(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$11, %r8
+	lea	11(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$12, %r8
+	lea	12(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	mov	(%rsi), %rcx
+	mov	5(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$13, %r8
+	lea	13(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	mov	(%rsi), %rcx
+	mov	6(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$14, %r8
+	lea	14(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$15, %r8
+	lea	15(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$16, %r8
+	lea	16(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+	mov	%dh, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$17, %r8
+	lea	17(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$18, %r8
+	lea	18(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$19, %r8
+	lea	19(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$20, %r8
+	lea	20(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+	mov	%dh, 20(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$21, %r8
+	lea	21(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%rsi), %xmm0
+	mov	14(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$22, %r8
+	lea	22(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$23, %r8
+	lea	23(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$24, %r8
+	lea	24(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+	mov	%dh, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$25, %r8
+	lea	25(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$26, %r8
+	lea	26(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	23(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$27, %r8
+	lea	27(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$28, %r8
+	lea	28(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%rsi), %xmm0
+	movdqu	13(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 13(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$29, %r8
+	lea	29(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%rsi), %xmm0
+	movdqu	14(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 14(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$30, %r8
+	lea	30(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%rsi), %xmm0
+	movdqu	15(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$31, %r8
+	lea	31(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$32, %r8
+	lea	32(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+#ifdef USE_AS_STRNCPY
+
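+/* StrncpyExitN: the length limit was reached after exactly N bytes, so
+   copy N bytes with no NUL padding; the strcat variant still terminates
+   the result with a NUL at offset N.  */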
+	.p2align 4
+L(StrncpyExit0):
+#ifdef USE_AS_STPCPY
+	mov	%rdi, %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, (%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit1):
+	mov	(%rsi), %dl
+	mov	%dl, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 1(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit2):
+	mov	(%rsi), %dx
+	mov	%dx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 2(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit3):
+	mov	(%rsi), %cx
+	mov	2(%rsi), %dl
+	mov	%cx, (%rdi)
+	mov	%dl, 2(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 3(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 4(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit5):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dl
+	mov	%ecx, (%rdi)
+	mov	%dl, 4(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 5(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit6):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dx
+	mov	%ecx, (%rdi)
+	mov	%dx, 4(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 6(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit7):
+	mov	(%rsi), %ecx
+	mov	3(%rsi), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, 3(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 7(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8):
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 8(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit9):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dl
+	mov	%rcx, (%rdi)
+	mov	%dl, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 9(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit10):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dx
+	mov	%rcx, (%rdi)
+	mov	%dx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 10(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit11):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 11(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 12(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit13):
+	mov	(%rsi), %rcx
+	mov	5(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 13(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit14):
+	mov	(%rsi), %rcx
+	mov	6(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 14(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 15(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 16(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	mov	%cl, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 17(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 18(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 19(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 20(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	mov	20(%rsi), %dl
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+	mov	%dl, 20(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 21(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%rsi), %xmm0
+	mov	14(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 22(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 23(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 24(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cl, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 25(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 26(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	23(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 27(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 28(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%rsi), %xmm0
+	movdqu	13(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 13(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 29(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%rsi), %xmm0
+	movdqu	14(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 14(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 30(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%rsi), %xmm0
+	movdqu	15(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 31(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	32(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 32(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	mov	32(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+	mov	%cl, 32(%rdi)
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 33(%rdi)
+#endif
+	RETURN
+
+#ifndef USE_AS_STRCAT
+
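+/* FillN: store N zero bytes at %rdi; %rdx and %xmm0 are already zero
+   here.  Sizes such as 3, 7 and 15 use one wider store that starts a
+   byte early and overlaps an already-zeroed byte.  */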
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	mov	%dl, (%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	mov	%dx, (%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	mov	%edx, -1(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	mov	%edx, (%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	mov	%edx, (%rdi)
+	mov	%dl, 4(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	mov	%edx, (%rdi)
+	mov	%dx, 4(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	mov	%rdx, -1(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	mov	%rdx, (%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	mov	%rdx, (%rdi)
+	mov	%dl, 8(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	mov	%rdx, (%rdi)
+	mov	%dx, 8(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	mov	%rdx, (%rdi)
+	mov	%edx, 7(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	mov	%rdx, (%rdi)
+	mov	%edx, 8(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	mov	%rdx, (%rdi)
+	mov	%rdx, 5(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	mov	%rdx, (%rdi)
+	mov	%rdx, 6(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%rdi)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%rdi)
+	RETURN
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+	movdqu	%xmm2, (%rdi, %rcx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):
+	bsf	%rdx, %rdx
+	add	$15, %r8
+	add	%rcx, %rdi
+#ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+#endif
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+
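+/* strncpy pads the destination with NUL bytes up to n: zero the first
+   16 bytes with an unaligned store, round %rdi down to 16, then clear
+   64 bytes per iteration with movdqa, finishing via the Fill table.  */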
+	.p2align 4
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%rdx, %rdx
+	sub	$16, %r8
+	jbe	L(StrncpyFillExit)
+
+	movdqu	%xmm0, (%rdi)
+	add	$16, %rdi
+
+	mov	%rdi, %rsi
+	and	$0xf, %rsi
+	sub	%rsi, %rdi
+	add	%rsi, %r8
+	sub	$64, %r8
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm0, 16(%rdi)
+	movdqa	%xmm0, 32(%rdi)
+	movdqa	%xmm0, 48(%rdi)
+	add	$64, %rdi
+	sub	$64, %r8
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %r8
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm0, 16(%rdi)
+	add	$32, %rdi
+	sub	$16, %r8
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%rdi)
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillLess32):
+	add	$16, %r8
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%rdi)
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillExit):
+	add	$16, %r8
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* end of ifndef USE_AS_STRCAT */
+#endif
+
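+/* Leaving the 64-byte copy loop: a nonzero %rdx means a NUL byte was
+   seen in one of the four xmm blocks (case 2); otherwise only the
+   length limit in %r8 ran out (case 3).  */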
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%r8), %rcx
+	and	$-16, %rcx
+	add	$48, %r8
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	64(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 64(%rdi)
+#endif
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%rcx, %rcx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$48, %r8
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+#ifndef USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm4, (%rdi)
+	add	$16, %rcx
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+#ifndef USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm5, 16(%rdi)
+	add	$16, %rcx
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+#ifndef USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+#else
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm6, 32(%rdi)
+	lea	16(%rdi, %rcx), %rdi
+	lea	16(%rsi, %rcx), %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(ExitZero):
+#ifndef USE_AS_STRCAT
+	mov	%rdi, %rax
+#endif
+	RETURN
+
+#endif
+
+#ifndef USE_AS_STRCAT
+END (STRCPY)
+#else
+END (STRCAT)
+#endif
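+
+/* The tables below hold 32-bit entries, each the distance from the
+   table base to an exit label; BRANCH_TO_JMPTBL_ENTRY turns an entry
+   back into an absolute jump target.  */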
+	.section .rodata
+	.p2align 4
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+#ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+	.int	JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+# ifndef USE_AS_STRCAT
+	.p2align 4
+L(FillTable):
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+# endif
+#endif
diff --git a/libc/arch-x86_64/string/sse2-strlen-slm.S b/libc/arch-x86_64/string/sse2-strlen-slm.S
new file mode 100644
index 0000000..3772fe7
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-strlen-slm.S
@@ -0,0 +1,294 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+#ifndef STRLEN
+# define STRLEN		strlen
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+#define RETURN ret
+	.section .text.sse2,"ax",@progbits
+ENTRY (STRLEN)
+/* end ifndef USE_AS_STRCAT */
+#endif
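+/* Only do an unaligned 16-byte first load when the pointer is at most
+   48 bytes into its 64-byte block; otherwise that load could cross
+   into an unmapped page, so use the masked aligned compare at
+   L(next) instead.  */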
+	xor	%rax, %rax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%rdi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	mov	%rdi, %rax
+	and	$-16, %rax
+	jmp	L(align16_start)
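+/* Aligned first compare: shift a -1 mask left by the misalignment so
+   that match bits before the start of the string are ignored.  */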
+L(next):
+	mov	%rdi, %rax
+	and	$-16, %rax
+	pcmpeqb	(%rax), %xmm0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	pmovmskb %xmm0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pcmpeqb	16(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$80, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm1
+	add	$16, %rax
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm2
+	add	$16, %rax
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm3
+	add	$16, %rax
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$16, %rax
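+/* Main loop: pminub folds 64 bytes into %xmm5, which contains a zero
+   byte iff any of the 64 input bytes was zero (%xmm0 is all-zero here,
+   since every earlier pcmpeqb produced an all-zero mask).  */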
+	.p2align 4
+L(align64_loop):
+	movaps	(%rax), %xmm4
+	pminub	16(%rax), %xmm4
+	movaps	32(%rax), %xmm5
+	pminub	48(%rax), %xmm5
+	add	$64, %rax
+	pminub	%xmm4, %xmm5
+	pcmpeqb	%xmm0, %xmm5
+	pmovmskb %xmm5, %edx
+	test	%edx, %edx
+	jz	L(align64_loop)
+
+	pcmpeqb	-64(%rax), %xmm0
+	sub	$80, 	%rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+	RETURN
+
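+/* Exit paths: the result is (aligned base - start) plus the bit index
+   of the first zero byte in the mask, plus the offset of the 16-byte
+   block in which it was found.  */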
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_less16):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	RETURN
+	.p2align 4
+L(exit16):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$16, %rax
+	RETURN
+	.p2align 4
+L(exit32):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$32, %rax
+	RETURN
+	.p2align 4
+L(exit48):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$48, %rax
+	RETURN
+	.p2align 4
+L(exit64):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+#ifndef USE_AS_STRCAT
+	RETURN
+
+END (STRLEN)
+#endif
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-strncat-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-strncat-slm.S
index 9d0a563..6b4a430 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-strncat-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCAT
+#define STRCAT		strncat
+#include "sse2-strcat-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-strncpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-strncpy-slm.S
index 9d0a563..594e78f 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-strncpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define STRCPY		strncpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86_64/string/sse4-memcmp-slm.S b/libc/arch-x86_64/string/sse4-memcmp-slm.S
new file mode 100644
index 0000000..8a8b180
--- /dev/null
+++ b/libc/arch-x86_64/string/sse4-memcmp-slm.S
@@ -0,0 +1,1799 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCMP
+# define MEMCMP		memcmp
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
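+/* Jump-table entries are 32-bit offsets relative to the table base,
+   which keeps the table position independent; BRANCH_TO_JMPTBL_ENTRY
+   rebuilds the absolute target with movslq and an add before the
+   indirect jump.  */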
+#define JMPTBL(I, B)	(I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), %rcx;			\
+  add		%r11, %rcx;					\
+  jmp		*%rcx;						\
+  ud2
+
+	.section .text.sse4.1,"ax",@progbits
+ENTRY (MEMCMP)
+#ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+#endif
+	pxor	%xmm0, %xmm0
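+/* %xmm0 stays all-zero: "ptest %xmmN, %xmm0" sets CF iff %xmmN is
+   zero, so jnc branches exactly when a differing byte was found.  */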
+	cmp	$79, %rdx
+	ja	L(79bytesormore)
+#ifndef USE_AS_WMEMCMP
+	cmp	$1, %rdx
+	je	L(firstbyte)
+#endif
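+/* Short compares: advance both pointers one past the end and dispatch
+   on the length; the L(<N>bytes) cases read backwards using negative
+   offsets.  */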
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+#ifndef USE_AS_WMEMCMP
+	ALIGN (4)
+L(firstbyte):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+#endif
+
+	ALIGN (4)
+L(79bytesormore):
+	movdqu	(%rsi), %xmm1
+	movdqu	(%rdi), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+	mov	%rsi, %rcx
+	and	$-16, %rsi
+	add	$16, %rsi
+	sub	%rsi, %rcx
+
+	sub	%rcx, %rdi
+	add	%rcx, %rdx
+	test	$0xf, %rdi
+	jz	L(2aligned)
+
+	cmp	$128, %rdx
+	ja	L(128bytesormore)
+L(less128bytes):
+	sub	$64, %rdx
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+	cmp	$32, %rdx
+	jb	L(less32bytesin64)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin64):
+	add	$64, %rdi
+	add	$64, %rsi
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(128bytesormore):
+	cmp	$512, %rdx
+	ja	L(512bytesormore)
+	cmp	$256, %rdx
+	ja	L(less512bytes)
+L(less256bytes):
+	sub	$128, %rdx
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqu	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqu	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	add	$128, %rsi
+	add	$128, %rdi
+
+	cmp	$64, %rdx
+	jae	L(less128bytes)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin128)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin128):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(less512bytes):
+	sub	$256, %rdx
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqu	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqu	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	movdqu	128(%rdi), %xmm2
+	pxor	128(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(144bytesin256)
+
+	movdqu	144(%rdi), %xmm2
+	pxor	144(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(160bytesin256)
+
+	movdqu	160(%rdi), %xmm2
+	pxor	160(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(176bytesin256)
+
+	movdqu	176(%rdi), %xmm2
+	pxor	176(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(192bytesin256)
+
+	movdqu	192(%rdi), %xmm2
+	pxor	192(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(208bytesin256)
+
+	movdqu	208(%rdi), %xmm2
+	pxor	208(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(224bytesin256)
+
+	movdqu	224(%rdi), %xmm2
+	pxor	224(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(240bytesin256)
+
+	movdqu	240(%rdi), %xmm2
+	pxor	240(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(256bytesin256)
+
+	add	$256, %rsi
+	add	$256, %rdi
+
+	cmp	$128, %rdx
+	jae	L(less256bytes)
+
+	cmp	$64, %rdx
+	jae	L(less128bytes)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin256)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin256):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %r8
+#else
+	mov	__x86_64_data_cache_size_half(%rip), %r8
+#endif
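+/* Switch to the prefetchnta loop once the compare is larger than about
+   3/4 of the data cache (%r8 = 1.5 * DATA_CACHE_SIZE_HALF).  */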
+	mov	%r8, %r9
+	shr	$1, %r8
+	add	%r9, %r8
+	cmp	%r8, %rdx
+	ja	L(L2_L3_cache_unaligned)
+	sub	$64, %rdx
+	ALIGN (4)
+L(64bytesormore_loop):
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqu	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqu	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqu	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(64bytesormore_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(L2_L3_cache_unaligned):
+	sub	$64, %rdx
+	ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+	prefetchnta 0x1c0(%rdi)
+	prefetchnta 0x1c0(%rsi)
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqu	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqu	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqu	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(L2_L3_unaligned_128bytes_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+/*
+ * This path is for machines that are sensitive to unaligned accesses.
+ */
+	ALIGN (4)
+L(2aligned):
+	cmp	$128, %rdx
+	ja	L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+	sub	$64, %rdx
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+	cmp	$32, %rdx
+	jb	L(less32bytesin64in2aligned)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin64in2aligned):
+	add	$64, %rdi
+	add	$64, %rsi
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(128bytesormorein2aligned):
+	cmp	$512, %rdx
+	ja	L(512bytesormorein2aligned)
+	cmp	$256, %rdx
+	ja	L(256bytesormorein2aligned)
+L(less256bytesin2aligned):
+	sub	$128, %rdx
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqa	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqa	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	add	$128, %rsi
+	add	$128, %rdi
+
+	cmp	$64, %rdx
+	jae	L(less128bytesin2aligned)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin128in2aligned)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin128in2aligned):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(256bytesormorein2aligned):
+
+	sub	$256, %rdx
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqa	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqa	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	movdqa	128(%rdi), %xmm2
+	pxor	128(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(144bytesin256)
+
+	movdqa	144(%rdi), %xmm2
+	pxor	144(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(160bytesin256)
+
+	movdqa	160(%rdi), %xmm2
+	pxor	160(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(176bytesin256)
+
+	movdqa	176(%rdi), %xmm2
+	pxor	176(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(192bytesin256)
+
+	movdqa	192(%rdi), %xmm2
+	pxor	192(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(208bytesin256)
+
+	movdqa	208(%rdi), %xmm2
+	pxor	208(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(224bytesin256)
+
+	movdqa	224(%rdi), %xmm2
+	pxor	224(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(240bytesin256)
+
+	movdqa	240(%rdi), %xmm2
+	pxor	240(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(256bytesin256)
+
+	add	$256, %rsi
+	add	$256, %rdi
+
+	cmp	$128, %rdx
+	jae	L(less256bytesin2aligned)
+
+	cmp	$64, %rdx
+	jae	L(less128bytesin2aligned)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin256in2aligned)
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin256in2aligned):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %r8
+#else
+	mov	__x86_64_data_cache_size_half(%rip), %r8
+#endif
+	mov	%r8, %r9
+	shr	$1, %r8
+	add	%r9, %r8
+	cmp	%r8, %rdx
+	ja	L(L2_L3_cache_aligned)
+
+	sub	$64, %rdx
+	ALIGN (4)
+L(64bytesormore_loopin2aligned):
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqa	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqa	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqa	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(64bytesormore_loopin2aligned)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+L(L2_L3_cache_aligned):
+	sub	$64, %rdx
+	ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+	prefetchnta 0x1c0(%rdi)
+	prefetchnta 0x1c0(%rsi)
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqa	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqa	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqa	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(L2_L3_aligned_128bytes_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+
+	ALIGN (4)
+L(64bytesormore_loop_end):
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm3, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm4, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	jmp	L(16bytes)
+
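+/* Each L(<N>bytesin256) label steps both pointers past the 16-byte
+   block holding the first difference and falls into L(16bytes), which
+   redoes the compare with 8-byte loads to compute the signed result.  */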
+L(256bytesin256):
+	add	$256, %rdi
+	add	$256, %rsi
+	jmp	L(16bytes)
+L(240bytesin256):
+	add	$240, %rdi
+	add	$240, %rsi
+	jmp	L(16bytes)
+L(224bytesin256):
+	add	$224, %rdi
+	add	$224, %rsi
+	jmp	L(16bytes)
+L(208bytesin256):
+	add	$208, %rdi
+	add	$208, %rsi
+	jmp	L(16bytes)
+L(192bytesin256):
+	add	$192, %rdi
+	add	$192, %rsi
+	jmp	L(16bytes)
+L(176bytesin256):
+	add	$176, %rdi
+	add	$176, %rsi
+	jmp	L(16bytes)
+L(160bytesin256):
+	add	$160, %rdi
+	add	$160, %rsi
+	jmp	L(16bytes)
+L(144bytesin256):
+	add	$144, %rdi
+	add	$144, %rsi
+	jmp	L(16bytes)
+L(128bytesin256):
+	add	$128, %rdi
+	add	$128, %rsi
+	jmp	L(16bytes)
+L(112bytesin256):
+	add	$112, %rdi
+	add	$112, %rsi
+	jmp	L(16bytes)
+L(96bytesin256):
+	add	$96, %rdi
+	add	$96, %rsi
+	jmp	L(16bytes)
+L(80bytesin256):
+	add	$80, %rdi
+	add	$80, %rsi
+	jmp	L(16bytes)
+L(64bytesin256):
+	add	$64, %rdi
+	add	$64, %rsi
+	jmp	L(16bytes)
+L(48bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(32bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(16bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(16bytes):
+	mov	-16(%rdi), %rax
+	mov	-16(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(8bytes):
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(12bytes):
+	mov	-12(%rdi), %rax
+	mov	-12(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(4bytes):
+	mov	-4(%rsi), %ecx
+	mov	-4(%rdi), %eax
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+L(0bytes):
+	xor	%eax, %eax
+	ret
+
+#ifndef USE_AS_WMEMCMP
+/* not reachable for wmemcmp: its length is always a multiple of 4 */
+	ALIGN (4)
+L(65bytes):
+	movdqu	-65(%rdi), %xmm1
+	movdqu	-65(%rsi), %xmm2
+	mov	$-65, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(49bytes):
+	movdqu	-49(%rdi), %xmm1
+	movdqu	-49(%rsi), %xmm2
+	mov	$-49, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%rdi), %xmm1
+	movdqu	-33(%rsi), %xmm2
+	mov	$-33, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%rdi), %rax
+	mov	-17(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(9bytes):
+	mov	-9(%rdi), %rax
+	mov	-9(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(13bytes):
+	mov	-13(%rdi), %rax
+	mov	-13(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(5bytes):
+	mov	-5(%rdi), %eax
+	mov	-5(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(66bytes):
+	movdqu	-66(%rdi), %xmm1
+	movdqu	-66(%rsi), %xmm2
+	mov	$-66, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(50bytes):
+	movdqu	-50(%rdi), %xmm1
+	movdqu	-50(%rsi), %xmm2
+	mov	$-50, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	movdqu	-34(%rdi), %xmm1
+	movdqu	-34(%rsi), %xmm2
+	mov	$-34, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%rdi), %rax
+	mov	-18(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(10bytes):
+	mov	-10(%rdi), %rax
+	mov	-10(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(14bytes):
+	mov	-14(%rdi), %rax
+	mov	-14(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(6bytes):
+	mov	-6(%rdi), %eax
+	mov	-6(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+L(2bytes):
+	movzwl	-2(%rsi), %ecx
+	movzwl	-2(%rdi), %eax
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(67bytes):
+	movdqu	-67(%rdi), %xmm2
+	movdqu	-67(%rsi), %xmm1
+	mov	$-67, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(51bytes):
+	movdqu	-51(%rdi), %xmm2
+	movdqu	-51(%rsi), %xmm1
+	mov	$-51, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	movdqu	-35(%rsi), %xmm1
+	movdqu	-35(%rdi), %xmm2
+	mov	$-35, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	mov	-19(%rdi), %rax
+	mov	-19(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(11bytes):
+	mov	-11(%rdi), %rax
+	mov	-11(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(15bytes):
+	mov	-15(%rdi), %rax
+	mov	-15(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(7bytes):
+	mov	-7(%rdi), %eax
+	mov	-7(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(3bytes):
+	movzwl	-3(%rdi), %eax
+	movzwl	-3(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin2bytes)
+L(1bytes):
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+#endif
+
+	ALIGN (4)
+L(68bytes):
+	movdqu	-68(%rdi), %xmm2
+	movdqu	-68(%rsi), %xmm1
+	mov	$-68, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(52bytes):
+	movdqu	-52(%rdi), %xmm2
+	movdqu	-52(%rsi), %xmm1
+	mov	$-52, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%rdi), %xmm2
+	movdqu	-36(%rsi), %xmm1
+	mov	$-36, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%rdi), %xmm2
+	movdqu	-20(%rsi), %xmm1
+	mov	$-20, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+#ifndef USE_AS_WMEMCMP
+/* not reachable for wmemcmp: lengths are multiples of 4 */
+	ALIGN (4)
+L(69bytes):
+	movdqu	-69(%rsi), %xmm1
+	movdqu	-69(%rdi), %xmm2
+	mov	$-69, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(53bytes):
+	movdqu	-53(%rsi), %xmm1
+	movdqu	-53(%rdi), %xmm2
+	mov	$-53, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	movdqu	-37(%rsi), %xmm1
+	movdqu	-37(%rdi), %xmm2
+	mov	$-37, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	movdqu	-21(%rsi), %xmm1
+	movdqu	-21(%rdi), %xmm2
+	mov	$-21, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(70bytes):
+	movdqu	-70(%rsi), %xmm1
+	movdqu	-70(%rdi), %xmm2
+	mov	$-70, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(54bytes):
+	movdqu	-54(%rsi), %xmm1
+	movdqu	-54(%rdi), %xmm2
+	mov	$-54, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	movdqu	-38(%rsi), %xmm1
+	movdqu	-38(%rdi), %xmm2
+	mov	$-38, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	movdqu	-22(%rsi), %xmm1
+	movdqu	-22(%rdi), %xmm2
+	mov	$-22, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(71bytes):
+	movdqu	-71(%rsi), %xmm1
+	movdqu	-71(%rdi), %xmm2
+	mov	$-71, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(55bytes):
+	movdqu	-55(%rdi), %xmm2
+	movdqu	-55(%rsi), %xmm1
+	mov	$-55, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	movdqu	-39(%rdi), %xmm2
+	movdqu	-39(%rsi), %xmm1
+	mov	$-39, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	movdqu	-23(%rdi), %xmm2
+	movdqu	-23(%rsi), %xmm1
+	mov	$-23, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+#endif
+
+	ALIGN (4)
+L(72bytes):
+	movdqu	-72(%rsi), %xmm1
+	movdqu	-72(%rdi), %xmm2
+	mov	$-72, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(56bytes):
+	movdqu	-56(%rdi), %xmm2
+	movdqu	-56(%rsi), %xmm1
+	mov	$-56, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	movdqu	-40(%rdi), %xmm2
+	movdqu	-40(%rsi), %xmm1
+	mov	$-40, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	movdqu	-24(%rdi), %xmm2
+	movdqu	-24(%rsi), %xmm1
+	mov	$-24, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+#ifndef USE_AS_WMEMCMP
+/* "unreal" cases: these sizes cannot occur for wmemcmp, whose byte count is a multiple of 4 */
+	ALIGN (4)
+L(73bytes):
+	movdqu	-73(%rsi), %xmm1
+	movdqu	-73(%rdi), %xmm2
+	mov	$-73, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(57bytes):
+	movdqu	-57(%rdi), %xmm2
+	movdqu	-57(%rsi), %xmm1
+	mov	$-57, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	movdqu	-41(%rdi), %xmm2
+	movdqu	-41(%rsi), %xmm1
+	mov	$-41, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	movdqu	-25(%rdi), %xmm2
+	movdqu	-25(%rsi), %xmm1
+	mov	$-25, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%rdi), %rax
+	mov	-9(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(74bytes):
+	movdqu	-74(%rsi), %xmm1
+	movdqu	-74(%rdi), %xmm2
+	mov	$-74, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(58bytes):
+	movdqu	-58(%rdi), %xmm2
+	movdqu	-58(%rsi), %xmm1
+	mov	$-58, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	movdqu	-42(%rdi), %xmm2
+	movdqu	-42(%rsi), %xmm1
+	mov	$-42, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	movdqu	-26(%rdi), %xmm2
+	movdqu	-26(%rsi), %xmm1
+	mov	$-26, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-10(%rdi), %rax
+	mov	-10(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	jmp	L(diffin2bytes)
+
+	ALIGN (4)
+L(75bytes):
+	movdqu	-75(%rsi), %xmm1
+	movdqu	-75(%rdi), %xmm2
+	mov	$-75, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(59bytes):
+	movdqu	-59(%rdi), %xmm2
+	movdqu	-59(%rsi), %xmm1
+	mov	$-59, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	movdqu	-43(%rdi), %xmm2
+	movdqu	-43(%rsi), %xmm1
+	mov	$-43, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	movdqu	-27(%rdi), %xmm2
+	movdqu	-27(%rsi), %xmm1
+	mov	$-27, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-11(%rdi), %rax
+	mov	-11(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+#endif
+	ALIGN (4)
+L(76bytes):
+	movdqu	-76(%rsi), %xmm1
+	movdqu	-76(%rdi), %xmm2
+	mov	$-76, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(60bytes):
+	movdqu	-60(%rdi), %xmm2
+	movdqu	-60(%rsi), %xmm1
+	mov	$-60, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	movdqu	-44(%rdi), %xmm2
+	movdqu	-44(%rsi), %xmm1
+	mov	$-44, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	movdqu	-28(%rdi), %xmm2
+	movdqu	-28(%rsi), %xmm1
+	mov	$-28, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-12(%rdi), %rax
+	mov	-12(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+#ifndef USE_AS_WMEMCMP
+/* "unreal" cases: these sizes cannot occur for wmemcmp, whose byte count is a multiple of 4 */
+	ALIGN (4)
+L(77bytes):
+	movdqu	-77(%rsi), %xmm1
+	movdqu	-77(%rdi), %xmm2
+	mov	$-77, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(61bytes):
+	movdqu	-61(%rdi), %xmm2
+	movdqu	-61(%rsi), %xmm1
+	mov	$-61, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	movdqu	-45(%rdi), %xmm2
+	movdqu	-45(%rsi), %xmm1
+	mov	$-45, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	movdqu	-29(%rdi), %xmm2
+	movdqu	-29(%rsi), %xmm1
+	mov	$-29, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%rdi), %rax
+	mov	-13(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(78bytes):
+	movdqu	-78(%rsi), %xmm1
+	movdqu	-78(%rdi), %xmm2
+	mov	$-78, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(62bytes):
+	movdqu	-62(%rdi), %xmm2
+	movdqu	-62(%rsi), %xmm1
+	mov	$-62, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	movdqu	-46(%rdi), %xmm2
+	movdqu	-46(%rsi), %xmm1
+	mov	$-46, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	movdqu	-30(%rdi), %xmm2
+	movdqu	-30(%rsi), %xmm1
+	mov	$-30, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%rdi), %rax
+	mov	-14(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(79bytes):
+	movdqu	-79(%rsi), %xmm1
+	movdqu	-79(%rdi), %xmm2
+	mov	$-79, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(63bytes):
+	movdqu	-63(%rdi), %xmm2
+	movdqu	-63(%rsi), %xmm1
+	mov	$-63, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	movdqu	-47(%rdi), %xmm2
+	movdqu	-47(%rsi), %xmm1
+	mov	$-47, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	movdqu	-31(%rdi), %xmm2
+	movdqu	-31(%rsi), %xmm1
+	mov	$-31, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-15(%rdi), %rax
+	mov	-15(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+#endif
+	ALIGN (4)
+L(64bytes):
+	movdqu	-64(%rdi), %xmm2
+	movdqu	-64(%rsi), %xmm1
+	mov	$-64, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%rdi), %xmm2
+	movdqu	-48(%rsi), %xmm1
+	mov	$-48, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%rdi), %xmm2
+	movdqu	-32(%rsi), %xmm1
+	mov	$-32, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%rdi), %rax
+	mov	-16(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+/*
+ * Align to 8 bytes to avoid two taken branches in one 16-byte-aligned
+ * code block.
+ */
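+/*
+ * On entry %dl holds the negated byte count stored by the dispatching
+ * length case above ("mov $-N, %dl"), so sign-extending it yields a
+ * negative index from the end of both buffers that re-addresses the
+ * mismatching 16 bytes for a scalar 8-byte comparison.
+ */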
+	ALIGN (3)
+L(less16bytes):
+	movsbq	%dl, %rdx
+	mov	(%rsi, %rdx), %rcx
+	mov	(%rdi, %rdx), %rax
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	8(%rsi, %rdx), %rcx
+	mov	8(%rdi, %rdx), %rax
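+/*
+ * Narrow the differing 8 bytes: compare the low dwords first, shifting
+ * the high dwords down when they match, then repeat at word and byte
+ * width so the result reflects the first differing byte.
+ */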
+L(diffin8bytes):
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	shr	$32, %rcx
+	shr	$32, %rax
+
+#ifdef USE_AS_WMEMCMP
+/* for wmemcmp */
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+#endif
+
+L(diffin4bytes):
+#ifndef USE_AS_WMEMCMP
+	cmp	%cx, %ax
+	jne	L(diffin2bytes)
+	shr	$16, %ecx
+	shr	$16, %eax
+L(diffin2bytes):
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+#else
+
+/* for wmemcmp */
+	mov	$1, %eax
+	jl	L(nequal_bigger)
+	neg	%eax
+	ret
+
+	ALIGN (4)
+L(nequal_bigger):
+	ret
+
+L(unreal_case):
+	xor	%eax, %eax
+	ret
+#endif
+
+	ALIGN (4)
+L(end):
+	and	$0xff, %eax
+	and	$0xff, %ecx
+	sub	%ecx, %eax
+	ret
+
+END (MEMCMP)
+
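+/*
+ * Each table entry below is the 32-bit offset of a length handler
+ * relative to the table base (via the JMPTBL macro), keeping the table
+ * position-independent: the dispatcher adds the loaded offset back to
+ * the table address before its indirect jump.
+ */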
+	.section .rodata.sse4.1,"a",@progbits
+	ALIGN (3)
+#ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.int	JMPTBL (L(65bytes), L(table_64bytes))
+	.int	JMPTBL (L(66bytes), L(table_64bytes))
+	.int	JMPTBL (L(67bytes), L(table_64bytes))
+	.int	JMPTBL (L(68bytes), L(table_64bytes))
+	.int	JMPTBL (L(69bytes), L(table_64bytes))
+	.int	JMPTBL (L(70bytes), L(table_64bytes))
+	.int	JMPTBL (L(71bytes), L(table_64bytes))
+	.int	JMPTBL (L(72bytes), L(table_64bytes))
+	.int	JMPTBL (L(73bytes), L(table_64bytes))
+	.int	JMPTBL (L(74bytes), L(table_64bytes))
+	.int	JMPTBL (L(75bytes), L(table_64bytes))
+	.int	JMPTBL (L(76bytes), L(table_64bytes))
+	.int	JMPTBL (L(77bytes), L(table_64bytes))
+	.int	JMPTBL (L(78bytes), L(table_64bytes))
+	.int	JMPTBL (L(79bytes), L(table_64bytes))
+#else
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(68bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(72bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(76bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+#endif
diff --git a/libc/arch-x86_64/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/string/ssse3-strcmp-slm.S
new file mode 100644
index 0000000..0dd8c27
--- /dev/null
+++ b/libc/arch-x86_64/string/ssse3-strcmp-slm.S
@@ -0,0 +1,1925 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef USE_AS_STRNCMP
+/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+   if the new counter > the old one or is 0.  */
+#define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	lea	-16(%rcx, %r11), %r9;			\
+	cmp	%r9, %r11;				\
+	jb	L(strcmp_exitz);			\
+	test	%r9, %r9;				\
+	je	L(strcmp_exitz);			\
+	mov	%r9, %r11
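+/* In C terms, roughly:
+ *	r9 = rcx + r11 - 16;		// count left after this chunk
+ *	if (r9 > r11 || r9 == 0)	// unsigned wrap or exhausted
+ *		goto strcmp_exitz;
+ *	r11 = r9;
+ */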
+
+#else
+#define UPDATE_STRNCMP_COUNTER
+#ifndef STRCMP
+#define STRCMP		strcmp
+#endif
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+#define RETURN ret
+	.section .text.ssse3,"ax",@progbits
+ENTRY (STRCMP)
+/*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCMP
+	test	%rdx, %rdx
+	je	L(strcmp_exitz)
+	cmp	$1, %rdx
+	je	L(Byte0)
+	mov	%rdx, %r11
+#endif
+	mov	%esi, %ecx
+	mov	%edi, %eax
+/* Use 64bit AND here to avoid long NOP padding.  */
+	and	$0x3f, %rcx		/* rsi alignment in cache line */
+	and	$0x3f, %rax		/* rdi alignment in cache line */
+	cmp	$0x30, %ecx
+	ja	L(crosscache)	/* rsi: 16-byte load will cross cache line */
+	cmp	$0x30, %eax
+	ja	L(crosscache)	/* rdi: 16-byte load will cross cache line */
+	movlpd	(%rdi), %xmm1
+	movlpd	(%rsi), %xmm2
+	movhpd	8(%rdi), %xmm1
+	movhpd	8(%rsi), %xmm2
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
+	jnz	L(less16bytes)	/* If not, find different value or null char */
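+	/*
+	 * After psubb, a byte of %xmm1 is 0xff only where the strings match
+	 * and neither byte is NUL, so the sub above leaves %edx zero exactly
+	 * when all 16 bytes are equal with no terminator.
+	 */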
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)	/* finish comparison */
+#endif
+	add	$16, %rsi		/* prepare to search next 16 bytes */
+	add	$16, %rdi		/* prepare to search next 16 bytes */
+
+	/*
+	 * Determine source and destination string offsets from 16-byte alignment.
+	 * Use relative offset difference between the two to determine which case
+	 * below to use.
+	 */
+	.p2align 4
+L(crosscache):
+	and	$0xfffffffffffffff0, %rsi	/* force %rsi to be 16-byte aligned */
+	and	$0xfffffffffffffff0, %rdi	/* force %rdi to be 16-byte aligned */
+	mov	$0xffff, %edx			/* for equivalent offset */
+	xor	%r8d, %r8d
+	and	$0xf, %ecx			/* offset of rsi */
+	and	$0xf, %eax			/* offset of rdi */
+	cmp	%eax, %ecx
+	je	L(ashr_0)			/* rsi and rdi relative offset same */
+	ja	L(bigger)
+	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
+	xchg	%ecx, %eax
+	xchg	%rsi, %rdi
+L(bigger):
+	lea	15(%rax), %r9
+	sub	%rcx, %r9
+	lea	L(unaligned_table)(%rip), %r10
+	movslq	(%r10, %r9,4), %r9
+	lea	(%r10, %r9), %r10
+	jmp	*%r10				/* jump to corresponding case */
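+/*
+ * %r9 = 15 + (rdi offset) - (rsi offset) indexes L(unaligned_table);
+ * each entry holds its target as an offset relative to the table base,
+ * so the indirect jump above is position-independent.
+ */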
+
+/*
+ * The following cases will be handled by ashr_0
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
+ *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+L(ashr_0):
+
+	movdqa	(%rsi), %xmm1
+	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
+	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
+	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
+	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
+	pmovmskb %xmm1, %r9d
+	shr	%cl, %edx			/* adjust 0xffff for offset */
+	shr	%cl, %r9d			/* adjust for 16-byte offset */
+	sub	%r9d, %edx
+	/*
+	 * %edx must equal %r9d if the bytes from offset (16 - rcx) of %rsi
+	 * match those from offset (16 - rax) of %rdi and no null char was
+	 * seen.
+	 */
+	jne	L(less32bytes)		/* mismatch or null char */
+	UPDATE_STRNCMP_COUNTER
+	mov	$16, %rcx
+	mov	$16, %r9
+	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
+
+	/*
+	 * Now both strings are aligned at 16-byte boundary. Loop over strings
+	 * checking 32-bytes per iteration.
+	 */
+	.p2align 4
+L(loop_ashr_0):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)		/* mismatch or null char seen */
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+	add	$16, %rcx
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+	add	$16, %rcx
+	jmp	L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+L(ashr_1):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+	pslldq	$15, %xmm2		/* shift first string to align with second */
+	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx		/* adjust 0xffff for offset */
+	shr	%cl, %r9d		/* adjust for 16-byte offset */
+	sub	%r9d, %edx
+	jnz	L(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx		/* index for loads */
+	mov	$1, %r9d		/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	1(%rdi), %r10
+	and	$0xfff, %r10		/* offset into 4K page */
+	sub	$0x1000, %r10		/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_1):
+	add	$16, %r10
+	jg	L(nibble_ashr_1)	/* cross page boundary */
+
+L(gobble_ashr_1):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4		 /* store for next cycle */
+
+	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */
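+	/*
+	 * palignr concatenates the previous aligned %rdi block (%xmm3) with
+	 * the current one and shifts the pair right, reconstructing the
+	 * unaligned 16-byte window without an unaligned load.
+	 */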
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_1)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4		/* store for next cycle */
+
+	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_1)
+
+	/*
+	 * The nibble path avoids 16-byte loads that would cross a page
+	 * boundary and could touch unmapped memory past the end of the
+	 * string.
+	 */
+	.p2align 4
+L(nibble_ashr_1):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xfffe, %edx
+	jnz	L(ashr_1_exittail)	/* find null char*/
+
+#ifdef USE_AS_STRNCMP
+	cmp	$14, %r11
+	jbe	L(ashr_1_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	L(gobble_ashr_1)
+
+	/*
+	 * Once a null char is found, determine whether the strings
+	 * mismatch before it.
+	 */
+	.p2align 4
+L(ashr_1_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$1, %xmm0
+	psrldq	$1, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
+ *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+L(ashr_2):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$2, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	2(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_2):
+	add	$16, %r10
+	jg	L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_2)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_2)
+
+	.p2align 4
+L(nibble_ashr_2):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xfffc, %edx
+	jnz	L(ashr_2_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$13, %r11
+	jbe	L(ashr_2_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_2)
+
+	.p2align 4
+L(ashr_2_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$2, %xmm0
+	psrldq	$2, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
+ *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+L(ashr_3):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$3, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	3(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_3):
+	add	$16, %r10
+	jg	L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_3)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_3)
+
+	.p2align 4
+L(nibble_ashr_3):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xfff8, %edx
+	jnz	L(ashr_3_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$12, %r11
+	jbe	L(ashr_3_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_3)
+
+	.p2align 4
+L(ashr_3_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$3, %xmm0
+	psrldq	$3, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
+ *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+L(ashr_4):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$4, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	4(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_4):
+	add	$16, %r10
+	jg	L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_4)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_4)
+
+	.p2align 4
+L(nibble_ashr_4):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xfff0, %edx
+	jnz	L(ashr_4_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$11, %r11
+	jbe	L(ashr_4_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_4)
+
+	.p2align 4
+L(ashr_4_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$4, %xmm0
+	psrldq	$4, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
+ *        n(11~15)          n - 11      	  4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+L(ashr_5):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$5, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	5(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_5):
+	add	$16, %r10
+	jg	L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_5)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_5)
+
+	.p2align 4
+L(nibble_ashr_5):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xffe0, %edx
+	jnz	L(ashr_5_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$10, %r11
+	jbe	L(ashr_5_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_5)
+
+	.p2align 4
+L(ashr_5_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$5, %xmm0
+	psrldq	$5, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
+ *        n(10~15)          n - 10      	  5(15 +(n-10) - n)         ashr_6
+ */
+	.p2align 4
+L(ashr_6):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$6, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	6(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_6):
+	add	$16, %r10
+	jg	L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_6)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_6)
+
+	.p2align 4
+L(nibble_ashr_6):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xffc0, %edx
+	jnz	L(ashr_6_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$9, %r11
+	jbe	L(ashr_6_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_6)
+
+	.p2align 4
+L(ashr_6_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$6, %xmm0
+	psrldq	$6, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
+ *        n(9~15)          n - 9      	        6(15 +(n - 9) - n)         ashr_7
+ */
+	.p2align 4
+L(ashr_7):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$9, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$7, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	7(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_7):
+	add	$16, %r10
+	jg	L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_7)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_7)
+
+	.p2align 4
+L(nibble_ashr_7):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xff80, %edx
+	jnz	L(ashr_7_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %r11
+	jbe	L(ashr_7_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_7)
+
+	.p2align 4
+L(ashr_7_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$7, %xmm0
+	psrldq	$7, %xmm3
+	jmp	L(aftertail)
+
+/*
+ *  The following cases will be handled by ashr_8
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(8~15)          n - 8      	        7(15 +(n - 8) - n)         ashr_8
+ */
+	.p2align 4
+L(ashr_8):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$8, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	8(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_8):
+	add	$16, %r10
+	jg	L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_8)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_8)
+
+	.p2align 4
+L(nibble_ashr_8):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xff00, %edx
+	jnz	L(ashr_8_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %r11
+	jbe	L(ashr_8_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_8)
+
+	.p2align 4
+L(ashr_8_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$8, %xmm0
+	psrldq	$8, %xmm3
+	jmp	L(aftertail)
+
+/*
+ *  The following cases will be handled by ashr_9
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(7~15)          n - 7      	        8(15 +(n - 7) - n)         ashr_9
+ */
+	.p2align 4
+L(ashr_9):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$9, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	9(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_9):
+	add	$16, %r10
+	jg	L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_9)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3		/* store for next cycle */
+	jmp	L(loop_ashr_9)
+
+	.p2align 4
+L(nibble_ashr_9):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xfe00, %edx
+	jnz	L(ashr_9_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %r11
+	jbe	L(ashr_9_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_9)
+
+	.p2align 4
+L(ashr_9_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$9, %xmm0
+	psrldq	$9, %xmm3
+	jmp	L(aftertail)
+
+/*
+ *  The following cases will be handled by ashr_10
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(6~15)          n - 6      	        9(15 +(n - 6) - n)         ashr_10
+ */
+	.p2align 4
+L(ashr_10):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$10, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	10(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_10):
+	add	$16, %r10
+	jg	L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_10)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_10)
+
+	.p2align 4
+L(nibble_ashr_10):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xfc00, %edx
+	jnz	L(ashr_10_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %r11
+	jbe	L(ashr_10_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_10)
+
+	.p2align 4
+L(ashr_10_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$10, %xmm0
+	psrldq	$10, %xmm3
+	jmp	L(aftertail)
+
+/*
+ *  The following cases will be handled by ashr_11
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(5~15)          n - 5      	        10(15 +(n - 5) - n)         ashr_11
+ */
+	.p2align 4
+L(ashr_11):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$11, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	11(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_11):
+	add	$16, %r10
+	jg	L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_11)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_11)
+
+	.p2align 4
+L(nibble_ashr_11):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xf800, %edx
+	jnz	L(ashr_11_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %r11
+	jbe	L(ashr_11_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_11)
+
+	.p2align 4
+L(ashr_11_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$11, %xmm0
+	psrldq	$11, %xmm3
+	jmp	L(aftertail)
+
+/*
+ *  The following cases will be handled by ashr_12
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(4~15)          n - 4      	        11(15 +(n - 4) - n)         ashr_12
+ */
+	.p2align 4
+L(ashr_12):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$12, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	12(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_12):
+	add	$16, %r10
+	jg	L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_12)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_12)
+
+	.p2align 4
+L(nibble_ashr_12):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xf000, %edx
+	jnz	L(ashr_12_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %r11
+	jbe	L(ashr_12_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_12)
+
+	.p2align 4
+L(ashr_12_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$12, %xmm0
+	psrldq	$12, %xmm3
+	jmp	L(aftertail)
+
+/*
+ *  The following cases will be handled by ashr_13
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(3~15)          n - 3      	        12(15 +(n - 3) - n)         ashr_13
+ */
+	.p2align 4
+L(ashr_13):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$13, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	13(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_13):
+	add	$16, %r10
+	jg	L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_13)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_13)
+
+	.p2align 4
+L(nibble_ashr_13):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xe000, %edx
+	jnz	L(ashr_13_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %r11
+	jbe	L(ashr_13_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_13)
+
+	.p2align 4
+L(ashr_13_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq  $13, %xmm0
+	psrldq  $13, %xmm3
+	jmp	L(aftertail)
+
+/*
+ *  The following cases will be handled by ashr_14
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(2~15)          n - 2      	        13(15 +(n - 2) - n)         ashr_14
+ */
+	.p2align 4
+L(ashr_14):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq  $2, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$14, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	14(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_14):
+	add	$16, %r10
+	jg	L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_14)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_14)
+
+	.p2align 4
+L(nibble_ashr_14):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0xc000, %edx
+	jnz	L(ashr_14_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %r11
+	jbe	L(ashr_14_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_14)
+
+	.p2align 4
+L(ashr_14_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$14, %xmm0
+	psrldq	$14, %xmm3
+	jmp	L(aftertail)
+
+/*
+ *  The following cases will be handled by ashr_15
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(1~15)          n - 1      	        14(15 +(n - 1) - n)         ashr_15
+ */
+	.p2align 4
+L(ashr_15):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$1, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	L(less32bytes)
+
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$15, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page
+	 * boundary.  When %r10 goes positive we have crossed into a
+	 * new page and must take the nibble path.
+	 */
+	lea	15(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	.p2align 4
+L(loop_ashr_15):
+	add	$16, %r10
+	jg	L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	L(nibble_ashr_15)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr $15, %xmm3, %xmm2        /* merge into one 16-byte value */
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	L(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_15)
+
+	.p2align 4
+L(nibble_ashr_15):
+	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
+	pmovmskb %xmm0, %edx
+	test	$0x8000, %edx
+	jnz	L(ashr_15_exittail)
+
+#ifdef USE_AS_STRNCMP
+	test	%r11, %r11
+	je	L(ashr_15_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10
+	jmp	L(gobble_ashr_15)
+
+	.p2align 4
+L(ashr_15_exittail):
+	movdqa	(%rsi, %rcx), %xmm1
+	psrldq	$15, %xmm3
+	psrldq	$15, %xmm0
+
+	.p2align 4
+L(aftertail):
+	pcmpeqb	%xmm3, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	not	%edx
+
+	.p2align 4
+L(exit):
+	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
+L(less32bytes):
+	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
+	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
+	test	%r8d, %r8d
+	jz	L(ret)
+	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
+
+	.p2align 4
+L(ret):
+L(less16bytes):
+	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
+
+#ifdef USE_AS_STRNCMP
+	sub	%rdx, %r11
+	jbe	L(strcmp_exitz)
+#endif
+	movzbl	(%rsi, %rdx), %ecx
+	movzbl	(%rdi, %rdx), %eax
+
+	sub	%ecx, %eax
+	ret
+
+L(strcmp_exitz):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(Byte0):
+	movzx	(%rsi), %ecx
+	movzx	(%rdi), %eax
+
+	sub	%ecx, %eax
+	ret
+END (STRCMP)
+
+	.section .rodata,"a",@progbits
+	.p2align 3
+L(unaligned_table):
+	.int	L(ashr_1) - L(unaligned_table)
+	.int	L(ashr_2) - L(unaligned_table)
+	.int	L(ashr_3) - L(unaligned_table)
+	.int	L(ashr_4) - L(unaligned_table)
+	.int	L(ashr_5) - L(unaligned_table)
+	.int	L(ashr_6) - L(unaligned_table)
+	.int	L(ashr_7) - L(unaligned_table)
+	.int	L(ashr_8) - L(unaligned_table)
+	.int	L(ashr_9) - L(unaligned_table)
+	.int	L(ashr_10) - L(unaligned_table)
+	.int	L(ashr_11) - L(unaligned_table)
+	.int	L(ashr_12) - L(unaligned_table)
+	.int	L(ashr_13) - L(unaligned_table)
+	.int	L(ashr_14) - L(unaligned_table)
+	.int	L(ashr_15) - L(unaligned_table)
+	.int	L(ashr_0) - L(unaligned_table)
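
A note on the page-boundary bookkeeping shared by all of the ashr cases above: %r10 is primed to (page offset of the next load) minus 4096 and bumped by 16 per 16-byte load, so it turns positive exactly when a load would spill into the next page and the string must be checked byte-wise first. A minimal C++ sketch of that arithmetic (the address below is illustrative, not from the source):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Hypothetical string address 16 bytes before the end of a 4 KiB page.
      const uintptr_t addr = 0x7f0000000ff0;
      // The lea/and/sub prologue: page offset minus the page size.
      intptr_t sentinel = static_cast<intptr_t>(addr & 0xfff) - 0x1000;
      for (int load = 0; load < 3; ++load) {
        sentinel += 16;                      // 'add $16, %r10' before each load
        const bool crosses = sentinel > 0;   // the 'jg L(nibble_ashr_N)' test
        std::printf("load %d: sentinel=%ld crosses=%d\n", load,
                    static_cast<long>(sentinel), crosses);
      }
      return 0;
    }
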
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/ssse3-strncmp-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/ssse3-strncmp-slm.S
index 9d0a563..0e40775 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/ssse3-strncmp-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define USE_AS_STRNCMP
+#define STRCMP		strncmp
+#include "ssse3-strcmp-slm.S"
diff --git a/libc/arch-x86_64/syscalls/connect.S b/libc/arch-x86_64/syscalls/__connect.S
similarity index 84%
rename from libc/arch-x86_64/syscalls/connect.S
rename to libc/arch-x86_64/syscalls/__connect.S
index 23cdbae..288484e 100644
--- a/libc/arch-x86_64/syscalls/connect.S
+++ b/libc/arch-x86_64/syscalls/__connect.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(connect)
+ENTRY(__connect)
     movl    $__NR_connect, %eax
     syscall
     cmpq    $-MAX_ERRNO, %rax
@@ -13,4 +13,5 @@
     orq     $-1, %rax
 1:
     ret
-END(connect)
+END(__connect)
+.hidden __connect
diff --git a/libc/arch-x86_64/x86_64.mk b/libc/arch-x86_64/x86_64.mk
index 9bce065..2bcf432 100644
--- a/libc/arch-x86_64/x86_64.mk
+++ b/libc/arch-x86_64/x86_64.mk
@@ -3,11 +3,7 @@
 libc_common_src_files_x86_64 := \
     bionic/index.cpp \
     bionic/memchr.c \
-    bionic/memcmp.c \
-    bionic/memcpy.cpp \
-    bionic/memmove.c \
     bionic/memrchr.c \
-    bionic/memset.c \
     bionic/strchr.cpp \
     bionic/strnlen.c \
     bionic/strrchr.cpp \
@@ -18,16 +14,8 @@
     upstream-freebsd/lib/libc/string/wcslen.c \
     upstream-freebsd/lib/libc/string/wcsrchr.c \
     upstream-freebsd/lib/libc/string/wmemcmp.c \
-    upstream-openbsd/lib/libc/string/bcopy.c \
-    upstream-openbsd/lib/libc/string/strcat.c \
-    upstream-openbsd/lib/libc/string/strcmp.c \
-    upstream-openbsd/lib/libc/string/strcpy.c \
     upstream-openbsd/lib/libc/string/strlcat.c \
     upstream-openbsd/lib/libc/string/strlcpy.c \
-    upstream-openbsd/lib/libc/string/strlen.c \
-    upstream-openbsd/lib/libc/string/strncat.c \
-    upstream-openbsd/lib/libc/string/strncmp.c \
-    upstream-openbsd/lib/libc/string/strncpy.c \
 
 # Fortify implementations of libc functions.
 libc_common_src_files_x86_64 += \
@@ -53,6 +41,23 @@
     arch-x86_64/bionic/vfork.S \
     bionic/__memcmp16.cpp \
 
+libc_bionic_src_files_x86_64 += \
+    arch-x86_64/string/sse2-bcopy-slm.S \
+    arch-x86_64/string/sse2-bzero-slm.S \
+    arch-x86_64/string/sse2-memcpy-slm.S \
+    arch-x86_64/string/sse2-memmove-slm.S \
+    arch-x86_64/string/sse2-memset-slm.S \
+    arch-x86_64/string/sse2-stpcpy-slm.S \
+    arch-x86_64/string/sse2-stpncpy-slm.S \
+    arch-x86_64/string/sse2-strcat-slm.S \
+    arch-x86_64/string/sse2-strcpy-slm.S \
+    arch-x86_64/string/sse2-strlen-slm.S \
+    arch-x86_64/string/sse2-strncat-slm.S \
+    arch-x86_64/string/sse2-strncpy-slm.S \
+    arch-x86_64/string/sse4-memcmp-slm.S \
+    arch-x86_64/string/ssse3-strcmp-slm.S \
+    arch-x86_64/string/ssse3-strncmp-slm.S \
+
 libc_crt_target_cflags_x86_64 += \
     -m64 \
     -I$(LOCAL_PATH)/arch-x86_64/include
diff --git a/libc/bionic/NetdClient.cpp b/libc/bionic/NetdClient.cpp
new file mode 100644
index 0000000..6826ee8
--- /dev/null
+++ b/libc/bionic/NetdClient.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef LIBC_STATIC
+#error NetdClient.cpp should NOT be included in static libc builds.
+#endif
+
+#include <private/NetdClient.h>
+#include <private/libc_logging.h>
+#include <pthread.h>
+
+#include <dlfcn.h>
+
+template <typename FunctionType>
+static void netdClientInitFunction(void* handle, const char* symbol, FunctionType* function) {
+    typedef void (*InitFunctionType)(FunctionType*);
+    InitFunctionType initFunction = reinterpret_cast<InitFunctionType>(dlsym(handle, symbol));
+    if (initFunction != NULL) {
+        initFunction(function);
+    }
+}
+
+static void netdClientInitImpl() {
+    void* netdClientHandle = dlopen("libnetd_client.so", RTLD_LAZY);
+    if (netdClientHandle == NULL) {
+        // If the library is not available, it's not an error. We'll just use
+        // default implementations of functions that it would've overridden.
+        return;
+    }
+    netdClientInitFunction(netdClientHandle, "netdClientInitConnect",
+                           &__netdClientDispatch.connect);
+}
+
+static pthread_once_t netdClientInitOnce = PTHREAD_ONCE_INIT;
+
+extern "C" __LIBC_HIDDEN__ void netdClientInit() {
+    if (pthread_once(&netdClientInitOnce, netdClientInitImpl)) {
+        __libc_format_log(ANDROID_LOG_ERROR, "netdClient",
+                          "Unable to initialize netd_client component.");
+    }
+}
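
NetdClient.cpp only resolves "netdClientInitConnect" and hands it the address of the dispatch slot; whether and how the pointer is replaced is up to the library. A minimal sketch of what the libnetd_client.so side might look like -- only the exported symbol name and its signature come from the code above, the rest is hypothetical:

    // Hypothetical libnetd_client.so implementation of the init hook.
    #include <sys/socket.h>

    typedef int (*ConnectFunctionType)(int, const sockaddr*, socklen_t);

    static ConnectFunctionType gLibcConnect = nullptr;

    static int netdClientConnect(int sockfd, const sockaddr* addr, socklen_t addrlen) {
      // ... e.g. tag the socket or consult netd policy here (elided) ...
      return gLibcConnect(sockfd, addr, addrlen);  // fall through to __connect
    }

    extern "C" void netdClientInitConnect(ConnectFunctionType* function) {
      gLibcConnect = *function;        // remember libc's default pointer
      *function = netdClientConnect;   // install the interposing wrapper
    }
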
diff --git a/libc/bionic/NetdClientDispatch.cpp b/libc/bionic/NetdClientDispatch.cpp
new file mode 100644
index 0000000..31728d2
--- /dev/null
+++ b/libc/bionic/NetdClientDispatch.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <private/NetdClient.h>
+
+#ifdef __i386__
+#define __socketcall __attribute__((__cdecl__))
+#else
+#define __socketcall
+#endif
+
+extern "C" __socketcall int __connect(int, const sockaddr*, socklen_t);
+
+NetdClientDispatch __netdClientDispatch __attribute__((aligned(32))) = {
+    __connect
+};
diff --git a/libc/bionic/connect.cpp b/libc/bionic/connect.cpp
new file mode 100644
index 0000000..c5db46b
--- /dev/null
+++ b/libc/bionic/connect.cpp
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <private/NetdClient.h>
+#include <sys/socket.h>
+
+int connect(int sockfd, const sockaddr* addr, socklen_t addrlen) {
+    return __netdClientDispatch.connect(sockfd, addr, addrlen);
+}
diff --git a/libc/bionic/libc_init_dynamic.cpp b/libc/bionic/libc_init_dynamic.cpp
index 61fb887..3d98861 100644
--- a/libc/bionic/libc_init_dynamic.cpp
+++ b/libc/bionic/libc_init_dynamic.cpp
@@ -58,6 +58,7 @@
   extern void pthread_debug_init(void);
   extern void malloc_debug_init(void);
   extern void malloc_debug_fini(void);
+  extern void netdClientInit(void);
 };
 
 // We flag the __libc_preinit function as a constructor to ensure
@@ -79,6 +80,9 @@
   // Hooks for the debug malloc and pthread libraries to let them know that we're starting up.
   pthread_debug_init();
   malloc_debug_init();
+
+  // Hook for the netd client library to let it know that we're starting up.
+  netdClientInit();
 }
 
 __LIBC_HIDDEN__ void __libc_postfini() {
diff --git a/libc/arch-mips/bionic/__set_tls.c b/libc/bionic/mntent.cpp
similarity index 81%
rename from libc/arch-mips/bionic/__set_tls.c
rename to libc/bionic/mntent.cpp
index 38e3a50..93b6915 100644
--- a/libc/arch-mips/bionic/__set_tls.c
+++ b/libc/bionic/mntent.cpp
@@ -25,11 +25,24 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
-#include <pthread.h>
 
-extern int __set_thread_area(void *u_info);
+#include <mntent.h>
 
-int __set_tls(void *ptr)
-{
-    return __set_thread_area(ptr);
+mntent* getmntent(FILE*) {
+  return NULL;
+}
+
+mntent* getmntent_r(FILE*, struct mntent*, char*, int) {
+  return NULL;
+}
+
+FILE* setmntent(const char* path, const char* mode) {
+  return fopen(path, mode);
+}
+
+int endmntent(FILE* fp) {
+  if (fp != NULL) {
+    fclose(fp);
+  }
+  return 1;
 }
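
Of the new <mntent.h> entry points, only setmntent()/endmntent() do real work so far; both getmntent() variants still return NULL. A caller-side sketch under that assumption:

    #include <mntent.h>
    #include <stdio.h>

    int main() {
      FILE* fp = setmntent("/proc/mounts", "r");  // a plain fopen() here
      if (fp == NULL) return 1;
      // getmntent() is still a stub returning NULL, so this loop body never
      // runs yet; the shape is what portable callers already write.
      for (mntent* me = getmntent(fp); me != NULL; me = getmntent(fp)) {
        printf("%s on %s type %s\n", me->mnt_fsname, me->mnt_dir, me->mnt_type);
      }
      endmntent(fp);
      return 0;
    }
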
diff --git a/libc/bionic/stubs.cpp b/libc/bionic/stubs.cpp
index 2c2b74b..9b025df 100644
--- a/libc/bionic/stubs.cpp
+++ b/libc/bionic/stubs.cpp
@@ -89,9 +89,11 @@
   // Copy the strings.
   snprintf(buf, byte_count, "%s%c%s%c%s", src->pw_name, 0, src->pw_dir, 0, src->pw_shell);
 
-  // pw_passwd is non-POSIX and unused (always NULL) in bionic.
-  // pw_gecos is non-POSIX and missing in bionic.
+  // pw_passwd and pw_gecos are non-POSIX and unused (always NULL) in bionic.
   dst->pw_passwd = NULL;
+#ifdef __LP64__
+  dst->pw_gecos = NULL;
+#endif
 
   // Copy the integral fields.
   dst->pw_gid = src->pw_gid;
@@ -442,21 +444,6 @@
   UNIMPLEMENTED;
 }
 
-mntent* getmntent(FILE* /*f*/) {
-  UNIMPLEMENTED;
-  return NULL;
-}
-
-FILE* setmntent(const char*, const char*) {
-  UNIMPLEMENTED;
-  return NULL;
-}
-
-int endmntent(FILE*) {
-  UNIMPLEMENTED;
-  return 1; /* Allways returns 1 according to man */
-}
-
 char* ttyname(int /*fd*/) { // NOLINT: implementing bad function.
   UNIMPLEMENTED;
   return NULL;
diff --git a/libc/bionic/tmpfile.cpp b/libc/bionic/tmpfile.cpp
index 8419ff5..602d407 100644
--- a/libc/bionic/tmpfile.cpp
+++ b/libc/bionic/tmpfile.cpp
@@ -57,22 +57,23 @@
 };
 
 static FILE* __tmpfile_dir(const char* tmp_dir) {
-  char buf[PATH_MAX];
-  int path_length = snprintf(buf, sizeof(buf), "%s/tmp.XXXXXXXXXX", tmp_dir);
-  if (path_length >= static_cast<int>(sizeof(buf))) {
+  char* path = NULL;
+  if (asprintf(&path, "%s/tmp.XXXXXXXXXX", tmp_dir) == -1) {
     return NULL;
   }
 
   int fd;
   {
     ScopedSignalBlocker ssb;
-    fd = mkstemp(buf);
+    fd = mkstemp(path);
     if (fd == -1) {
+      free(path);
       return NULL;
     }
 
     // Unlink the file now so that it's removed when closed.
-    unlink(buf);
+    unlink(path);
+    free(path);
 
     // Can we still use the file now it's unlinked?
     // File systems without hard link support won't have the usual Unix semantics.
diff --git a/libc/bionic/wchar.cpp b/libc/bionic/wchar.cpp
index b46ad49..5da882f 100644
--- a/libc/bionic/wchar.cpp
+++ b/libc/bionic/wchar.cpp
@@ -32,23 +32,69 @@
 #include <wchar.h>
 
 //
-// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a 12-byte mbstate_t
-// so we're backwards-compatible with our LP32 ABI where mbstate_t was only 4 bytes. An additional
-// advantage of this is that callers who don't supply their own mbstate_t won't be accessing shared
-// state.
+// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
+// 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
+// mbstate_t was only 4 bytes.
 //
-// We also implement the POSIX interface directly rather than being accessed via function pointers.
+// The state is the UTF-8 sequence. We only support <= 4-bytes sequences so LP32
+// mbstate_t already has enough space (out of the 4 available bytes we only
+// need 3 since we should never need to store the entire sequence in the
+// intermediary state).
+//
+// The C standard leaves the conversion state undefined after a bad conversion.
+// To avoid unexpected failures due to the possible use of the internal private
+// state we always reset the conversion state when encountering illegal
+// sequences.
+//
+// We also implement the POSIX interface directly rather than being accessed via
+// function pointers.
 //
 
 #define ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1)
 #define ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2)
 
-int mbsinit(const mbstate_t*) {
-  // We have no state, so we're always in the initial state.
-  return 1;
+static size_t mbstate_bytes_so_far(const mbstate_t* ps) {
+  return
+    (ps->__seq[2] != 0) ? 3 :
+    (ps->__seq[1] != 0) ? 2 :
+    (ps->__seq[0] != 0) ? 1 : 0;
 }
 
-size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t*) {
+static void mbstate_set_byte(mbstate_t* ps, int i, char byte) {
+  ps->__seq[i] = static_cast<uint8_t>(byte);
+}
+
+static uint8_t mbstate_get_byte(const mbstate_t* ps, int n) {
+  return ps->__seq[n];
+}
+
+static size_t reset_and_return_illegal(int _errno, mbstate_t* ps) {
+  errno = _errno;
+  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+  return ERR_ILLEGAL_SEQUENCE;
+}
+
+static size_t reset_and_return(int _return, mbstate_t* ps) {
+  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+  return _return;
+}
+
+
+int mbsinit(const mbstate_t* ps) {
+  return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0));
+}
+
+size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
+  static mbstate_t __private_state;
+  mbstate_t* state = (ps == NULL) ? &__private_state : ps;
+
+  // We should never get to a state which has all 4 bytes of the sequence set.
+  // Full state verification is done when decoding the sequence (after we have
+  // all the bytes).
+  if (mbstate_get_byte(state, 3) != 0) {
+    return reset_and_return_illegal(EINVAL, state);
+  }
+
   if (s == NULL) {
     s = "";
     n = 1;
@@ -59,8 +105,8 @@
     return 0;
   }
 
-  int ch;
-  if (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0) {
+  uint8_t ch;
+  if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) {
     // Fast path for plain ASCII characters.
     if (pwc != NULL) {
       *pwc = ch;
@@ -82,7 +128,9 @@
   // between character codes and their multibyte representations.
   wchar_t lower_bound;
 
-  ch = static_cast<uint8_t>(*s);
+  // The first byte in the state (if any) tells the length.
+  size_t bytes_so_far = mbstate_bytes_so_far(state);
+  ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s);
   if ((ch & 0x80) == 0) {
     mask = 0x7f;
     length = 1;
@@ -101,106 +149,144 @@
     lower_bound = 0x10000;
   } else {
     // Malformed input; input is not UTF-8. See RFC 3629.
-    errno = EILSEQ;
-    return ERR_ILLEGAL_SEQUENCE;
+    return reset_and_return_illegal(EILSEQ, state);
+  }
+
+  // Fill in the state.
+  size_t bytes_wanted = length - bytes_so_far;
+  size_t i;
+  for (i = 0; i < MIN(bytes_wanted, n); i++) {
+    if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) {
+      // Malformed input; bad characters in the middle of a character.
+      return reset_and_return_illegal(EILSEQ, state);
+    }
+    mbstate_set_byte(state, bytes_so_far + i, *s++);
+  }
+  if (i < bytes_wanted) {
+    return ERR_INCOMPLETE_SEQUENCE;
   }
 
   // Decode the octet sequence representing the character in chunks
   // of 6 bits, most significant first.
-  wchar_t wch = static_cast<uint8_t>(*s++) & mask;
-  size_t i;
-  for (i = 1; i < MIN(length, n); i++) {
-    if ((*s & 0xc0) != 0x80) {
-      // Malformed input; bad characters in the middle of a character.
-      errno = EILSEQ;
-      return ERR_ILLEGAL_SEQUENCE;
-    }
+  wchar_t wch = mbstate_get_byte(state, 0) & mask;
+  for (i = 1; i < length; i++) {
     wch <<= 6;
-    wch |= *s++ & 0x3f;
+    wch |= mbstate_get_byte(state, i) & 0x3f;
   }
-  if (i < length) {
-    return ERR_INCOMPLETE_SEQUENCE;
-  }
+
   if (wch < lower_bound) {
     // Malformed input; redundant encoding.
-    errno = EILSEQ;
-    return ERR_ILLEGAL_SEQUENCE;
+    return reset_and_return_illegal(EILSEQ, state);
   }
   if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) {
     // Malformed input; invalid code points.
-    errno = EILSEQ;
-    return ERR_ILLEGAL_SEQUENCE;
+    return reset_and_return_illegal(EILSEQ, state);
   }
   if (pwc != NULL) {
     *pwc = wch;
   }
-  return (wch == L'\0' ? 0 : length);
+  return reset_and_return(wch == L'\0' ? 0 : bytes_wanted, state);
 }
 
 size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
+  static mbstate_t __private_state;
+  mbstate_t* state = (ps == NULL) ? &__private_state : ps;
   size_t i, o, r;
 
   if (dst == NULL) {
+    /*
+     * The fast path in the loop below is not safe if an ASCII
+     * character appears as anything but the first byte of a
+     * multibyte sequence. Check now to avoid doing it in the loop.
+     */
+    if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
+        && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
+      return reset_and_return_illegal(EILSEQ, state);
+    }
     for (i = o = 0; i < nmc; i += r, o++) {
       if (static_cast<uint8_t>((*src)[i]) < 0x80) {
         // Fast path for plain ASCII characters.
         if ((*src)[i] == '\0') {
-          return o;
+          return reset_and_return(o, state);
         }
         r = 1;
       } else {
-        r = mbrtowc(NULL, *src + i, nmc - i, ps);
+        r = mbrtowc(NULL, *src + i, nmc - i, state);
         if (r == ERR_ILLEGAL_SEQUENCE) {
-          return r;
+          return reset_and_return_illegal(EILSEQ, state);
         }
         if (r == ERR_INCOMPLETE_SEQUENCE) {
-          return o;
+          return reset_and_return_illegal(EILSEQ, state);
         }
         if (r == 0) {
-          return o;
+          return reset_and_return(o, state);
         }
       }
     }
-    return o;
+    return reset_and_return(o, state);
   }
 
+  /*
+   * The fast path in the loop below is not safe if an ASCII
+   * character appears as anything but the first byte of a
+   * multibyte sequence. Check now to avoid doing it in the loop.
+   */
+  if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
+      && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
+    return reset_and_return_illegal(EILSEQ, state);
+  }
   for (i = o = 0; i < nmc && o < len; i += r, o++) {
     if (static_cast<uint8_t>((*src)[i]) < 0x80) {
       // Fast path for plain ASCII characters.
       dst[o] = (*src)[i];
       if ((*src)[i] == '\0') {
         *src = NULL;
-        return o;
+        return reset_and_return(o, state);
       }
       r = 1;
     } else {
-      r = mbrtowc(dst + o, *src + i, nmc - i, ps);
+      r = mbrtowc(dst + o, *src + i, nmc - i, state);
       if (r == ERR_ILLEGAL_SEQUENCE) {
         *src += i;
-        return r;
+        return reset_and_return_illegal(EILSEQ, state);
       }
       if (r == ERR_INCOMPLETE_SEQUENCE) {
         *src += nmc;
-        return o;
+        return reset_and_return_illegal(EILSEQ, state);
       }
       if (r == 0) {
         *src = NULL;
-        return o;
+        return reset_and_return(o, state);
       }
     }
   }
   *src += i;
-  return o;
+  return reset_and_return(o, state);
 }
 
 size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
   return mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
 }
 
-size_t wcrtomb(char* s, wchar_t wc, mbstate_t*) {
+size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
+  static mbstate_t __private_state;
+  mbstate_t* state = (ps == NULL) ? &__private_state : ps;
+
   if (s == NULL) {
-    // Reset to initial shift state (no-op).
-    return 1;
+    // Equivalent to wcrtomb(buf, L'\0', ps).
+    return reset_and_return(1, state);
+  }
+
+  // POSIX states that if wc is a null wide character, a null byte shall be
+  // stored, preceded by any shift sequence needed to restore the initial shift
+  // state. Since shift states are not supported, only the null byte is stored.
+  if (wc == L'\0') {
+    *s = '\0';
+    return reset_and_return(1, state);
+  }
+
+  if (!mbsinit(state)) {
+    return reset_and_return_illegal(EILSEQ, state);
   }
 
   if ((wc & ~0x7f) == 0) {
@@ -246,6 +332,13 @@
 }
 
 size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
+  static mbstate_t __private_state;
+  mbstate_t* state = (ps == NULL) ? &__private_state : ps;
+
+  if (!mbsinit(state)) {
+    return reset_and_return_illegal(EILSEQ, state);
+  }
+
   char buf[MB_LEN_MAX];
   size_t i, o, r;
   if (dst == NULL) {
@@ -258,7 +351,7 @@
         }
         r = 1;
       } else {
-        r = wcrtomb(buf, wc, ps);
+        r = wcrtomb(buf, wc, state);
         if (r == ERR_ILLEGAL_SEQUENCE) {
           return r;
         }
@@ -279,14 +372,14 @@
       r = 1;
     } else if (len - o >= sizeof(buf)) {
       // Enough space to translate in-place.
-      r = wcrtomb(dst + o, wc, ps);
+      r = wcrtomb(dst + o, wc, state);
       if (r == ERR_ILLEGAL_SEQUENCE) {
         *src += i;
         return r;
       }
     } else {
       // May not be enough space; use temp buffer.
-      r = wcrtomb(buf, wc, ps);
+      r = wcrtomb(buf, wc, state);
       if (r == ERR_ILLEGAL_SEQUENCE) {
         *src += i;
         return r;
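
With the conversion state actually persisted, a UTF-8 character split across two reads can now be decoded incrementally, which the old stateless implementation could not do. A short sketch of standard mbrtowc() usage (this assumes a UTF-8-native C library such as bionic's; other libcs may need a setlocale() call first):

    #include <stdio.h>
    #include <string.h>
    #include <wchar.h>

    int main() {
      // U+20AC EURO SIGN is the three-byte UTF-8 sequence E2 82 AC.
      const char bytes[] = "\xe2\x82\xac";
      mbstate_t state;
      memset(&state, 0, sizeof(state));

      wchar_t wc;
      // Feed one byte: incomplete, the state buffers 0xE2.
      size_t r1 = mbrtowc(&wc, bytes, 1, &state);
      printf("incomplete: %d\n", r1 == (size_t)-2);
      // Feed the remaining two bytes: the buffered byte completes the character.
      size_t r2 = mbrtowc(&wc, bytes + 1, 2, &state);
      printf("consumed %zu more bytes, wc = U+%04X\n", r2, (unsigned)wc);
      return 0;
    }
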
diff --git a/libc/include/arpa/inet.h b/libc/include/arpa/inet.h
index b008812..067be1f 100644
--- a/libc/include/arpa/inet.h
+++ b/libc/include/arpa/inet.h
@@ -36,19 +36,18 @@
 
 typedef uint32_t in_addr_t;
 
-extern uint32_t      inet_addr(const char *);
-
-extern int           inet_aton(const char *, struct in_addr *);
-extern char*         inet_ntoa(struct in_addr);
-
-extern int           inet_pton(int, const char *, void *);
-extern const char*   inet_ntop(int, const void *, char *, socklen_t);
-
-extern unsigned int  inet_nsap_addr(const char *, unsigned char *, int);
-extern char*         inet_nsap_ntoa(int, const unsigned char *, char *);
+in_addr_t inet_addr(const char*);
+int inet_aton(const char*, struct in_addr*);
+in_addr_t inet_lnaof(struct in_addr);
+struct in_addr inet_makeaddr(in_addr_t, in_addr_t);
+in_addr_t inet_netof(struct in_addr);
+in_addr_t inet_network(const char*);
+char* inet_ntoa(struct in_addr);
+const char* inet_ntop(int, const void*, char*, socklen_t);
+unsigned int inet_nsap_addr(const char*, unsigned char*, int);
+char* inet_nsap_ntoa(int, const unsigned char*, char*);
+int inet_pton(int, const char*, void*);
 
 __END_DECLS
 
 #endif /* _ARPA_INET_H_ */
-
-
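
The prototypes now carry POSIX types (in_addr_t rather than uint32_t for inet_addr) and gain the lnaof/makeaddr/netof/network family backed by the OpenBSD sources added in this change. A quick round-trip check with two of them:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>

    int main() {
      struct in_addr addr;
      char buf[INET_ADDRSTRLEN];
      // Presentation -> network -> presentation round trip.
      if (inet_pton(AF_INET, "192.168.1.42", &addr) != 1) return 1;
      if (inet_ntop(AF_INET, &addr, buf, sizeof(buf)) == NULL) return 1;
      printf("%s (host-order word: 0x%08x)\n", buf, ntohl(addr.s_addr));
      return 0;
    }
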
diff --git a/libc/include/dirent.h b/libc/include/dirent.h
index bfe4ea4..71eb2e7 100644
--- a/libc/include/dirent.h
+++ b/libc/include/dirent.h
@@ -56,6 +56,8 @@
 struct dirent { __DIRENT64_BODY };
 struct dirent64 { __DIRENT64_BODY };
 
+#undef __DIRENT64_BODY
+
 #define d_fileno d_ino
 
 typedef struct DIR DIR;
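
The new #undef matters because __DIRENT64_BODY exists only to stamp out the two struct variants; without it the helper macro would leak into every translation unit that includes the header. The pattern in miniature (names are illustrative):

    // Define one body, stamp out both structs, then retract the macro.
    #define __EXAMPLE_BODY \
      long e_ino;          \
      char e_name[256];

    struct example { __EXAMPLE_BODY };
    struct example64 { __EXAMPLE_BODY };

    #undef __EXAMPLE_BODY  // nothing leaks into including code

    int main() {
      return sizeof(struct example) == sizeof(struct example64) ? 0 : 1;
    }
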
diff --git a/libc/include/mntent.h b/libc/include/mntent.h
index 6cc0b18..de285d0 100644
--- a/libc/include/mntent.h
+++ b/libc/include/mntent.h
@@ -35,23 +35,21 @@
 #define MOUNTED _PATH_MOUNTED
 #define MNTTYPE_IGNORE "ignore"
 
-struct mntent
-{
-    char* mnt_fsname;
-    char* mnt_dir;
-    char* mnt_type;
-    char* mnt_opts;
-    int mnt_freq;
-    int mnt_passno;
+struct mntent {
+  char* mnt_fsname;
+  char* mnt_dir;
+  char* mnt_type;
+  char* mnt_opts;
+  int mnt_freq;
+  int mnt_passno;
 };
 
-
 __BEGIN_DECLS
 
-
-struct mntent* getmntent(FILE*);
-FILE* setmntent(const char*, const char*);
 int endmntent(FILE*);
+struct mntent* getmntent(FILE*);
+struct mntent* getmntent_r(FILE*, struct mntent*, char*, int);
+FILE* setmntent(const char*, const char*);
 
 __END_DECLS
 
diff --git a/libc/include/nsswitch.h b/libc/include/nsswitch.h
deleted file mode 100644
index d19d055..0000000
--- a/libc/include/nsswitch.h
+++ /dev/null
@@ -1,236 +0,0 @@
-/*	$NetBSD: nsswitch.h,v 1.18 2005/11/29 03:12:58 christos Exp $	*/
-
-/*-
- * Copyright (c) 1997, 1998, 1999, 2004 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Luke Mewburn.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *        This product includes software developed by the NetBSD
- *        Foundation, Inc. and its contributors.
- * 4. Neither the name of The NetBSD Foundation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _NSSWITCH_H
-#define _NSSWITCH_H	1
-
-#include <sys/types.h>
-#include <stdarg.h>
-
-#define	NSS_MODULE_INTERFACE_VERSION	0
-
-#ifndef _PATH_NS_CONF
-#define _PATH_NS_CONF	"/etc/nsswitch.conf"
-#endif
-
-#define	NS_CONTINUE	0
-#define	NS_RETURN	1
-
-/*
- * Layout of:
- *	uint32_t ns_src.flags
- */ 
-	/* nsswitch.conf status codes and nsdispatch(3) return values */
-#define	NS_SUCCESS	(1<<0)		/* entry was found */
-#define	NS_UNAVAIL	(1<<1)		/* source not responding, or corrupt */
-#define	NS_NOTFOUND	(1<<2)		/* source responded 'no such entry' */
-#define	NS_TRYAGAIN	(1<<3)		/* source busy, may respond to retrys */
-#define	NS_STATUSMASK	0x000000ff	/* bitmask to get the status flags */
-
-	/* internal nsdispatch(3) flags; not settable in nsswitch.conf(5)  */
-#define	NS_FORCEALL	(1<<8)		/* force all methods to be invoked; */
-
-/*
- * Currently implemented sources.
- */
-#define NSSRC_FILES	"files"		/* local files */
-#define	NSSRC_DNS	"dns"		/* DNS; IN for hosts, HS for others */
-#define	NSSRC_NIS	"nis"		/* YP/NIS */
-#define	NSSRC_COMPAT	"compat"	/* passwd,group in YP compat mode */
-
-/*
- * Currently implemented databases.
- */
-#define NSDB_HOSTS		"hosts"
-#define NSDB_GROUP		"group"
-#define NSDB_GROUP_COMPAT	"group_compat"
-#define NSDB_NETGROUP		"netgroup"
-#define NSDB_NETWORKS		"networks"
-#define NSDB_PASSWD		"passwd"
-#define NSDB_PASSWD_COMPAT	"passwd_compat"
-#define NSDB_SHELLS		"shells"
-
-/*
- * Suggested databases to implement.
- */
-#define NSDB_ALIASES		"aliases"
-#define NSDB_AUTH		"auth"
-#define NSDB_AUTOMOUNT		"automount"
-#define NSDB_BOOTPARAMS		"bootparams"
-#define NSDB_ETHERS		"ethers"
-#define NSDB_EXPORTS		"exports"
-#define NSDB_NETMASKS		"netmasks"
-#define NSDB_PHONES		"phones"
-#define NSDB_PRINTCAP		"printcap"
-#define NSDB_PROTOCOLS		"protocols"
-#define NSDB_REMOTE		"remote"
-#define NSDB_RPC		"rpc"
-#define NSDB_SENDMAILVARS	"sendmailvars"
-#define NSDB_SERVICES		"services"
-#define NSDB_TERMCAP		"termcap"
-#define NSDB_TTYS		"ttys"
-
-/*
- * ns_dtab `callback' function signature.
- */
-typedef	int (*nss_method)(void *, void *, va_list);
-
-/*
- * ns_dtab - `nsswitch dispatch table'
- * Contains an entry for each source and the appropriate function to call.
- */
-typedef struct {
-	const char	 *src;
-	nss_method	 callback;
-	void		 *cb_data;
-} ns_dtab;
-
-/*
- * Macros to help build an ns_dtab[]
- */
-#define NS_FILES_CB(F,C)	{ NSSRC_FILES,	F,	__UNCONST(C) },
-#define NS_COMPAT_CB(F,C)	{ NSSRC_COMPAT,	F,	__UNCONST(C) },
- 
-#ifdef HESIOD
-#   define NS_DNS_CB(F,C)	{ NSSRC_DNS,	F,	__UNCONST(C) },
-#else
-#   define NS_DNS_CB(F,C)
-#endif
-
-#ifdef YP
-#   define NS_NIS_CB(F,C)	{ NSSRC_NIS,	F,	__UNCONST(C) },
-#else
-#   define NS_NIS_CB(F,C)
-#endif
-
-/*
- * ns_src - `nsswitch source'
- * Used by the nsparser routines to store a mapping between a source
- * and its dispatch control flags for a given database.
- */
-typedef struct {
-	const char	*name;
-	uint32_t	 flags;
-} ns_src;
-
-
-/*
- * Default sourcelists (if nsswitch.conf is missing, corrupt,
- * or the requested database doesn't have an entry)
- */
-extern const ns_src __nsdefaultsrc[];
-extern const ns_src __nsdefaultcompat[];
-extern const ns_src __nsdefaultcompat_forceall[];
-extern const ns_src __nsdefaultfiles[];
-extern const ns_src __nsdefaultfiles_forceall[];
-extern const ns_src __nsdefaultnis[];
-extern const ns_src __nsdefaultnis_forceall[];
-
-
-/*
- * ns_mtab - `nsswitch method table'
- * An nsswitch module provides a mapping from (database name, method name)
- * tuples to the nss_method and associated callback data.  Effectively,
- * ns_dtab, but used for dynamically loaded modules.
- */
-typedef struct {
-	const char	*database;
-	const char	*name;
-	nss_method	 method;
-	void		*mdata;
-} ns_mtab;
-
-/*
- * nss_module_register_fn - module registration function
- *	called at module load
- * nss_module_unregister_fn - module un-registration function
- *	called at module unload
- */
-typedef	void (*nss_module_unregister_fn)(ns_mtab *, u_int);
-typedef	ns_mtab *(*nss_module_register_fn)(const char *, u_int *,
-					   nss_module_unregister_fn *);
-
-#ifdef _NS_PRIVATE
-
-/*
- * Private data structures for back-end nsswitch implementation.
- */
-
-/*
- * ns_dbt - `nsswitch database thang'
- * For each database in /etc/nsswitch.conf there is a ns_dbt, with its
- * name and a list of ns_src's containing the source information.
- */
-typedef struct {
-	const char	*name;		/* name of database */
-	ns_src		*srclist;	/* list of sources */
-	u_int		 srclistsize;	/* size of srclist */
-} ns_dbt;
-
-/*
- * ns_mod - `nsswitch module'
- */
-typedef struct {
-	const char	*name;		/* module name */
-	void		*handle;	/* handle from dlopen() */
-	ns_mtab		*mtab;		/* method table */
-	u_int		 mtabsize;	/* size of mtab */
-					/* called to unload module */
-	nss_module_unregister_fn unregister;
-} ns_mod;
-
-#endif /* _NS_PRIVATE */
-
-
-#include <sys/cdefs.h>
-
-__BEGIN_DECLS
-int	nsdispatch(void *, const ns_dtab [], const char *,
-			const char *, const ns_src [], ...);
-
-#ifdef _NS_PRIVATE
-int		 _nsdbtaddsrc(ns_dbt *, const ns_src *);
-void		 _nsdbtdump(const ns_dbt *);
-int		 _nsdbtput(const ns_dbt *);
-void		 _nsyyerror(const char *);
-int		 _nsyylex(void);
-#endif /* _NS_PRIVATE */
-
-__END_DECLS
-
-#endif /* !_NSSWITCH_H */
diff --git a/libc/include/pwd.h b/libc/include/pwd.h
index 6f3fad5..6d483c0 100644
--- a/libc/include/pwd.h
+++ b/libc/include/pwd.h
@@ -100,12 +100,15 @@
 
 struct passwd
 {
-    char* pw_name;
-    char* pw_passwd;
-    uid_t pw_uid;
-    gid_t pw_gid;
-    char* pw_dir;
-    char* pw_shell;
+  char* pw_name;
+  char* pw_passwd;
+  uid_t pw_uid;
+  gid_t pw_gid;
+#ifdef __LP64__
+  char* pw_gecos;
+#endif
+  char* pw_dir;
+  char* pw_shell;
 };
 
 __BEGIN_DECLS
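
pw_gecos is added for LP64 only: inserting a pointer mid-struct would shift pw_dir and pw_shell and break every already-compiled LP32 binary, while the new 64-bit ABI can afford the glibc-compatible layout from day one. A sketch of the resulting layout difference (for illustration):

    #include <pwd.h>
    #include <stddef.h>
    #include <stdio.h>

    int main() {
      // On LP64, pw_gecos sits between pw_gid and pw_dir; on LP32 bionic it
      // does not exist, so pw_dir keeps its historical offset.
      printf("pw_dir offset: %zu\n", offsetof(struct passwd, pw_dir));
    #ifdef __LP64__
      printf("pw_gecos offset: %zu\n", offsetof(struct passwd, pw_gecos));
    #endif
      return 0;
    }
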
diff --git a/libc/include/sched.h b/libc/include/sched.h
index 68115bb..e43b6cc 100644
--- a/libc/include/sched.h
+++ b/libc/include/sched.h
@@ -59,10 +59,10 @@
 extern int sched_getcpu(void);
 extern int setns(int, int);
 
-#ifdef __LP32__
-#define CPU_SETSIZE 32
-#else
+#ifdef __LP64__
 #define CPU_SETSIZE 1024
+#else
+#define CPU_SETSIZE 32
 #endif
 
 #define __CPU_BITTYPE  unsigned long int  /* mandated by the kernel  */
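
The flip to __LP64__ is a behavior fix, not a tidy-up: compilers predefine __LP64__ on 64-bit targets but nothing defines __LP32__, so the old test sent every build -- 32-bit included -- into the 1024-CPU branch. The macros in use, as a quick sketch:

    #define _GNU_SOURCE 1  // for the CPU_* macros on glibc; bionic exposes them directly
    #include <sched.h>
    #include <stdio.h>

    int main() {
      cpu_set_t set;
      CPU_ZERO(&set);
      CPU_SET(0, &set);
      // Now 32 on LP32 and 1024 on LP64, as the #ifdef intends.
      printf("CPU_SETSIZE=%d, cpu0 set: %d\n", CPU_SETSIZE, CPU_ISSET(0, &set));
      return 0;
    }
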
diff --git a/libc/include/sys/endian.h b/libc/include/sys/endian.h
index cbde121..be4c905 100644
--- a/libc/include/sys/endian.h
+++ b/libc/include/sys/endian.h
@@ -39,6 +39,8 @@
 #include <sys/cdefs.h>
 #include <machine/endian.h>
 
+#include <stdint.h>
+
 #define _LITTLE_ENDIAN	1234
 #define _BIG_ENDIAN	4321
 #define _PDP_ENDIAN	3412
@@ -186,14 +188,22 @@
 #define letoh64(x) (x)
 #endif /* __BSD_VISIBLE */
 
-#define htons(x) __swap16(x)
+/* glibc compatibility. */
+__BEGIN_DECLS
+uint32_t htonl(uint32_t) __pure2;
+uint16_t htons(uint16_t) __pure2;
+uint32_t ntohl(uint32_t) __pure2;
+uint16_t ntohs(uint16_t) __pure2;
+__END_DECLS
+
 #define htonl(x) __swap32(x)
-#define ntohs(x) __swap16(x)
+#define htons(x) __swap16(x)
 #define ntohl(x) __swap32(x)
+#define ntohs(x) __swap16(x)
 
 /* Bionic additions */
-#define ntohq(x) __swap64(x)
 #define htonq(x) __swap64(x)
+#define ntohq(x) __swap64(x)
 
 #define __LITTLE_ENDIAN_BITFIELD
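
Declaring htonl/htons/ntohl/ntohs as real functions and then shadowing the names with macros keeps both spellings working: a normal call expands to the inline byte swap, while suppressing the macro reaches the out-of-line OpenBSD definitions added below (which begin with #undef for exactly this reason). A sketch of the two call paths:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/endian.h>  // bionic/BSD spelling of this header

    int main() {
      uint16_t port = 8080;
      uint16_t a = htons(port);    // macro path: inline __swap16()
      // Parenthesising the name suppresses the function-like macro, so this
      // resolves to the out-of-line htons() -- why the declarations must exist.
      uint16_t b = (htons)(port);
      printf("%u %u\n", (unsigned)a, (unsigned)b);
      return 0;
    }
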
 
diff --git a/libc/include/sys/stat.h b/libc/include/sys/stat.h
index e62e76d..c0c168b 100644
--- a/libc/include/sys/stat.h
+++ b/libc/include/sys/stat.h
@@ -130,6 +130,8 @@
 struct stat { __STAT64_BODY };
 struct stat64 { __STAT64_BODY };
 
+#undef __STAT64_BODY
+
 #define st_atimensec st_atime_nsec
 #define st_mtimensec st_mtime_nsec
 #define st_ctimensec st_ctime_nsec
diff --git a/libc/include/sys/statvfs.h b/libc/include/sys/statvfs.h
index 3d8179e..3495546 100644
--- a/libc/include/sys/statvfs.h
+++ b/libc/include/sys/statvfs.h
@@ -23,6 +23,12 @@
 
 __BEGIN_DECLS
 
+#ifdef __LP64__
+#define __STATVFS64_RESERVED uint32_t __f_reserved[6];
+#else
+#define __STATVFS64_RESERVED
+#endif
+
 #define __STATVFS64_BODY \
   unsigned long f_bsize; \
   unsigned long f_frsize; \
@@ -35,10 +41,14 @@
   unsigned long f_fsid; \
   unsigned long f_flag; \
   unsigned long f_namemax; \
+  __STATVFS64_RESERVED
 
 struct statvfs { __STATVFS64_BODY };
 struct statvfs64 { __STATVFS64_BODY };
 
+#undef __STATVFS64_BODY
+#undef __STATVFS64_RESERVED
+
 #define ST_RDONLY      0x0001
 #define ST_NOSUID      0x0002
 #define ST_NODEV       0x0004
diff --git a/libc/include/sys/vfs.h b/libc/include/sys/vfs.h
index cd6044d..5358ffb 100644
--- a/libc/include/sys/vfs.h
+++ b/libc/include/sys/vfs.h
@@ -107,6 +107,8 @@
 struct statfs { __STATFS64_BODY };
 struct statfs64 { __STATFS64_BODY };
 
+#undef __STATFS64_BODY
+
 /* Declare that we have the f_namelen, f_frsize, and f_flags fields. */
 #define _STATFS_F_NAMELEN
 #define _STATFS_F_FRSIZE
diff --git a/libc/include/wchar.h b/libc/include/wchar.h
index fe2fe07..4ac468d 100644
--- a/libc/include/wchar.h
+++ b/libc/include/wchar.h
@@ -41,11 +41,9 @@
 
 typedef __WINT_TYPE__  wint_t;
 typedef struct {
-#ifdef __LP32__
-  int dummy;
-#else
-  // 8 bytes should be enough to support at least UTF-8
-  char __reserved[8];
+  uint8_t __seq[4];
+#ifdef __LP64__
+  char __reserved[4];
 #endif
 } mbstate_t;
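
Four __seq bytes suffice because UTF-8 sequences are at most four bytes and a fully buffered sequence is decoded (and the state reset) within the same call, so at most three pending bytes ever persist; and since a pending byte is never 0x00, the all-zero pattern can double as the initial state, as mbstate_bytes_so_far() in wchar.cpp assumes. A sketch of that invariant:

    #include <stdint.h>
    #include <stdio.h>

    // Mirror of the header's state: pending UTF-8 bytes, zero-filled.
    // A stored byte is never 0x00 (lead bytes are >= 0xC2, continuations
    // 0x80..0xBF), so "all zero" doubles as the initial conversion state.
    struct State { uint8_t seq[4]; };

    static int bytes_so_far(const State& s) {
      return (s.seq[2] != 0) ? 3 : (s.seq[1] != 0) ? 2 : (s.seq[0] != 0) ? 1 : 0;
    }

    int main() {
      State s = {{0xE2, 0x82, 0, 0}};  // two buffered bytes of a 3-byte sequence
      printf("pending bytes: %d\n", bytes_so_far(s));
      return 0;
    }
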
 
diff --git a/libc/private/NetdClient.h b/libc/private/NetdClient.h
new file mode 100644
index 0000000..48c05cb
--- /dev/null
+++ b/libc/private/NetdClient.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PRIVATE_NETD_CLIENT_H
+#define PRIVATE_NETD_CLIENT_H
+
+#include <sys/socket.h>
+
+struct NetdClientDispatch {
+    int (*connect)(int, const sockaddr*, socklen_t);
+};
+
+extern NetdClientDispatch __netdClientDispatch;
+
+#endif  // PRIVATE_NETD_CLIENT_H
diff --git a/libc/upstream-netbsd/lib/libc/inet/inet_ntop.c b/libc/upstream-netbsd/lib/libc/inet/inet_ntop.c
deleted file mode 100644
index 00c55a8..0000000
--- a/libc/upstream-netbsd/lib/libc/inet/inet_ntop.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*	$NetBSD: inet_ntop.c,v 1.9 2012/03/20 17:08:13 matt Exp $	*/
-
-/*
- * Copyright (c) 2004 by Internet Systems Consortium, Inc. ("ISC")
- * Copyright (c) 1996-1999 by Internet Software Consortium.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
- * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
-#if 0
-static const char rcsid[] = "Id: inet_ntop.c,v 1.5 2005/11/03 22:59:52 marka Exp";
-#else
-__RCSID("$NetBSD: inet_ntop.c,v 1.9 2012/03/20 17:08:13 matt Exp $");
-#endif
-#endif /* LIBC_SCCS and not lint */
-
-#include "port_before.h"
-
-#include "namespace.h"
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <arpa/nameser.h>
-
-#include <assert.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "port_after.h"
-
-#ifdef __weak_alias
-__weak_alias(inet_ntop,_inet_ntop)
-#endif
-
-/*%
- * WARNING: Don't even consider trying to compile this on a system where
- * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
- */
-
-static const char *inet_ntop4(const u_char *src, char *dst, socklen_t size);
-static const char *inet_ntop6(const u_char *src, char *dst, socklen_t size);
-
-/* char *
- * inet_ntop(af, src, dst, size)
- *	convert a network format address to presentation format.
- * return:
- *	pointer to presentation format address (`dst'), or NULL (see errno).
- * author:
- *	Paul Vixie, 1996.
- */
-const char *
-inet_ntop(int af, const void *src, char *dst, socklen_t size)
-{
-
-	_DIAGASSERT(src != NULL);
-	_DIAGASSERT(dst != NULL);
-
-	switch (af) {
-	case AF_INET:
-		return (inet_ntop4(src, dst, size));
-	case AF_INET6:
-		return (inet_ntop6(src, dst, size));
-	default:
-		errno = EAFNOSUPPORT;
-		return (NULL);
-	}
-	/* NOTREACHED */
-}
-
-/* const char *
- * inet_ntop4(src, dst, size)
- *	format an IPv4 address, more or less like inet_ntoa()
- * return:
- *	`dst' (as a const)
- * notes:
- *	(1) uses no statics
- *	(2) takes a u_char* not an in_addr as input
- * author:
- *	Paul Vixie, 1996.
- */
-static const char *
-inet_ntop4(const u_char *src, char *dst, socklen_t size)
-{
-	char tmp[sizeof "255.255.255.255"];
-	int l;
-
-	_DIAGASSERT(src != NULL);
-	_DIAGASSERT(dst != NULL);
-
-	l = snprintf(tmp, sizeof(tmp), "%u.%u.%u.%u",
-	    src[0], src[1], src[2], src[3]);
-	if (l <= 0 || (socklen_t) l >= size) {
-		errno = ENOSPC;
-		return (NULL);
-	}
-	strlcpy(dst, tmp, size);
-	return (dst);
-}
-
-/* const char *
- * inet_ntop6(src, dst, size)
- *	convert IPv6 binary address into presentation (printable) format
- * author:
- *	Paul Vixie, 1996.
- */
-static const char *
-inet_ntop6(const u_char *src, char *dst, socklen_t size)
-{
-	/*
-	 * Note that int32_t and int16_t need only be "at least" large enough
-	 * to contain a value of the specified size.  On some systems, like
-	 * Crays, there is no such thing as an integer variable with 16 bits.
-	 * Keep this in mind if you think this function should have been coded
-	 * to use pointer overlays.  All the world's not a VAX.
-	 */
-	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
-	char *tp, *ep;
-	struct { int base, len; } best, cur;
-	u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
-	int i;
-	int advance;
-
-	_DIAGASSERT(src != NULL);
-	_DIAGASSERT(dst != NULL);
-
-	/*
-	 * Preprocess:
-	 *	Copy the input (bytewise) array into a wordwise array.
-	 *	Find the longest run of 0x00's in src[] for :: shorthanding.
-	 */
-	memset(words, '\0', sizeof words);
-	for (i = 0; i < NS_IN6ADDRSZ; i++)
-		words[i / 2] |= (src[i] << ((1 - (i % 2)) << 3));
-	best.base = -1;
-	best.len = 0;
-	cur.base = -1;
-	cur.len = 0;
-	for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++) {
-		if (words[i] == 0) {
-			if (cur.base == -1)
-				cur.base = i, cur.len = 1;
-			else
-				cur.len++;
-		} else {
-			if (cur.base != -1) {
-				if (best.base == -1 || cur.len > best.len)
-					best = cur;
-				cur.base = -1;
-			}
-		}
-	}
-	if (cur.base != -1) {
-		if (best.base == -1 || cur.len > best.len)
-			best = cur;
-	}
-	if (best.base != -1 && best.len < 2)
-		best.base = -1;
-
-	/*
-	 * Format the result.
-	 */
-	tp = tmp;
-	ep = tmp + sizeof(tmp);
-	for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++) {
-		/* Are we inside the best run of 0x00's? */
-		if (best.base != -1 && i >= best.base &&
-		    i < (best.base + best.len)) {
-			if (i == best.base)
-				*tp++ = ':';
-			continue;
-		}
-		/* Are we following an initial run of 0x00s or any real hex? */
-		if (i != 0) {
-			if (tp + 1 >= ep)
-				return (NULL);
-			*tp++ = ':';
-		}
-		/* Is this address an encapsulated IPv4? */
-		if (i == 6 && best.base == 0 &&
-		    (best.len == 6 ||
-		    (best.len == 7 && words[7] != 0x0001) ||
-		    (best.len == 5 && words[5] == 0xffff))) {
-			if (!inet_ntop4(src+12, tp, (socklen_t)(ep - tp)))
-				return (NULL);
-			tp += strlen(tp);
-			break;
-		}
-		advance = snprintf(tp, (size_t)(ep - tp), "%x", words[i]);
-		if (advance <= 0 || advance >= ep - tp)
-			return (NULL);
-		tp += advance;
-	}
-	/* Was it a trailing run of 0x00's? */
-	if (best.base != -1 && (best.base + best.len) == 
-	    (NS_IN6ADDRSZ / NS_INT16SZ)) {
-		if (tp + 1 >= ep)
-			return (NULL);
-		*tp++ = ':';
-	}
-	if (tp + 1 >= ep)
-		return (NULL);
-	*tp++ = '\0';
-
-	/*
-	 * Check for overflow, copy, and we're done.
-	 */
-	if ((size_t)(tp - tmp) > size) {
-		errno = ENOSPC;
-		return (NULL);
-	}
-	strlcpy(dst, tmp, size);
-	return (dst);
-}
-
-/*! \file */
diff --git a/libc/upstream-netbsd/lib/libc/inet/inet_pton.c b/libc/upstream-netbsd/lib/libc/inet/inet_pton.c
deleted file mode 100644
index c1e9a69..0000000
--- a/libc/upstream-netbsd/lib/libc/inet/inet_pton.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/*	$NetBSD: inet_pton.c,v 1.8 2012/03/13 21:13:38 christos Exp $	*/
-
-/*
- * Copyright (c) 2004 by Internet Systems Consortium, Inc. ("ISC")
- * Copyright (c) 1996,1999 by Internet Software Consortium.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
- * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
-#if 0
-static const char rcsid[] = "Id: inet_pton.c,v 1.5 2005/07/28 06:51:47 marka Exp";
-#else
-__RCSID("$NetBSD: inet_pton.c,v 1.8 2012/03/13 21:13:38 christos Exp $");
-#endif
-#endif /* LIBC_SCCS and not lint */
-
-#include "port_before.h"
-
-#include "namespace.h"
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <arpa/nameser.h>
-#include <stddef.h>
-#include <string.h>
-#include <assert.h>
-#include <ctype.h>
-#include <errno.h>
-
-#include "port_after.h"
-
-#ifdef __weak_alias
-__weak_alias(inet_pton,_inet_pton)
-#endif
-
-/*%
- * WARNING: Don't even consider trying to compile this on a system where
- * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
- */
-
-static int	inet_pton4(const char *src, u_char *dst, int pton);
-static int	inet_pton6(const char *src, u_char *dst);
-
-/* int
- * inet_pton(af, src, dst)
- *	convert from presentation format (which usually means ASCII printable)
- *	to network format (which is usually some kind of binary format).
- * return:
- *	1 if the address was valid for the specified address family
- *	0 if the address wasn't valid (`dst' is untouched in this case)
- *	-1 if some other error occurred (`dst' is untouched in this case, too)
- * author:
- *	Paul Vixie, 1996.
- */
-int
-inet_pton(int af, const char *src, void *dst)
-{
-
-	_DIAGASSERT(src != NULL);
-	_DIAGASSERT(dst != NULL);
-
-	switch (af) {
-	case AF_INET:
-		return (inet_pton4(src, dst, 1));
-	case AF_INET6:
-		return (inet_pton6(src, dst));
-	default:
-		errno = EAFNOSUPPORT;
-		return (-1);
-	}
-	/* NOTREACHED */
-}
-
-/* int
- * inet_pton4(src, dst, pton)
- *	when last arg is 0: inet_aton(). with hexadecimal, octal and shorthand.
- *	when last arg is 1: inet_pton(). decimal dotted-quad only.
- * return:
- *	1 if `src' is a valid input, else 0.
- * notice:
- *	does not touch `dst' unless it's returning 1.
- * author:
- *	Paul Vixie, 1996.
- */
-static int
-inet_pton4(const char *src, u_char *dst, int pton)
-{
-	u_int32_t val;
-	u_int digit, base;
-	ptrdiff_t n;
-	unsigned char c;
-	u_int parts[4];
-	u_int *pp = parts;
-
-	_DIAGASSERT(src != NULL);
-	_DIAGASSERT(dst != NULL);
-
-	c = *src;
-	for (;;) {
-		/*
-		 * Collect number up to ``.''.
-		 * Values are specified as for C:
-		 * 0x=hex, 0=octal, isdigit=decimal.
-		 */
-		if (!isdigit(c))
-			return (0);
-		val = 0; base = 10;
-		if (c == '0') {
-			c = *++src;
-			if (c == 'x' || c == 'X')
-				base = 16, c = *++src;
-			else if (isdigit(c) && c != '9')
-				base = 8;
-		}
-		/* inet_pton() takes decimal only */
-		if (pton && base != 10)
-			return (0);
-		for (;;) {
-			if (isdigit(c)) {
-				digit = c - '0';
-				if (digit >= base)
-					break;
-				val = (val * base) + digit;
-				c = *++src;
-			} else if (base == 16 && isxdigit(c)) {
-				digit = c + 10 - (islower(c) ? 'a' : 'A');
-				if (digit >= 16)
-					break;
-				val = (val << 4) | digit;
-				c = *++src;
-			} else
-				break;
-		}
-		if (c == '.') {
-			/*
-			 * Internet format:
-			 *	a.b.c.d
-			 *	a.b.c	(with c treated as 16 bits)
-			 *	a.b	(with b treated as 24 bits)
-			 *	a	(with a treated as 32 bits)
-			 */
-			if (pp >= parts + 3)
-				return (0);
-			*pp++ = val;
-			c = *++src;
-		} else
-			break;
-	}
-	/*
-	 * Check for trailing characters.
-	 */
-	if (c != '\0' && !isspace(c))
-		return (0);
-	/*
-	 * Concoct the address according to
-	 * the number of parts specified.
-	 */
-	n = pp - parts + 1;
-	/* inet_pton() takes dotted-quad only.  it does not take shorthand. */
-	if (pton && n != 4)
-		return (0);
-	switch (n) {
-
-	case 0:
-		return (0);		/* initial nondigit */
-
-	case 1:				/* a -- 32 bits */
-		break;
-
-	case 2:				/* a.b -- 8.24 bits */
-		if (parts[0] > 0xff || val > 0xffffff)
-			return (0);
-		val |= parts[0] << 24;
-		break;
-
-	case 3:				/* a.b.c -- 8.8.16 bits */
-		if ((parts[0] | parts[1]) > 0xff || val > 0xffff)
-			return (0);
-		val |= (parts[0] << 24) | (parts[1] << 16);
-		break;
-
-	case 4:				/* a.b.c.d -- 8.8.8.8 bits */
-		if ((parts[0] | parts[1] | parts[2] | val) > 0xff)
-			return (0);
-		val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8);
-		break;
-	}
-	if (dst) {
-		val = htonl(val);
-		memcpy(dst, &val, NS_INADDRSZ);
-	}
-	return (1);
-}
-
-/* int
- * inet_pton6(src, dst)
- *	convert presentation level address to network order binary form.
- * return:
- *	1 if `src' is a valid [RFC1884 2.2] address, else 0.
- * notice:
- *	(1) does not touch `dst' unless it's returning 1.
- *	(2) :: in a full address is silently ignored.
- * credit:
- *	inspired by Mark Andrews.
- * author:
- *	Paul Vixie, 1996.
- */
-static int
-inet_pton6(const char *src, u_char *dst)
-{
-	static const char xdigits_l[] = "0123456789abcdef",
-			  xdigits_u[] = "0123456789ABCDEF";
-	u_char tmp[NS_IN6ADDRSZ], *tp, *endp, *colonp;
-	const char *xdigits, *curtok;
-	int ch, seen_xdigits;
-	u_int val;
-
-	_DIAGASSERT(src != NULL);
-	_DIAGASSERT(dst != NULL);
-
-	memset((tp = tmp), '\0', NS_IN6ADDRSZ);
-	endp = tp + NS_IN6ADDRSZ;
-	colonp = NULL;
-	/* Leading :: requires some special handling. */
-	if (*src == ':')
-		if (*++src != ':')
-			return (0);
-	curtok = src;
-	seen_xdigits = 0;
-	val = 0;
-	while ((ch = *src++) != '\0') {
-		const char *pch;
-
-		if ((pch = strchr((xdigits = xdigits_l), ch)) == NULL)
-			pch = strchr((xdigits = xdigits_u), ch);
-		if (pch != NULL) {
-			val <<= 4;
-			val |= (int)(pch - xdigits);
-			if (++seen_xdigits > 4)
-				return (0);
-			continue;
-		}
-		if (ch == ':') {
-			curtok = src;
-			if (!seen_xdigits) {
-				if (colonp)
-					return (0);
-				colonp = tp;
-				continue;
-			} else if (*src == '\0')
-				return (0);
-			if (tp + NS_INT16SZ > endp)
-				return (0);
-			*tp++ = (u_char) (val >> 8) & 0xff;
-			*tp++ = (u_char) val & 0xff;
-			seen_xdigits = 0;
-			val = 0;
-			continue;
-		}
-		if (ch == '.' && ((tp + NS_INADDRSZ) <= endp) &&
-		    inet_pton4(curtok, tp, 1) > 0) {
-			tp += NS_INADDRSZ;
-			seen_xdigits = 0;
-			break;	/*%< '\\0' was seen by inet_pton4(). */
-		}
-		return (0);
-	}
-	if (seen_xdigits) {
-		if (tp + NS_INT16SZ > endp)
-			return (0);
-		*tp++ = (u_char) (val >> 8) & 0xff;
-		*tp++ = (u_char) val & 0xff;
-	}
-	if (colonp != NULL) {
-		/*
-		 * Since some memmove()'s erroneously fail to handle
-		 * overlapping regions, we'll do the shift by hand.
-		 */
-		const ptrdiff_t n = tp - colonp;
-		int i;
-
-		if (tp == endp)
-			return (0);
-		for (i = 1; i <= n; i++) {
-			endp[- i] = colonp[n - i];
-			colonp[n - i] = 0;
-		}
-		tp = endp;
-	}
-	if (tp != endp)
-		return (0);
-	memcpy(dst, tmp, NS_IN6ADDRSZ);
-	return (1);
-}
-
-/*! \file */
diff --git a/libc/upstream-openbsd/lib/libc/net/htonl.c b/libc/upstream-openbsd/lib/libc/net/htonl.c
new file mode 100644
index 0000000..5ab4189
--- /dev/null
+++ b/libc/upstream-openbsd/lib/libc/net/htonl.c
@@ -0,0 +1,21 @@
+/*	$OpenBSD: htonl.c,v 1.6 2005/08/06 20:30:03 espie Exp $ */
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <sys/types.h>
+#include <machine/endian.h>
+
+#undef htonl
+
+u_int32_t
+htonl(u_int32_t x)
+{
+#if BYTE_ORDER == LITTLE_ENDIAN
+	u_char *s = (u_char *)&x;
+	return (u_int32_t)(s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3]);
+#else
+	return x;
+#endif
+}
diff --git a/libc/upstream-openbsd/lib/libc/net/htons.c b/libc/upstream-openbsd/lib/libc/net/htons.c
new file mode 100644
index 0000000..c8b73fd
--- /dev/null
+++ b/libc/upstream-openbsd/lib/libc/net/htons.c
@@ -0,0 +1,21 @@
+/*	$OpenBSD: htons.c,v 1.8 2005/08/06 20:30:03 espie Exp $ */
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <sys/types.h>
+#include <machine/endian.h>
+
+#undef htons
+
+u_int16_t
+htons(u_int16_t x)
+{
+#if BYTE_ORDER == LITTLE_ENDIAN
+	u_char *s = (u_char *) &x;
+	return (u_int16_t)(s[0] << 8 | s[1]);
+#else
+	return x;
+#endif
+}
diff --git a/libc/upstream-netbsd/common/lib/libc/inet/inet_addr.c b/libc/upstream-openbsd/lib/libc/net/inet_addr.c
similarity index 62%
rename from libc/upstream-netbsd/common/lib/libc/inet/inet_addr.c
rename to libc/upstream-openbsd/lib/libc/net/inet_addr.c
index 1360ce9..18762ab 100644
--- a/libc/upstream-netbsd/common/lib/libc/inet/inet_addr.c
+++ b/libc/upstream-openbsd/lib/libc/net/inet_addr.c
@@ -1,6 +1,8 @@
-/*	$NetBSD: inet_addr.c,v 1.3 2012/03/09 15:41:16 christos Exp $	*/
+/*	$OpenBSD: inet_addr.c,v 1.10 2013/11/24 23:51:28 deraadt Exp $	*/
 
 /*
+ * ++Copyright++ 1983, 1990, 1993
+ * -
  * Copyright (c) 1983, 1990, 1993
  *    The Regents of the University of California.  All rights reserved.
  * 
@@ -12,11 +14,7 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- * 	This product includes software developed by the University of
- * 	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  * 
@@ -31,9 +29,7 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- */
-
-/*
+ * -
  * Portions Copyright (c) 1993 by Digital Equipment Corporation.
  * 
  * Permission to use, copy, modify, and distribute this software for any
@@ -51,63 +47,23 @@
  * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  * SOFTWARE.
+ * -
+ * --Copyright--
  */
 
-/*
- * Copyright (c) 2004 by Internet Systems Consortium, Inc. ("ISC")
- * Portions Copyright (c) 1996-1999 by Internet Software Consortium.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
- * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#if !defined(_KERNEL) && !defined(_STANDALONE)
-#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
-#if 0
-static const char sccsid[] = "@(#)inet_addr.c	8.1 (Berkeley) 6/17/93";
-static const char rcsid[] = "Id: inet_addr.c,v 1.2.206.2 2004/03/17 00:29:45 marka Exp";
-#else
-__RCSID("$NetBSD: inet_addr.c,v 1.3 2012/03/09 15:41:16 christos Exp $");
-#endif
-#endif /* LIBC_SCCS and not lint */
-
-#include "port_before.h"
-
-#include "namespace.h"
 #include <sys/types.h>
 #include <sys/param.h>
-
 #include <netinet/in.h>
 #include <arpa/inet.h>
-
 #include <ctype.h>
 
-#include "port_after.h"
-
-#ifdef __weak_alias
-__weak_alias(inet_aton,_inet_aton)
-#endif
-#else
-#include <lib/libkern/libkern.h>
-#include <netinet/in.h>
-#endif
-
 /*
  * Ascii internet address interpretation routine.
  * The value returned is in network order.
  */
-uint32_t
-inet_addr(const char *cp) {
+in_addr_t
+inet_addr(const char *cp)
+{
 	struct in_addr val;
 
 	if (inet_aton(cp, &val))
@@ -123,14 +79,13 @@
  * cannot distinguish between failure and a local broadcast address.
  */
 int
-inet_aton(const char *cp, struct in_addr *addr) {
-	uint32_t val;
-	int base;
-	size_t n;
+inet_aton(const char *cp, struct in_addr *addr)
+{
+	in_addr_t val;
+	int base, n;
 	char c;
-	uint8_t parts[4];
-	uint8_t *pp = parts;
-	int digit;
+	u_int parts[4];
+	u_int *pp = parts;
 
 	c = *cp;
 	for (;;) {
@@ -141,29 +96,25 @@
 		 */
 		if (!isdigit((unsigned char)c))
 			return (0);
-		val = 0; base = 10; digit = 0;
+		val = 0; base = 10;
 		if (c == '0') {
 			c = *++cp;
 			if (c == 'x' || c == 'X')
 				base = 16, c = *++cp;
-			else {
+			else
 				base = 8;
-				digit = 1 ;
-			}
 		}
 		for (;;) {
-			if (isascii(c) && isdigit((unsigned char)c)) {
-				if (base == 8 && (c == '8' || c == '9'))
-					return (0);
+			if (isascii((unsigned char)c) &&
+			    isdigit((unsigned char)c)) {
 				val = (val * base) + (c - '0');
 				c = *++cp;
-				digit = 1;
-			} else if (base == 16 && isascii(c) && 
-				   isxdigit((unsigned char)c)) {
+			} else if (base == 16 &&
+			    isascii((unsigned char)c) &&
+			    isxdigit((unsigned char)c)) {
 				val = (val << 4) |
-					(c + 10 - (islower((unsigned char)c) ? 'a' : 'A'));
+				    (c + 10 - (islower((unsigned char)c) ? 'a' : 'A'));
 				c = *++cp;
-				digit = 1;
 			} else
 				break;
 		}
@@ -174,7 +125,7 @@
 			 *	a.b.c	(with c treated as 16 bits)
 			 *	a.b	(with b treated as 24 bits)
 			 */
-			if (pp >= parts + 3 || val > 0xffU)
+			if (pp >= parts + 3)
 				return (0);
 			*pp++ = val;
 			c = *++cp;
@@ -184,12 +135,8 @@
 	/*
 	 * Check for trailing characters.
 	 */
-	if (c != '\0' && (!isascii(c) || !isspace((unsigned char)c)))
-		return (0);
-	/*
-	 * Did we get a valid digit?
-	 */
-	if (!digit)
+	if (c != '\0' &&
+	    (!isascii((unsigned char)c) || !isspace((unsigned char)c)))
 		return (0);
 	/*
 	 * Concoct the address according to
@@ -197,28 +144,32 @@
 	 */
 	n = pp - parts + 1;
 	switch (n) {
+
+	case 0:
+		return (0);		/* initial nondigit */
+
 	case 1:				/* a -- 32 bits */
 		break;
 
 	case 2:				/* a.b -- 8.24 bits */
-		if (val > 0xffffffU)
+		if ((val > 0xffffff) || (parts[0] > 0xff))
 			return (0);
 		val |= parts[0] << 24;
 		break;
 
 	case 3:				/* a.b.c -- 8.8.16 bits */
-		if (val > 0xffffU)
+		if ((val > 0xffff) || (parts[0] > 0xff) || (parts[1] > 0xff))
 			return (0);
 		val |= (parts[0] << 24) | (parts[1] << 16);
 		break;
 
 	case 4:				/* a.b.c.d -- 8.8.8.8 bits */
-		if (val > 0xffU)
+		if ((val > 0xff) || (parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xff))
 			return (0);
 		val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8);
 		break;
 	}
-	if (addr != NULL)
+	if (addr)
 		addr->s_addr = htonl(val);
 	return (1);
 }
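The comments in inet_aton() above spell out the four classic BSD forms it accepts, with each part optionally octal (leading 0) or hex (leading 0x). A short sketch of accepted and rejected inputs, assuming this implementation:

#include <stdio.h>
#include <arpa/inet.h>

static void parse(const char *s) {
    struct in_addr a;
    if (inet_aton(s, &a))
        printf("%-12s -> %s\n", s, inet_ntoa(a));
    else
        printf("%-12s -> (rejected)\n", s);
}

int main(void) {
    parse("127.0.0.1");   /* a.b.c.d: 8.8.8.8 bits */
    parse("127.1");       /* a.b: b fills the low 24 bits, so 127.0.0.1 */
    parse("0x7f000001");  /* a single hex part covers all 32 bits */
    parse("0177.0.0.1");  /* leading 0 is octal: 0177 == 127 */
    parse("1.2.3.4.5");   /* too many parts */
    return 0;
}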
diff --git a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c b/libc/upstream-openbsd/lib/libc/net/inet_lnaof.c
similarity index 67%
copy from libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
copy to libc/upstream-openbsd/lib/libc/net/inet_lnaof.c
index 6ed023b..b1a58cd 100644
--- a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
+++ b/libc/upstream-openbsd/lib/libc/net/inet_lnaof.c
@@ -1,5 +1,4 @@
-/*	$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $	*/
-
+/*	$OpenBSD: inet_lnaof.c,v 1.6 2005/08/06 20:30:03 espie Exp $ */
 /*
  * Copyright (c) 1983, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -29,36 +28,24 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
-#if 0
-static char sccsid[] = "@(#)inet_ntoa.c	8.1 (Berkeley) 6/4/93";
-#else
-__RCSID("$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $");
-#endif
-#endif /* LIBC_SCCS and not lint */
-
-#include "namespace.h"
-#include <sys/types.h>
-#include <sys/socket.h>
+#include <sys/param.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __weak_alias
-__weak_alias(inet_ntoa,_inet_ntoa)
-#endif
 
 /*
- * Convert network-format internet address
- * to base 256 d.d.d.d representation.
+ * Return the local network address portion of an
+ * internet address; handles class a/b/c network
+ * number formats.
  */
-/*const*/ char *
-inet_ntoa(struct in_addr in) {
-	static char ret[18];
+in_addr_t
+inet_lnaof(struct in_addr in)
+{
+	in_addr_t i = ntohl(in.s_addr);
 
-	strlcpy(ret, "[inet_ntoa error]", sizeof(ret));
-	(void) inet_ntop(AF_INET, &in, ret, (socklen_t)sizeof ret);
-	return ret;
+	if (IN_CLASSA(i))
+		return ((i)&IN_CLASSA_HOST);
+	else if (IN_CLASSB(i))
+		return ((i)&IN_CLASSB_HOST);
+	else
+		return ((i)&IN_CLASSC_HOST);
 }
diff --git a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c b/libc/upstream-openbsd/lib/libc/net/inet_makeaddr.c
similarity index 67%
copy from libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
copy to libc/upstream-openbsd/lib/libc/net/inet_makeaddr.c
index 6ed023b..87d9325 100644
--- a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
+++ b/libc/upstream-openbsd/lib/libc/net/inet_makeaddr.c
@@ -1,5 +1,4 @@
-/*	$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $	*/
-
+/*	$OpenBSD: inet_makeaddr.c,v 1.6 2005/08/06 20:30:03 espie Exp $ */
 /*
  * Copyright (c) 1983, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -29,36 +28,27 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
-#if 0
-static char sccsid[] = "@(#)inet_ntoa.c	8.1 (Berkeley) 6/4/93";
-#else
-__RCSID("$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $");
-#endif
-#endif /* LIBC_SCCS and not lint */
-
-#include "namespace.h"
-#include <sys/types.h>
-#include <sys/socket.h>
+#include <sys/param.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __weak_alias
-__weak_alias(inet_ntoa,_inet_ntoa)
-#endif
 
 /*
- * Convert network-format internet address
- * to base 256 d.d.d.d representation.
+ * Formulate an Internet address from network + host.  Used in
+ * building addresses stored in the ifnet structure.
  */
-/*const*/ char *
-inet_ntoa(struct in_addr in) {
-	static char ret[18];
+struct in_addr
+inet_makeaddr(in_addr_t net, in_addr_t host)
+{
+	in_addr_t addr;
 
-	strlcpy(ret, "[inet_ntoa error]", sizeof(ret));
-	(void) inet_ntop(AF_INET, &in, ret, (socklen_t)sizeof ret);
-	return ret;
+	if (net < 128)
+		addr = (net << IN_CLASSA_NSHIFT) | (host & IN_CLASSA_HOST);
+	else if (net < 65536)
+		addr = (net << IN_CLASSB_NSHIFT) | (host & IN_CLASSB_HOST);
+	else if (net < 16777216L)
+		addr = (net << IN_CLASSC_NSHIFT) | (host & IN_CLASSC_HOST);
+	else
+		addr = net | host;
+	addr = htonl(addr);
+	return (*(struct in_addr *)&addr);
 }
diff --git a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c b/libc/upstream-openbsd/lib/libc/net/inet_netof.c
similarity index 67%
copy from libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
copy to libc/upstream-openbsd/lib/libc/net/inet_netof.c
index 6ed023b..2f468c3 100644
--- a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
+++ b/libc/upstream-openbsd/lib/libc/net/inet_netof.c
@@ -1,5 +1,4 @@
-/*	$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $	*/
-
+/*	$OpenBSD: inet_netof.c,v 1.6 2005/08/06 20:30:03 espie Exp $ */
 /*
  * Copyright (c) 1983, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -29,36 +28,23 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
-#if 0
-static char sccsid[] = "@(#)inet_ntoa.c	8.1 (Berkeley) 6/4/93";
-#else
-__RCSID("$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $");
-#endif
-#endif /* LIBC_SCCS and not lint */
-
-#include "namespace.h"
-#include <sys/types.h>
-#include <sys/socket.h>
+#include <sys/param.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __weak_alias
-__weak_alias(inet_ntoa,_inet_ntoa)
-#endif
 
 /*
- * Convert network-format internet address
- * to base 256 d.d.d.d representation.
+ * Return the network number from an internet
+ * address; handles class a/b/c network #'s.
  */
-/*const*/ char *
-inet_ntoa(struct in_addr in) {
-	static char ret[18];
+in_addr_t
+inet_netof(struct in_addr in)
+{
+	in_addr_t i = ntohl(in.s_addr);
 
-	strlcpy(ret, "[inet_ntoa error]", sizeof(ret));
-	(void) inet_ntop(AF_INET, &in, ret, (socklen_t)sizeof ret);
-	return ret;
+	if (IN_CLASSA(i))
+		return (((i)&IN_CLASSA_NET) >> IN_CLASSA_NSHIFT);
+	else if (IN_CLASSB(i))
+		return (((i)&IN_CLASSB_NET) >> IN_CLASSB_NSHIFT);
+	else
+		return (((i)&IN_CLASSC_NET) >> IN_CLASSC_NSHIFT);
 }
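inet_lnaof(), inet_makeaddr(), and inet_netof() split and rebuild an address along the historical class A/B/C boundaries, so netof/lnaof invert makeaddr for a well-formed address. A sketch of the round trip, using the same class A constant as the arpa_inet_test.cpp cases added later in this change:

#include <stdio.h>
#include <arpa/inet.h>

int main(void) {
    struct in_addr a = { htonl(0x12345678) };  /* class A: top bit clear */
    in_addr_t net  = inet_netof(a);            /* 0x12 */
    in_addr_t host = inet_lnaof(a);            /* 0x345678 */
    struct in_addr b = inet_makeaddr(net, host);
    printf("net=%#x host=%#x roundtrip=%s\n", (unsigned)net, (unsigned)host,
           a.s_addr == b.s_addr ? "ok" : "mismatch");
    return 0;
}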
diff --git a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c b/libc/upstream-openbsd/lib/libc/net/inet_network.c
similarity index 62%
copy from libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
copy to libc/upstream-openbsd/lib/libc/net/inet_network.c
index 6ed023b..ecf554e 100644
--- a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
+++ b/libc/upstream-openbsd/lib/libc/net/inet_network.c
@@ -1,5 +1,4 @@
-/*	$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $	*/
-
+/*	$OpenBSD: inet_network.c,v 1.11 2013/11/25 17:29:19 deraadt Exp $ */
 /*
  * Copyright (c) 1983, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -29,36 +28,57 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
-#if 0
-static char sccsid[] = "@(#)inet_ntoa.c	8.1 (Berkeley) 6/4/93";
-#else
-__RCSID("$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $");
-#endif
-#endif /* LIBC_SCCS and not lint */
-
-#include "namespace.h"
 #include <sys/types.h>
-#include <sys/socket.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __weak_alias
-__weak_alias(inet_ntoa,_inet_ntoa)
-#endif
+#include <ctype.h>
 
 /*
- * Convert network-format internet address
- * to base 256 d.d.d.d representation.
+ * Internet network address interpretation routine.
+ * The library routines call this routine to interpret
+ * network numbers.
  */
-/*const*/ char *
-inet_ntoa(struct in_addr in) {
-	static char ret[18];
+in_addr_t
+inet_network(const char *cp)
+{
+	in_addr_t val, base, n;
+	u_char c;
+	in_addr_t parts[4], *pp = parts;
+	int i;
 
-	strlcpy(ret, "[inet_ntoa error]", sizeof(ret));
-	(void) inet_ntop(AF_INET, &in, ret, (socklen_t)sizeof ret);
-	return ret;
+again:
+	val = 0; base = 10;
+	if (*cp == '0')
+		base = 8, cp++;
+	if (*cp == 'x' || *cp == 'X')
+		base = 16, cp++;
+	while ((c = *cp)) {
+		if (isdigit(c)) {
+			val = (val * base) + (c - '0');
+			cp++;
+			continue;
+		}
+		if (base == 16 && isxdigit(c)) {
+			val = (val << 4) + (c + 10 - (islower(c) ? 'a' : 'A'));
+			cp++;
+			continue;
+		}
+		break;
+	}
+	if (*cp == '.') {
+		if (pp >= parts + 3)
+			return (INADDR_NONE);
+		*pp++ = val, cp++;
+		goto again;
+	}
+	if (*cp && !isspace(*cp))
+		return (INADDR_NONE);
+	*pp++ = val;
+	n = pp - parts;
+	for (val = 0, i = 0; i < 4; i++) {
+		val <<= 8;
+		if (i < n)
+			val |= parts[i] & 0xff;
+	}
+	return (val);
 }
diff --git a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c b/libc/upstream-openbsd/lib/libc/net/inet_ntoa.c
similarity index 73%
rename from libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
rename to libc/upstream-openbsd/lib/libc/net/inet_ntoa.c
index 6ed023b..ff5d93d 100644
--- a/libc/upstream-netbsd/lib/libc/inet/inet_ntoa.c
+++ b/libc/upstream-openbsd/lib/libc/net/inet_ntoa.c
@@ -1,5 +1,4 @@
-/*	$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $	*/
-
+/*	$OpenBSD: inet_ntoa.c,v 1.6 2005/08/06 20:30:03 espie Exp $ */
 /*
  * Copyright (c) 1983, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -29,36 +28,24 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
-#if 0
-static char sccsid[] = "@(#)inet_ntoa.c	8.1 (Berkeley) 6/4/93";
-#else
-__RCSID("$NetBSD: inet_ntoa.c,v 1.2 2012/03/13 21:13:38 christos Exp $");
-#endif
-#endif /* LIBC_SCCS and not lint */
-
-#include "namespace.h"
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __weak_alias
-__weak_alias(inet_ntoa,_inet_ntoa)
-#endif
-
 /*
  * Convert network-format internet address
  * to base 256 d.d.d.d representation.
  */
-/*const*/ char *
-inet_ntoa(struct in_addr in) {
-	static char ret[18];
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <stdio.h>
 
-	strlcpy(ret, "[inet_ntoa error]", sizeof(ret));
-	(void) inet_ntop(AF_INET, &in, ret, (socklen_t)sizeof ret);
-	return ret;
+char *
+inet_ntoa(struct in_addr in)
+{
+	static char b[18];
+	char *p;
+
+	p = (char *)&in;
+#define	UC(b)	(((int)b)&0xff)
+	(void)snprintf(b, sizeof(b),
+	    "%u.%u.%u.%u", UC(p[0]), UC(p[1]), UC(p[2]), UC(p[3]));
+	return (b);
 }
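Both the removed NetBSD version and this OpenBSD replacement format into a static 18-byte buffer, so a second call overwrites the first result; inet_ntop() below is the reentrant alternative. A sketch of the hazard and the usual workaround, assuming this implementation:

#include <stdio.h>
#include <arpa/inet.h>

int main(void) {
    struct in_addr a = { htonl(0x7f000001) };  /* 127.0.0.1 */
    struct in_addr b = { htonl(0x0a000001) };  /* 10.0.0.1 */

    /* Broken: both calls return the same static buffer, so the two
     * %s arguments print the same string. */
    printf("%s vs %s\n", inet_ntoa(a), inet_ntoa(b));

    /* Workaround: copy each result out before the next call. */
    char s1[18], s2[18];
    snprintf(s1, sizeof(s1), "%s", inet_ntoa(a));
    snprintf(s2, sizeof(s2), "%s", inet_ntoa(b));
    printf("%s vs %s\n", s1, s2);
    return 0;
}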
diff --git a/libc/upstream-openbsd/lib/libc/net/inet_ntop.c b/libc/upstream-openbsd/lib/libc/net/inet_ntop.c
new file mode 100644
index 0000000..359acd8
--- /dev/null
+++ b/libc/upstream-openbsd/lib/libc/net/inet_ntop.c
@@ -0,0 +1,205 @@
+/*	$OpenBSD: inet_ntop.c,v 1.9 2014/02/05 14:20:43 millert Exp $	*/
+
+/* Copyright (c) 1996 by Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
+ * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
+ * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <arpa/nameser.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+/*
+ * WARNING: Don't even consider trying to compile this on a system where
+ * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
+ */
+
+static const char *inet_ntop4(const u_char *src, char *dst, size_t size);
+static const char *inet_ntop6(const u_char *src, char *dst, size_t size);
+
+/* const char *
+ * inet_ntop(af, src, dst, size)
+ *	convert a network format address to presentation format.
+ * return:
+ *	pointer to presentation format address (`dst'), or NULL (see errno).
+ * author:
+ *	Paul Vixie, 1996.
+ */
+const char *
+inet_ntop(int af, const void *src, char *dst, socklen_t size)
+{
+	switch (af) {
+	case AF_INET:
+		return (inet_ntop4(src, dst, (size_t)size));
+	case AF_INET6:
+		return (inet_ntop6(src, dst, (size_t)size));
+	default:
+		errno = EAFNOSUPPORT;
+		return (NULL);
+	}
+	/* NOTREACHED */
+}
+
+/* const char *
+ * inet_ntop4(src, dst, size)
+ *	format an IPv4 address, more or less like inet_ntoa()
+ * return:
+ *	`dst' (as a const)
+ * notes:
+ *	(1) uses no statics
+ *	(2) takes a u_char* not an in_addr as input
+ * author:
+ *	Paul Vixie, 1996.
+ */
+static const char *
+inet_ntop4(const u_char *src, char *dst, size_t size)
+{
+	static const char fmt[] = "%u.%u.%u.%u";
+	char tmp[sizeof "255.255.255.255"];
+	int l;
+
+	l = snprintf(tmp, size, fmt, src[0], src[1], src[2], src[3]);
+	if (l <= 0 || l >= size) {
+		errno = ENOSPC;
+		return (NULL);
+	}
+	strlcpy(dst, tmp, size);
+	return (dst);
+}
+
+/* const char *
+ * inet_ntop6(src, dst, size)
+ *	convert IPv6 binary address into presentation (printable) format
+ * author:
+ *	Paul Vixie, 1996.
+ */
+static const char *
+inet_ntop6(const u_char *src, char *dst, size_t size)
+{
+	/*
+	 * Note that int32_t and int16_t need only be "at least" large enough
+	 * to contain a value of the specified size.  On some systems, like
+	 * Crays, there is no such thing as an integer variable with 16 bits.
+	 * Keep this in mind if you think this function should have been coded
+	 * to use pointer overlays.  All the world's not a VAX.
+	 */
+	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
+	char *tp, *ep;
+	struct { int base, len; } best, cur;
+	u_int words[IN6ADDRSZ / INT16SZ];
+	int i;
+	int advance;
+
+	/*
+	 * Preprocess:
+	 *	Copy the input (bytewise) array into a wordwise array.
+	 *	Find the longest run of 0x00's in src[] for :: shorthanding.
+	 */
+	memset(words, '\0', sizeof words);
+	for (i = 0; i < IN6ADDRSZ; i++)
+		words[i / 2] |= (src[i] << ((1 - (i % 2)) << 3));
+	best.base = -1;
+	cur.base = -1;
+	for (i = 0; i < (IN6ADDRSZ / INT16SZ); i++) {
+		if (words[i] == 0) {
+			if (cur.base == -1)
+				cur.base = i, cur.len = 1;
+			else
+				cur.len++;
+		} else {
+			if (cur.base != -1) {
+				if (best.base == -1 || cur.len > best.len)
+					best = cur;
+				cur.base = -1;
+			}
+		}
+	}
+	if (cur.base != -1) {
+		if (best.base == -1 || cur.len > best.len)
+			best = cur;
+	}
+	if (best.base != -1 && best.len < 2)
+		best.base = -1;
+
+	/*
+	 * Format the result.
+	 */
+	tp = tmp;
+	ep = tmp + sizeof(tmp);
+	for (i = 0; i < (IN6ADDRSZ / INT16SZ) && tp < ep; i++) {
+		/* Are we inside the best run of 0x00's? */
+		if (best.base != -1 && i >= best.base &&
+		    i < (best.base + best.len)) {
+			if (i == best.base) {
+				if (tp + 1 >= ep) {
+					errno = ENOSPC;
+					return (NULL);
+				}
+				*tp++ = ':';
+			}
+			continue;
+		}
+		/* Are we following an initial run of 0x00s or any real hex? */
+		if (i != 0) {
+			if (tp + 1 >= ep) {
+				errno = ENOSPC;
+				return (NULL);
+			}
+			*tp++ = ':';
+		}
+		/* Is this address an encapsulated IPv4? */
+		if (i == 6 && best.base == 0 &&
+		    (best.len == 6 || (best.len == 5 && words[5] == 0xffff))) {
+			if (!inet_ntop4(src+12, tp, (size_t)(ep - tp)))
+				return (NULL);
+			tp += strlen(tp);
+			break;
+		}
+		advance = snprintf(tp, ep - tp, "%x", words[i]);
+		if (advance <= 0 || advance >= ep - tp) {
+			errno = ENOSPC;
+			return (NULL);
+		}
+		tp += advance;
+	}
+	/* Was it a trailing run of 0x00's? */
+	if (best.base != -1 && (best.base + best.len) == (IN6ADDRSZ / INT16SZ)) {
+		if (tp + 1 >= ep) {
+			errno = ENOSPC;
+			return (NULL);
+		}
+		*tp++ = ':';
+	}
+	if (tp + 1 >= ep) {
+		errno = ENOSPC;
+		return (NULL);
+	}
+	*tp++ = '\0';
+
+	/*
+	 * Check for overflow, copy, and we're done.
+	 */
+	if ((size_t)(tp - tmp) > size) {
+		errno = ENOSPC;
+		return (NULL);
+	}
+	strlcpy(dst, tmp, size);
+	return (dst);
+}
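inet_ntop() takes a caller-supplied buffer and fails with ENOSPC instead of truncating, and inet_ntop6() above compresses the longest run of zero groups to "::". A sketch of typical use, sized with INET6_ADDRSTRLEN from <netinet/in.h>:

#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int main(void) {
    struct in6_addr a6;
    if (inet_pton(AF_INET6, "2001:db8:0:0:0:0:0:1", &a6) != 1)
        return 1;

    char buf[INET6_ADDRSTRLEN];
    if (inet_ntop(AF_INET6, &a6, buf, sizeof(buf)) == NULL) {
        perror("inet_ntop");
        return 1;
    }
    printf("%s\n", buf);  /* 2001:db8::1 */
    return 0;
}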
diff --git a/libc/upstream-openbsd/lib/libc/net/inet_pton.c b/libc/upstream-openbsd/lib/libc/net/inet_pton.c
new file mode 100644
index 0000000..7e521c3
--- /dev/null
+++ b/libc/upstream-openbsd/lib/libc/net/inet_pton.c
@@ -0,0 +1,213 @@
+/*	$OpenBSD: inet_pton.c,v 1.8 2010/05/06 15:47:14 claudio Exp $	*/
+
+/* Copyright (c) 1996 by Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
+ * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
+ * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <arpa/nameser.h>
+#include <string.h>
+#include <errno.h>
+
+/*
+ * WARNING: Don't even consider trying to compile this on a system where
+ * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
+ */
+
+static int	inet_pton4(const char *src, u_char *dst);
+static int	inet_pton6(const char *src, u_char *dst);
+
+/* int
+ * inet_pton(af, src, dst)
+ *	convert from presentation format (which usually means ASCII printable)
+ *	to network format (which is usually some kind of binary format).
+ * return:
+ *	1 if the address was valid for the specified address family
+ *	0 if the address wasn't valid (`dst' is untouched in this case)
+ *	-1 if some other error occurred (`dst' is untouched in this case, too)
+ * author:
+ *	Paul Vixie, 1996.
+ */
+int
+inet_pton(int af, const char *src, void *dst)
+{
+	switch (af) {
+	case AF_INET:
+		return (inet_pton4(src, dst));
+	case AF_INET6:
+		return (inet_pton6(src, dst));
+	default:
+		errno = EAFNOSUPPORT;
+		return (-1);
+	}
+	/* NOTREACHED */
+}
+
+/* int
+ * inet_pton4(src, dst)
+ *	like inet_aton() but without all the hexadecimal and shorthand.
+ * return:
+ *	1 if `src' is a valid dotted quad, else 0.
+ * notice:
+ *	does not touch `dst' unless it's returning 1.
+ * author:
+ *	Paul Vixie, 1996.
+ */
+static int
+inet_pton4(const char *src, u_char *dst)
+{
+	static const char digits[] = "0123456789";
+	int saw_digit, octets, ch;
+	u_char tmp[INADDRSZ], *tp;
+
+	saw_digit = 0;
+	octets = 0;
+	*(tp = tmp) = 0;
+	while ((ch = *src++) != '\0') {
+		const char *pch;
+
+		if ((pch = strchr(digits, ch)) != NULL) {
+			u_int new = *tp * 10 + (pch - digits);
+
+			if (new > 255)
+				return (0);
+			if (! saw_digit) {
+				if (++octets > 4)
+					return (0);
+				saw_digit = 1;
+			}
+			*tp = new;
+		} else if (ch == '.' && saw_digit) {
+			if (octets == 4)
+				return (0);
+			*++tp = 0;
+			saw_digit = 0;
+		} else
+			return (0);
+	}
+	if (octets < 4)
+		return (0);
+
+	memcpy(dst, tmp, INADDRSZ);
+	return (1);
+}
+
+/* int
+ * inet_pton6(src, dst)
+ *	convert presentation level address to network order binary form.
+ * return:
+ *	1 if `src' is a valid [RFC1884 2.2] address, else 0.
+ * notice:
+ *	does not touch `dst' unless it's returning 1.
+ * credit:
+ *	inspired by Mark Andrews.
+ * author:
+ *	Paul Vixie, 1996.
+ */
+static int
+inet_pton6(const char *src, u_char *dst)
+{
+	static const char xdigits_l[] = "0123456789abcdef",
+			  xdigits_u[] = "0123456789ABCDEF";
+	u_char tmp[IN6ADDRSZ], *tp, *endp, *colonp;
+	const char *xdigits, *curtok;
+	int ch, saw_xdigit, count_xdigit;
+	u_int val;
+
+	memset((tp = tmp), '\0', IN6ADDRSZ);
+	endp = tp + IN6ADDRSZ;
+	colonp = NULL;
+	/* Leading :: requires some special handling. */
+	if (*src == ':')
+		if (*++src != ':')
+			return (0);
+	curtok = src;
+	saw_xdigit = count_xdigit = 0;
+	val = 0;
+	while ((ch = *src++) != '\0') {
+		const char *pch;
+
+		if ((pch = strchr((xdigits = xdigits_l), ch)) == NULL)
+			pch = strchr((xdigits = xdigits_u), ch);
+		if (pch != NULL) {
+			if (count_xdigit >= 4)
+				return (0);
+			val <<= 4;
+			val |= (pch - xdigits);
+			if (val > 0xffff)
+				return (0);
+			saw_xdigit = 1;
+			count_xdigit++;
+			continue;
+		}
+		if (ch == ':') {
+			curtok = src;
+			if (!saw_xdigit) {
+				if (colonp)
+					return (0);
+				colonp = tp;
+				continue;
+			} else if (*src == '\0') {
+				return (0);
+			}
+			if (tp + INT16SZ > endp)
+				return (0);
+			*tp++ = (u_char) (val >> 8) & 0xff;
+			*tp++ = (u_char) val & 0xff;
+			saw_xdigit = 0;
+			count_xdigit = 0;
+			val = 0;
+			continue;
+		}
+		if (ch == '.' && ((tp + INADDRSZ) <= endp) &&
+		    inet_pton4(curtok, tp) > 0) {
+			tp += INADDRSZ;
+			saw_xdigit = 0;
+			count_xdigit = 0;
+			break;	/* '\0' was seen by inet_pton4(). */
+		}
+		return (0);
+	}
+	if (saw_xdigit) {
+		if (tp + INT16SZ > endp)
+			return (0);
+		*tp++ = (u_char) (val >> 8) & 0xff;
+		*tp++ = (u_char) val & 0xff;
+	}
+	if (colonp != NULL) {
+		/*
+		 * Since some memmove()'s erroneously fail to handle
+		 * overlapping regions, we'll do the shift by hand.
+		 */
+		const int n = tp - colonp;
+		int i;
+
+		if (tp == endp)
+			return (0);
+		for (i = 1; i <= n; i++) {
+			endp[- i] = colonp[n - i];
+			colonp[n - i] = 0;
+		}
+		tp = endp;
+	}
+	if (tp != endp)
+		return (0);
+	memcpy(dst, tmp, IN6ADDRSZ);
+	return (1);
+}
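As the header comment above states, inet_pton() has a three-way result: 1 for a valid address, 0 for a malformed one, and -1 with errno set to EAFNOSUPPORT for an unknown family; dst is written only on success. A sketch of handling all three:

#include <stdio.h>
#include <errno.h>
#include <arpa/inet.h>

int main(void) {
    struct in_addr a;
    printf("valid:   %d\n", inet_pton(AF_INET, "127.0.0.1", &a));    /* 1 */
    printf("invalid: %d\n", inet_pton(AF_INET, "127.0.0.256", &a));  /* 0 */
    errno = 0;
    /* 12345 is a deliberately bogus address family. */
    printf("bad af:  %d (EAFNOSUPPORT? %d)\n",
           inet_pton(12345, "127.0.0.1", &a), errno == EAFNOSUPPORT);
    return 0;
}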
diff --git a/libc/upstream-openbsd/lib/libc/net/ntohl.c b/libc/upstream-openbsd/lib/libc/net/ntohl.c
new file mode 100644
index 0000000..36414b7
--- /dev/null
+++ b/libc/upstream-openbsd/lib/libc/net/ntohl.c
@@ -0,0 +1,21 @@
+/*	$OpenBSD: ntohl.c,v 1.6 2005/08/06 20:30:03 espie Exp $ */
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <sys/types.h>
+#include <machine/endian.h>
+
+#undef ntohl
+
+u_int32_t
+ntohl(u_int32_t x)
+{
+#if BYTE_ORDER == LITTLE_ENDIAN
+	u_char *s = (u_char *)&x;
+	return (u_int32_t)(s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3]);
+#else
+	return x;
+#endif
+}
diff --git a/libc/upstream-openbsd/lib/libc/net/ntohs.c b/libc/upstream-openbsd/lib/libc/net/ntohs.c
new file mode 100644
index 0000000..8f345e8
--- /dev/null
+++ b/libc/upstream-openbsd/lib/libc/net/ntohs.c
@@ -0,0 +1,21 @@
+/*	$OpenBSD: ntohs.c,v 1.8 2005/08/06 20:30:03 espie Exp $ */
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <sys/types.h>
+#include <machine/endian.h>
+
+#undef ntohs
+
+u_int16_t
+ntohs(u_int16_t x)
+{
+#if BYTE_ORDER == LITTLE_ENDIAN
+	u_char *s = (u_char *) &x;
+	return (u_int16_t)(s[0] << 8 | s[1]);
+#else
+	return x;
+#endif
+}
diff --git a/libc/zoneinfo/tzdata b/libc/zoneinfo/tzdata
index 2178298..d78c819 100644
--- a/libc/zoneinfo/tzdata
+++ b/libc/zoneinfo/tzdata
Binary files differ
diff --git a/linker/linker.cpp b/linker/linker.cpp
index 4be68d1..df53a84 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -60,7 +60,6 @@
  *
  * open issues / todo:
  *
- * - are we doing everything we should for ARM_COPY relocations?
  * - cleaner error reporting
  * - after linking, set as much stuff as possible to READONLY
  *   and NOEXEC
@@ -976,52 +975,17 @@
         break;
 
     case R_AARCH64_COPY:
-        if ((si->flags & FLAG_EXE) == 0) {
-            /*
-              * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0044d/IHI0044D_aaelf.pdf
-              *
-              * Section 4.7.1.10 "Dynamic relocations"
-              * R_AARCH64_COPY may only appear in executable objects where e_type is
-              * set to ET_EXEC.
-              *
-              * FLAG_EXE is set for both ET_DYN and ET_EXEC executables.
-              * We should explicitly disallow ET_DYN executables from having
-              * R_AARCH64_COPY relocations.
-              */
-            DL_ERR("%s R_AARCH64_COPY relocations only supported for ET_EXEC", si->name);
-            return -1;
-        }
-        count_relocation(kRelocCopy);
-        MARK(rela->r_offset);
-        TRACE_TYPE(RELO, "RELO COPY %16llx <- %lld @ %16llx %s\n",
-                   reloc,
-                   s->st_size,
-                   (sym_addr + rela->r_addend),
-                   sym_name);
-        if (reloc == (sym_addr + rela->r_addend)) {
-            ElfW(Sym)* src = soinfo_do_lookup(NULL, sym_name, &lsi, needed);
-
-            if (src == NULL) {
-                DL_ERR("%s R_AARCH64_COPY relocation source cannot be resolved", si->name);
-                return -1;
-            }
-            if (lsi->has_DT_SYMBOLIC) {
-                DL_ERR("%s invalid R_AARCH64_COPY relocation against DT_SYMBOLIC shared "
-                       "library %s (built with -Bsymbolic?)", si->name, lsi->name);
-                return -1;
-            }
-            if (s->st_size < src->st_size) {
-                DL_ERR("%s R_AARCH64_COPY relocation size mismatch (%lld < %lld)",
-                       si->name, s->st_size, src->st_size);
-                return -1;
-            }
-            memcpy(reinterpret_cast<void*>(reloc),
-                   reinterpret_cast<void*>(src->st_value + lsi->load_bias), src->st_size);
-        } else {
-            DL_ERR("%s R_AARCH64_COPY relocation target cannot be resolved", si->name);
-            return -1;
-        }
-        break;
+        /*
+         * ET_EXEC is not supported so this should not happen.
+         *
+         * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0044d/IHI0044D_aaelf.pdf
+         *
+         * Section 4.7.1.10 "Dynamic relocations"
+         * R_AARCH64_COPY may only appear in executable objects where e_type is
+         * set to ET_EXEC.
+         */
+        DL_ERR("%s R_AARCH64_COPY relocations are not supported", si->name);
+        return -1;
     case R_AARCH64_TLS_TPREL64:
         TRACE_TYPE(RELO, "RELO TLS_TPREL64 *** %16llx <- %16llx - %16llx\n",
                    reloc, (sym_addr + rela->r_addend), rela->r_offset);
@@ -1199,48 +1163,17 @@
             *reinterpret_cast<ElfW(Addr)*>(reloc) += sym_addr - rel->r_offset;
             break;
         case R_ARM_COPY:
-            if ((si->flags & FLAG_EXE) == 0) {
-                /*
-                 * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0044d/IHI0044D_aaelf.pdf
-                 *
-                 * Section 4.7.1.10 "Dynamic relocations"
-                 * R_ARM_COPY may only appear in executable objects where e_type is
-                 * set to ET_EXEC.
-                 *
-                 * TODO: FLAG_EXE is set for both ET_DYN and ET_EXEC executables.
-                 * We should explicitly disallow ET_DYN executables from having
-                 * R_ARM_COPY relocations.
-                 */
-                DL_ERR("%s R_ARM_COPY relocations only supported for ET_EXEC", si->name);
-                return -1;
-            }
-            count_relocation(kRelocCopy);
-            MARK(rel->r_offset);
-            TRACE_TYPE(RELO, "RELO %08x <- %d @ %08x %s", reloc, s->st_size, sym_addr, sym_name);
-            if (reloc == sym_addr) {
-                ElfW(Sym)* src = soinfo_do_lookup(NULL, sym_name, &lsi, needed);
-
-                if (src == NULL) {
-                    DL_ERR("%s R_ARM_COPY relocation source cannot be resolved", si->name);
-                    return -1;
-                }
-                if (lsi->has_DT_SYMBOLIC) {
-                    DL_ERR("%s invalid R_ARM_COPY relocation against DT_SYMBOLIC shared "
-                           "library %s (built with -Bsymbolic?)", si->name, lsi->name);
-                    return -1;
-                }
-                if (s->st_size < src->st_size) {
-                    DL_ERR("%s R_ARM_COPY relocation size mismatch (%d < %d)",
-                           si->name, s->st_size, src->st_size);
-                    return -1;
-                }
-                memcpy(reinterpret_cast<void*>(reloc),
-                       reinterpret_cast<void*>(src->st_value + lsi->load_bias), src->st_size);
-            } else {
-                DL_ERR("%s R_ARM_COPY relocation target cannot be resolved", si->name);
-                return -1;
-            }
-            break;
+            /*
+             * ET_EXEC is not supported so this should not happen.
+             *
+             * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0044d/IHI0044D_aaelf.pdf
+             *
+             * Section 4.7.1.10 "Dynamic relocations"
+             * R_ARM_COPY may only appear in executable objects where e_type is
+             * set to ET_EXEC.
+             */
+            DL_ERR("%s R_ARM_COPY relocations are not supported", si->name);
+            return -1;
 #elif defined(__i386__)
         case R_386_JMP_SLOT:
             count_relocation(kRelocAbsolute);
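For context on the two hunks above: a copy relocation is emitted when a non-PIC executable directly references a data object defined in a shared library; the static linker reserves space in the executable and the dynamic linker copies the object's contents at load time. Since bionic only loads PIE executables now, ET_EXEC (and with it R_AARCH64_COPY/R_ARM_COPY) can be rejected outright. A hypothetical two-file sketch of the pattern that would have needed one (names and flags are illustrative, not from this tree):

/* libfoo.c, built into a shared library, e.g. cc -shared -fPIC:
 *     int foo_counter = 42;
 */

/* main.c, the executable: */
#include <stdio.h>

extern int foo_counter;  /* defined in libfoo.so */

int main(void) {
    /* Linked non-PIE (e.g. cc -no-pie main.c -L. -lfoo), this access is
     * satisfied by an R_*_COPY relocation into space reserved here.
     * Built as PIE, the only form bionic's linker accepts, the access
     * goes through the GOT and no copy relocation is needed. */
    printf("%d\n", foo_counter);
    return 0;
}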
diff --git a/tests/Android.mk b/tests/Android.mk
index 10e288c..4ad07ba 100644
--- a/tests/Android.mk
+++ b/tests/Android.mk
@@ -44,6 +44,7 @@
     -std=gnu++11 \
 
 libBionicStandardTests_src_files := \
+    arpa_inet_test.cpp \
     buffer_tests.cpp \
     ctype_test.cpp \
     dirent_test.cpp \
@@ -59,6 +60,7 @@
     locale_test.cpp \
     malloc_test.cpp \
     math_test.cpp \
+    mntent_test.cpp \
     netdb_test.cpp \
     pthread_test.cpp \
     regex_test.cpp \
diff --git a/tests/arpa_inet_test.cpp b/tests/arpa_inet_test.cpp
new file mode 100644
index 0000000..cee9f36
--- /dev/null
+++ b/tests/arpa_inet_test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <arpa/inet.h>
+
+TEST(arpa_inet, inet_addr) {
+  ASSERT_EQ((htonl)(0x7f000001), inet_addr("127.0.0.1"));
+}
+
+TEST(arpa_inet, inet_aton) {
+  in_addr a;
+  ASSERT_EQ(1, inet_aton("127.0.0.1", &a));
+  ASSERT_EQ((htonl)(0x7f000001), a.s_addr);
+}
+
+TEST(arpa_inet, inet_lnaof) {
+  in_addr a = { htonl(0x12345678) };
+  ASSERT_EQ(0x00345678U, inet_lnaof(a));
+}
+
+TEST(arpa_inet, inet_makeaddr) {
+  in_addr a = inet_makeaddr(0x12U, 0x345678);
+  ASSERT_EQ((htonl)(0x12345678), a.s_addr);
+}
+
+TEST(arpa_inet, inet_netof) {
+  in_addr a = { htonl(0x12345678) };
+  ASSERT_EQ(0x12U, inet_netof(a));
+}
+
+TEST(arpa_inet, inet_network) {
+  ASSERT_EQ(0x7f000001U, inet_network("127.0.0.1"));
+}
+
+TEST(arpa_inet, inet_ntoa) {
+  in_addr a = { (htonl)(0x7f000001) };
+  ASSERT_STREQ("127.0.0.1", inet_ntoa(a));
+}
+
+TEST(arpa_inet, inet_pton__inet_ntop) {
+  sockaddr_storage ss;
+  ASSERT_EQ(1, inet_pton(AF_INET, "127.0.0.1", &ss));
+
+  char s[INET_ADDRSTRLEN];
+  ASSERT_STREQ("127.0.0.1", inet_ntop(AF_INET, &ss, s, INET_ADDRSTRLEN));
+}
diff --git a/tests/dlfcn_test.cpp b/tests/dlfcn_test.cpp
index 1fdecdb..b31d7e4 100644
--- a/tests/dlfcn_test.cpp
+++ b/tests/dlfcn_test.cpp
@@ -53,7 +53,7 @@
 TEST(dlfcn, dlopen_failure) {
   void* self = dlopen("/does/not/exist", RTLD_NOW);
   ASSERT_TRUE(self == NULL);
-#if __BIONIC__
+#if defined(__BIONIC__)
   ASSERT_STREQ("dlopen failed: library \"/does/not/exist\" not found", dlerror());
 #else
   ASSERT_STREQ("/does/not/exist: cannot open shared object file: No such file or directory", dlerror());
@@ -92,14 +92,14 @@
   // NULL handle.
   sym = dlsym(NULL, "test");
   ASSERT_TRUE(sym == NULL);
-#if __BIONIC__
+#if defined(__BIONIC__)
   ASSERT_SUBSTR("dlsym library handle is null", dlerror());
 #else
   ASSERT_SUBSTR("undefined symbol: test", dlerror()); // glibc isn't specific about the failure.
 #endif
 
   // NULL symbol name.
-#if __BIONIC__
+#if defined(__BIONIC__)
   // glibc marks this parameter non-null and SEGVs if you cheat.
   sym = dlsym(self, NULL);
   ASSERT_TRUE(sym == NULL);
@@ -206,7 +206,7 @@
   dlerror(); // Clear any pending errors.
   void* handle;
 
-#ifdef __GLIBC__
+#if defined(__GLIBC__)
   // glibc was smart enough not to define RTLD_NOW as 0, so it can detect missing flags.
   handle = dlopen(NULL, 0);
   ASSERT_TRUE(handle == NULL);
diff --git a/tests/fcntl_test.cpp b/tests/fcntl_test.cpp
index 725ac4a..fb6b07e 100644
--- a/tests/fcntl_test.cpp
+++ b/tests/fcntl_test.cpp
@@ -73,7 +73,7 @@
 TEST(fcntl, fallocate_EINVAL) {
   TemporaryFile tf;
 
-#if !defined(__GLIBC__)
+#if defined(__BIONIC__)
   errno = 0;
   ASSERT_EQ(-1, fallocate(tf.fd, 0, 0, -1));
   ASSERT_EQ(EINVAL, errno);
@@ -98,7 +98,7 @@
   ASSERT_EQ(0, fstat(tf.fd, &sb));
   ASSERT_EQ(0, sb.st_size);
 
-#if !defined(__GLIBC__)
+#if defined(__BIONIC__)
   ASSERT_EQ(0, fallocate(tf.fd, 0, 0, 1));
   ASSERT_EQ(0, fstat(tf.fd, &sb));
   ASSERT_EQ(1, sb.st_size);
diff --git a/tests/fortify_test.cpp b/tests/fortify_test.cpp
index e0d055e..873c71e 100644
--- a/tests/fortify_test.cpp
+++ b/tests/fortify_test.cpp
@@ -668,7 +668,7 @@
 }
 
 TEST(DEATHTEST, FD_ISSET_fortified) {
-#ifdef __BIONIC__ // glibc catches this at compile-time.
+#if defined(__BIONIC__) // glibc catches this at compile-time.
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
   fd_set set;
   memset(&set, 0, sizeof(set));
diff --git a/tests/locale_test.cpp b/tests/locale_test.cpp
index 347b5b1..7d063f9 100644
--- a/tests/locale_test.cpp
+++ b/tests/locale_test.cpp
@@ -58,7 +58,7 @@
   EXPECT_EQ(NULL, setlocale(13, NULL));
   EXPECT_EQ(EINVAL, errno);
 
-#if __BIONIC__
+#if defined(__BIONIC__)
   // The "" locale is implementation-defined. For bionic, it's the C locale.
   // glibc will give us something like "en_US.UTF-8", depending on the user's configuration.
   EXPECT_STREQ("C", setlocale(LC_ALL, ""));
diff --git a/tests/mntent_test.cpp b/tests/mntent_test.cpp
new file mode 100644
index 0000000..637cb52
--- /dev/null
+++ b/tests/mntent_test.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <mntent.h>
+
+TEST(mntent, mntent_smoke) {
+  FILE* fp = setmntent("/no/mnt/tab/on/android", "r");
+  ASSERT_TRUE(fp == NULL);
+
+#if defined(__BIONIC__) // glibc doesn't let you call getmntent/getmntent_r with a NULL FILE*.
+  ASSERT_TRUE(getmntent(fp) == NULL);
+
+  struct mntent mbuf;
+  char cbuf[32];
+  ASSERT_TRUE(getmntent_r(fp, &mbuf, cbuf, sizeof(cbuf)) == NULL);
+#endif
+
+  ASSERT_EQ(1, endmntent(fp));
+}
diff --git a/tests/regex_test.cpp b/tests/regex_test.cpp
index 659d1db..d026221 100644
--- a/tests/regex_test.cpp
+++ b/tests/regex_test.cpp
@@ -28,7 +28,7 @@
 
   char buf[80];
   regerror(REG_NOMATCH, &re, buf, sizeof(buf));
-#if __BIONIC__
+#if defined(__BIONIC__)
   ASSERT_STREQ("regexec() failed to match", buf);
 #else
   ASSERT_STREQ("No match", buf);
diff --git a/tests/sched_test.cpp b/tests/sched_test.cpp
index 7c19962..caf4c65 100644
--- a/tests/sched_test.cpp
+++ b/tests/sched_test.cpp
@@ -21,7 +21,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 
-#ifdef __BIONIC__
+#if defined(__BIONIC__)
 static int child_fn(void* i_ptr) {
   *reinterpret_cast<int*>(i_ptr) = 42;
   return 123;
diff --git a/tests/stdio_test.cpp b/tests/stdio_test.cpp
index 73e9dcd..655ad3f 100644
--- a/tests/stdio_test.cpp
+++ b/tests/stdio_test.cpp
@@ -24,6 +24,9 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <wchar.h>
+#include <locale.h>
+
+#include "TemporaryFile.h"
 
 TEST(stdio, tmpfile_fileno_fprintf_rewind_fgets) {
   FILE* fp = tmpfile();
@@ -102,14 +105,12 @@
   ASSERT_EQ(getdelim(&buffer, NULL, ' ', fp), -1);
   ASSERT_EQ(EINVAL, errno);
 
-  // The stream can't be closed.
-  fclose(fp);
+  // The stream can't be used once the underlying fd is closed.
+  ASSERT_EQ(0, close(fileno(fp)));
   errno = 0;
   ASSERT_EQ(getdelim(&buffer, &buffer_length, ' ', fp), -1);
-  // glibc sometimes doesn't set errno in this particular case.
-#if defined(__BIONIC__)
   ASSERT_EQ(EBADF, errno);
-#endif // __BIONIC__
+  fclose(fp);
 }
 
 TEST(stdio, getline) {
@@ -171,14 +172,12 @@
   ASSERT_EQ(getline(&buffer, NULL, fp), -1);
   ASSERT_EQ(EINVAL, errno);
 
-  // The stream can't be closed.
-  fclose(fp);
+  // The stream can't be used once the underlying fd is closed.
+  ASSERT_EQ(0, close(fileno(fp)));
   errno = 0;
   ASSERT_EQ(getline(&buffer, &buffer_length, fp), -1);
-  // glibc sometimes doesn't set errno in this particular case.
-#if defined(__BIONIC__)
   ASSERT_EQ(EBADF, errno);
-#endif // __BIONIC__
+  fclose(fp);
 }
 
 TEST(stdio, printf_ssize_t) {
@@ -220,7 +219,7 @@
 }
 
 TEST(stdio, snprintf_n) {
-#if !defined(__GLIBC__)
+#if defined(__BIONIC__)
   // http://b/14492135
   char buf[32];
   int i = 1234;
@@ -461,7 +460,7 @@
 
   errno = 0;
   EXPECT_EQ(EOF, fwprintf(fp, L"hello"));
-#if !defined(__GLIBC__)
+#if defined(__BIONIC__)
   EXPECT_EQ(EBADF, errno);
 #endif
 
@@ -479,7 +478,131 @@
 
   errno = 0;
   EXPECT_EQ(WEOF, fputwc(L'x', fp));
-#if !defined(__GLIBC__)
+#if defined(__BIONIC__)
   EXPECT_EQ(EBADF, errno);
 #endif
 }
+
+// Tests that an fpos_t is only consistent and correct when obtained from the
+// f*pos functions (i.e. the position never lands inside a multibyte character).
+TEST(stdio, consistent_fpos_t) {
+  ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
+  uselocale(LC_GLOBAL_LOCALE);
+
+  FILE* fp = tmpfile();
+  ASSERT_TRUE(fp != NULL);
+
+  wchar_t mb_one_bytes = L'h';
+  wchar_t mb_two_bytes = 0x00a2;
+  wchar_t mb_three_bytes = 0x20ac;
+  wchar_t mb_four_bytes = 0x24b62;
+
+  // Write to file.
+  ASSERT_EQ(mb_one_bytes, static_cast<wchar_t>(fputwc(mb_one_bytes, fp)));
+  ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fputwc(mb_two_bytes, fp)));
+  ASSERT_EQ(mb_three_bytes, static_cast<wchar_t>(fputwc(mb_three_bytes, fp)));
+  ASSERT_EQ(mb_four_bytes, static_cast<wchar_t>(fputwc(mb_four_bytes, fp)));
+
+  rewind(fp);
+
+  // Record each character position.
+  fpos_t pos1;
+  fpos_t pos2;
+  fpos_t pos3;
+  fpos_t pos4;
+  fpos_t pos5;
+  EXPECT_EQ(0, fgetpos(fp, &pos1));
+  ASSERT_EQ(mb_one_bytes, static_cast<wchar_t>(fgetwc(fp)));
+  EXPECT_EQ(0, fgetpos(fp, &pos2));
+  ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fgetwc(fp)));
+  EXPECT_EQ(0, fgetpos(fp, &pos3));
+  ASSERT_EQ(mb_three_bytes, static_cast<wchar_t>(fgetwc(fp)));
+  EXPECT_EQ(0, fgetpos(fp, &pos4));
+  ASSERT_EQ(mb_four_bytes, static_cast<wchar_t>(fgetwc(fp)));
+  EXPECT_EQ(0, fgetpos(fp, &pos5));
+
+#if defined(__BIONIC__)
+  // Bionic's fpos_t is just an alias for off_t. This is inherited from OpenBSD
+  // upstream. Glibc differs by storing the mbstate_t inside its fpos_t. In
+  // Bionic (and upstream OpenBSD) the mbstate_t is stored inside the FILE
+  // structure.
+  ASSERT_EQ(0, static_cast<off_t>(pos1));
+  ASSERT_EQ(1, static_cast<off_t>(pos2));
+  ASSERT_EQ(3, static_cast<off_t>(pos3));
+  ASSERT_EQ(6, static_cast<off_t>(pos4));
+  ASSERT_EQ(10, static_cast<off_t>(pos5));
+#endif
+
+  // Exercise back and forth movements of the position.
+  ASSERT_EQ(0, fsetpos(fp, &pos2));
+  ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fgetwc(fp)));
+  ASSERT_EQ(0, fsetpos(fp, &pos1));
+  ASSERT_EQ(mb_one_bytes, static_cast<wchar_t>(fgetwc(fp)));
+  ASSERT_EQ(0, fsetpos(fp, &pos4));
+  ASSERT_EQ(mb_four_bytes, static_cast<wchar_t>(fgetwc(fp)));
+  ASSERT_EQ(0, fsetpos(fp, &pos3));
+  ASSERT_EQ(mb_three_bytes, static_cast<wchar_t>(fgetwc(fp)));
+  ASSERT_EQ(0, fsetpos(fp, &pos5));
+  ASSERT_EQ(WEOF, fgetwc(fp));
+
+  fclose(fp);
+}
+
+// Exercise the interaction between fpos and seek.
+TEST(stdio, fpos_t_and_seek) {
+  ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
+  uselocale(LC_GLOBAL_LOCALE);
+
+  // For glibc we need to close and re-open the file in order for fseek to work
+  // after using setlocale(LC_CTYPE, "C.UTF-8") and fputwc.
+  // TODO: find out if this is expected or a bug in glibc.
+  TemporaryFile tf;
+  FILE* fp = fdopen(tf.fd, "w+");
+  ASSERT_TRUE(fp != NULL);
+
+  wchar_t mb_two_bytes = 0x00a2;
+  wchar_t mb_three_bytes = 0x20ac;
+  wchar_t mb_four_bytes = 0x24b62;
+
+  // Write to file.
+  ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fputwc(mb_two_bytes, fp)));
+  ASSERT_EQ(mb_three_bytes, static_cast<wchar_t>(fputwc(mb_three_bytes, fp)));
+  ASSERT_EQ(mb_four_bytes, static_cast<wchar_t>(fputwc(mb_four_bytes, fp)));
+
+  fflush(fp);
+  fclose(fp);
+
+  fp = fopen(tf.filename, "r");
+  ASSERT_TRUE(fp != NULL);
+
+  // Store a valid position.
+  fpos_t mb_two_bytes_pos;
+  ASSERT_EQ(0, fgetpos(fp, &mb_two_bytes_pos));
+
+  // Move inside mb_four_bytes with fseek.
+  long offset_inside_mb = 6;
+  ASSERT_EQ(0, fseek(fp, offset_inside_mb, SEEK_SET));
+
+  // Store the "inside multi byte" position.
+  fpos_t pos_inside_mb;
+  ASSERT_EQ(0, fgetpos(fp, &pos_inside_mb));
+#if defined(__BIONIC__)
+  ASSERT_EQ(offset_inside_mb, static_cast<off_t>(pos_inside_mb));
+#endif
+
+  // Reading from within a multibyte character should produce an error.
+  ASSERT_EQ(WEOF, fgetwc(fp));
+  ASSERT_EQ(EILSEQ, errno);
+
+  // Reverting to a valid position should work.
+  ASSERT_EQ(0, fsetpos(fp, &mb_two_bytes_pos));
+  ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fgetwc(fp)));
+
+  // Moving within a multibyte character with fsetpos should work, but
+  // reading should produce an error.
+  ASSERT_EQ(0, fsetpos(fp, &pos_inside_mb));
+  ASSERT_EQ(WEOF, fgetwc(fp));
+  ASSERT_EQ(EILSEQ, errno);
+
+  fclose(fp);
+}
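The #if defined(__BIONIC__) blocks above work because bionic's fpos_t is a plain off_t byte offset (inherited from OpenBSD), whereas glibc's fpos_t is an opaque struct that also carries an mbstate_t, so casting it to an integer is not portable. A sketch of the portable pattern, which treats fpos_t as opaque and only hands it back to fsetpos():

#include <stdio.h>

int main(void) {
    FILE* fp = tmpfile();
    if (fp == NULL) return 1;
    fputs("hello", fp);
    rewind(fp);

    fpos_t pos;
    if (fgetpos(fp, &pos) != 0) return 1;  /* record; never inspect the value */
    printf("%c", fgetc(fp));               /* 'h' */
    if (fsetpos(fp, &pos) != 0) return 1;  /* restore */
    printf("%c\n", fgetc(fp));             /* 'h' again */
    fclose(fp);
    return 0;
}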
diff --git a/tests/stdlib_test.cpp b/tests/stdlib_test.cpp
index fc0f0e1..978a60f 100644
--- a/tests/stdlib_test.cpp
+++ b/tests/stdlib_test.cpp
@@ -42,7 +42,7 @@
   EXPECT_EQ(902267124, lrand48());
   EXPECT_EQ(132366131, lrand48());
 
-#if __BIONIC__
+#if defined(__BIONIC__)
   // On bionic, random(3) is equivalent to lrand48...
   srandom(0x01020304);
   EXPECT_EQ(1409163720, random());
diff --git a/tests/stubs_test.cpp b/tests/stubs_test.cpp
index 7d70aa6..9b0c231 100644
--- a/tests/stubs_test.cpp
+++ b/tests/stubs_test.cpp
@@ -41,6 +41,10 @@
   EXPECT_STREQ(username, pwd->pw_name);
   EXPECT_EQ(uid, pwd->pw_uid);
   EXPECT_EQ(uid, pwd->pw_gid);
+  ASSERT_EQ(NULL, pwd->pw_passwd);
+#ifdef __LP64__
+  ASSERT_EQ(NULL, pwd->pw_gecos);
+#endif
 
   if (uid_type == TYPE_SYSTEM) {
     EXPECT_STREQ("/", pwd->pw_dir);
diff --git a/tests/sys_resource_test.cpp b/tests/sys_resource_test.cpp
index bd974cb..d6d99a0 100644
--- a/tests/sys_resource_test.cpp
+++ b/tests/sys_resource_test.cpp
@@ -18,7 +18,7 @@
 
 #include <sys/resource.h>
 
-#if __GLIBC__
+#if defined(__GLIBC__)
 /* The host glibc we're currently building with doesn't have prlimit64 yet. */
 static int prlimit64(pid_t, int resource, const struct rlimit64* new_limit, struct rlimit64* old_limit) {
   if (new_limit != NULL) {
@@ -30,7 +30,7 @@
 #endif
 
 TEST(sys_resource, smoke) {
-#if __LP64__ || __GLIBC__
+#if defined(__LP64__) || defined(__GLIBC__)
   ASSERT_EQ(sizeof(rlimit), sizeof(rlimit64));
   ASSERT_EQ(8U, sizeof(rlim_t));
 #else
diff --git a/tests/unistd_test.cpp b/tests/unistd_test.cpp
index 2b51aad..ff05039 100644
--- a/tests/unistd_test.cpp
+++ b/tests/unistd_test.cpp
@@ -67,7 +67,7 @@
   ASSERT_EQ(reinterpret_cast<void*>(-1), sbrk(-(current_brk + 1)));
   ASSERT_EQ(ENOMEM, errno);
 
-#if !defined(__GLIBC__)
+#if defined(__BIONIC__)
   // The maximum negative value is an interesting special case that glibc gets wrong.
   ASSERT_EQ(reinterpret_cast<void*>(-1), sbrk(PTRDIFF_MIN));
   ASSERT_EQ(ENOMEM, errno);
diff --git a/tests/wchar_test.cpp b/tests/wchar_test.cpp
index 0d15f21..c8aec2d 100644
--- a/tests/wchar_test.cpp
+++ b/tests/wchar_test.cpp
@@ -87,6 +87,29 @@
   EXPECT_EQ(EILSEQ, errno);
 }
 
+TEST(wchar, wcrtomb_start_state) {
+  char out[MB_LEN_MAX];
+  mbstate_t ps;
+
+  // Any non-initial state is invalid when calling wcrtomb.
+  memset(&ps, 0, sizeof(ps));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(NULL, "\xc2", 1, &ps));
+  EXPECT_EQ(static_cast<size_t>(-1), wcrtomb(out, 0x00a2, &ps));
+  EXPECT_EQ(EILSEQ, errno);
+
+  // If the first argument to wcrtomb is NULL or the second is L'\0' the shift
+  // state should be reset.
+  memset(&ps, 0, sizeof(ps));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(NULL, "\xc2", 1, &ps));
+  EXPECT_EQ(1U, wcrtomb(NULL, 0x00a2, &ps));
+  EXPECT_TRUE(mbsinit(&ps));
+
+  memset(&ps, 0, sizeof(ps));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(NULL, "\xf0\xa4", 1, &ps));
+  EXPECT_EQ(1U, wcrtomb(out, L'\0', &ps));
+  EXPECT_TRUE(mbsinit(&ps));
+}
+
 TEST(wchar, wcstombs_wcrtombs) {
   const wchar_t chars[] = { L'h', L'e', L'l', L'l', L'o', 0 };
   const wchar_t bad_chars[] = { L'h', L'i', static_cast<wchar_t>(0xffffffff), 0 };
@@ -184,6 +207,14 @@
   EXPECT_EQ(EILSEQ, errno);
   bytes[3] = 0;
   EXPECT_STREQ("hix", bytes);
+
+  // Any non-initial state is invalid when calling wcsrtombs.
+  mbstate_t ps;
+  src = chars;
+  memset(&ps, 0, sizeof(ps));
+  ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(NULL, "\xc2", 1, &ps));
+  EXPECT_EQ(static_cast<size_t>(-1), wcsrtombs(NULL, &src, 0, &ps));
+  EXPECT_EQ(EILSEQ, errno);
 }
 
 TEST(wchar, limits) {
@@ -257,7 +288,7 @@
   // 4-byte UTF-8.
   ASSERT_EQ(4U, mbrtowc(out, "\xf0\xa4\xad\xa2" "ef", 6, NULL));
   ASSERT_EQ(static_cast<wchar_t>(0x24b62), out[0]);
-#if __BIONIC__ // glibc allows this.
+#if defined(__BIONIC__) // glibc allows this.
   // Illegal 5-byte UTF-8.
   ASSERT_EQ(static_cast<size_t>(-1), mbrtowc(out, "\xf8\xa1\xa2\xa3\xa4" "f", 6, NULL));
   ASSERT_EQ(EILSEQ, errno);
@@ -267,6 +298,83 @@
   ASSERT_EQ(EILSEQ, errno);
 }
 
+void test_mbrtowc_incomplete(mbstate_t* ps) {
+  ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
+  uselocale(LC_GLOBAL_LOCALE);
+
+  wchar_t out;
+  // 2-byte UTF-8.
+  ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xc2", 1, ps));
+  ASSERT_EQ(1U, mbrtowc(&out, "\xa2" "cdef", 5, ps));
+  ASSERT_EQ(0x00a2, out);
+  ASSERT_TRUE(mbsinit(ps));
+  // 3-byte UTF-8.
+  ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xe2", 1, ps));
+  ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\x82", 1, ps));
+  ASSERT_EQ(1U, mbrtowc(&out, "\xac" "def", 4, ps));
+  ASSERT_EQ(0x20ac, out);
+  ASSERT_TRUE(mbsinit(ps));
+  // 4-byte UTF-8.
+  ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xf0", 1, ps));
+  ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xa4\xad", 2, ps));
+  ASSERT_EQ(1U, mbrtowc(&out, "\xa2" "ef", 3, ps));
+  ASSERT_EQ(0x24b62, out);
+  ASSERT_TRUE(mbsinit(ps));
+
+  // Invalid 2-byte
+  ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xc2", 1, ps));
+  ASSERT_EQ(static_cast<size_t>(-1), mbrtowc(&out, "\x20" "cdef", 5, ps));
+  ASSERT_EQ(EILSEQ, errno);
+}
+
+TEST(wchar, mbrtowc_incomplete) {
+  mbstate_t ps;
+  memset(&ps, 0, sizeof(ps));
+
+  test_mbrtowc_incomplete(&ps);
+  test_mbrtowc_incomplete(NULL);
+}
+
+void test_mbsrtowcs(mbstate_t* ps) {
+  wchar_t out[4];
+
+  const char* valid = "A" "\xc2\xa2" "\xe2\x82\xac" "\xf0\xa4\xad\xa2" "ef";
+  ASSERT_EQ(4U, mbsrtowcs(out, &valid, 4, ps));
+  ASSERT_EQ(L'A', out[0]);
+  ASSERT_EQ(0x00a2, out[1]);
+  ASSERT_EQ(0x20ac, out[2]);
+  ASSERT_EQ(0x24b62, out[3]);
+  ASSERT_EQ('e', *valid);
+
+  const char* invalid = "A" "\xc2\x20" "ef";
+  ASSERT_EQ(static_cast<size_t>(-1), mbsrtowcs(out, &invalid, 4, ps));
+  EXPECT_EQ(EILSEQ, errno);
+  ASSERT_EQ('\xc2', *invalid);
+
+  const char* incomplete = "A" "\xc2";
+  ASSERT_EQ(static_cast<size_t>(-1), mbsrtowcs(out, &incomplete, 2, ps));
+  EXPECT_EQ(EILSEQ, errno);
+  ASSERT_EQ('\xc2', *incomplete);
+}
+
+TEST(wchar, mbsrtowcs) {
+  ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
+  uselocale(LC_GLOBAL_LOCALE);
+
+  mbstate_t ps;
+  memset(&ps, 0, sizeof(ps));
+  test_mbsrtowcs(&ps);
+  test_mbsrtowcs(NULL);
+
+  // Invalid multi byte continuation.
+  const char* invalid = "\x20";
+  wchar_t out;
+  ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xc2", 1, &ps));
+  ASSERT_EQ(static_cast<size_t>(-1), mbsrtowcs(&out, &invalid, 1, &ps));
+  EXPECT_EQ(EILSEQ, errno);
+  ASSERT_EQ('\x20', *invalid);
+}
+
 TEST(wchar, wcstod) {
   ASSERT_DOUBLE_EQ(1.23, wcstod(L"1.23", NULL));
 }