Merge "added headers for SuperH which automatically generate by update_all.py"
diff --git a/libc/Android.mk b/libc/Android.mk
index bafc118..f627640 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -171,7 +171,6 @@
 	stdlib/tolower_.c \
 	stdlib/toupper_.c \
 	stdlib/wchar.c \
-	string/bcopy.c \
 	string/index.c \
 	string/memccpy.c \
 	string/memchr.c \
@@ -182,7 +181,6 @@
 	string/strcasestr.c \
 	string/strcat.c \
 	string/strchr.c \
-	string/strcmp.c \
 	string/strcoll.c \
 	string/strcpy.c \
 	string/strcspn.c \
@@ -192,7 +190,6 @@
 	string/strlcat.c \
 	string/strlcpy.c \
 	string/strncat.c \
-	string/strncmp.c \
 	string/strncpy.c \
 	string/strndup.c \
 	string/strnlen.c \
@@ -299,6 +296,9 @@
 	arch-arm/bionic/strlen.c.arm \
 	arch-arm/bionic/syscall.S \
 	string/memmove.c.arm \
+	string/bcopy.c \
+	string/strcmp.c \
+	string/strncmp.c \
 	unistd/socketcalls.c
 
 # These files need to be arm so that gdbserver
@@ -329,12 +329,16 @@
 	arch-x86/bionic/setjmp.S \
 	arch-x86/bionic/_setjmp.S \
 	arch-x86/bionic/vfork.S \
-	arch-x86/string/bzero.S \
-	arch-x86/string/memset.S \
-	arch-x86/string/memcmp.S \
-	arch-x86/string/memcpy.S \
+	arch-x86/bionic/syscall.S \
+	arch-x86/string/bcopy_wrapper.S \
+	arch-x86/string/memcpy_wrapper.S \
+	arch-x86/string/memmove_wrapper.S \
+	arch-x86/string/bzero_wrapper.S \
+	arch-x86/string/memcmp_wrapper.S \
+	arch-x86/string/memset_wrapper.S \
+	arch-x86/string/strcmp_wrapper.S \
+	arch-x86/string/strncmp_wrapper.S \
 	arch-x86/string/strlen.S \
-	string/memmove.c \
 	bionic/pthread.c \
 	bionic/pthread-timers.c \
 	bionic/ptrace.c
@@ -366,6 +370,9 @@
 	arch-sh/bionic/__set_tls.c \
 	arch-sh/bionic/__get_tls.c \
 	arch-sh/bionic/ffs.S \
+	string/bcopy.c \
+	string/strcmp.c \
+	string/strncmp.c \
 	string/memcmp.c \
 	string/strlen.c \
 	bionic/eabi.c \
@@ -413,6 +420,10 @@
 else # !arm
   ifeq ($(TARGET_ARCH),x86)
     libc_crt_target_cflags := -m32
+
+    # Enable recent IA friendly memory routines (such as for Atom)
+    # These require SSE2/SSSE3 and will not work on earlier x86 machines
+    libc_common_cflags += -mtune=i686 -DUSE_SSSE3 -DUSE_SSE2
   endif # x86
 endif # !arm
 
diff --git a/libc/arch-x86/bionic/syscall.S b/libc/arch-x86/bionic/syscall.S
new file mode 100644
index 0000000..71abe6b
--- /dev/null
+++ b/libc/arch-x86/bionic/syscall.S
@@ -0,0 +1,52 @@
+/*
+ * Generic syscall call.
+ * Register usage at the int $0x80 trap (arguments arrive on the stack):
+ *	%eax: system call number
+ *	%ebx: arg0 to system call
+ *	%ecx: arg..
+ *	%edx: arg..
+ *	%esi: arg..
+ *	%edi: arg..
+ * We push these (to save them), load them up with the
+ * values from the calling frame (not all will actually be valid)
+ * and make the syscall.
+ */
+
+#include <sys/linux-syscalls.h>
+
+    .text
+    .type syscall, @function
+    .globl syscall
+    .align 4
+
+syscall:
+    push    %eax                /* scratch slot only; dropped on exit */
+    push    %ebx
+    push    %ecx
+    push    %edx
+    push    %esi
+    push    %edi
+    mov     28(%esp),%eax       /* 24 bytes of saves + return address */
+    mov     32(%esp),%ebx
+    mov     36(%esp),%ecx
+    mov     40(%esp),%edx
+    mov     44(%esp),%esi
+    mov     48(%esp),%edi
+
+    int     $0x80
+
+    cmpl    $-129, %eax         /* unsigned compare: -129..-1 is an error */
+    jb      1f
+    negl    %eax
+    pushl   %eax
+    call    __set_errno
+    addl    $4, %esp
+    orl     $-1, %eax           /* error return value is -1 */
+1:
+    pop    %edi
+    pop    %esi
+    pop    %edx
+    pop    %ecx
+    pop    %ebx
+    addl   $4, %esp             /* drop saved %eax: keep the result in %eax */
+    ret
diff --git a/libc/arch-x86/include/machine/_types.h b/libc/arch-x86/include/machine/_types.h
index 3a31e22..be4f6e4 100644
--- a/libc/arch-x86/include/machine/_types.h
+++ b/libc/arch-x86/include/machine/_types.h
@@ -36,10 +36,23 @@
 #define _I386__TYPES_H_
 
 /* the kernel defines size_t as unsigned int, but g++ wants it to be unsigned long */
-#define _SIZE_T
+#ifndef _SIZE_T
+#  define _SIZE_T
+#  ifdef ANDROID
+     typedef unsigned int  size_t;
+#  else
+     typedef unsigned long  size_t;
+#  endif
+#endif
+#if !defined(_SSIZE_T) && !defined(_SSIZE_T_DEFINED_)
+#define _SSIZE_T
+#define _SSIZE_T_DEFINED_
+typedef long int       ssize_t;
+#endif
+#ifndef _PTRDIFF_T
 #define _PTRDIFF_T
-typedef unsigned int   size_t;
-typedef int            ptrdiff_t;
+typedef long           ptrdiff_t;
+#endif
 
 #define _OFF_T_DEFINED_
 #define _SIZE_T_DEFINED_
diff --git a/libc/arch-x86/string/bcopy_wrapper.S b/libc/arch-x86/string/bcopy_wrapper.S
new file mode 100644
index 0000000..fa8774c
--- /dev/null
+++ b/libc/arch-x86/string/bcopy_wrapper.S
@@ -0,0 +1,45 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#if defined(USE_SSSE3)
+
+# include "cache_wrapper.S"
+# undef __i686
+# define MEMCPY	bcopy
+# define USE_AS_MEMMOVE
+# define USE_AS_BCOPY
+# include "ssse3-memcpy5.S"
+
+#else
+
+# include "bcopy.S"
+
+#endif
diff --git a/libc/arch-x86/string/bzero_wrapper.S b/libc/arch-x86/string/bzero_wrapper.S
new file mode 100644
index 0000000..aa1bb9c
--- /dev/null
+++ b/libc/arch-x86/string/bzero_wrapper.S
@@ -0,0 +1,43 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(USE_SSE2)
+
+# include "cache_wrapper.S"
+# undef __i686
+# define USE_AS_BZERO
+# define sse2_memset5_atom bzero
+# include "sse2-memset5-atom.S"
+
+#else
+
+# include "bzero.S"
+
+#endif
diff --git a/libc/arch-x86/string/cache_wrapper.S b/libc/arch-x86/string/cache_wrapper.S
new file mode 100644
index 0000000..d9aff5c
--- /dev/null
+++ b/libc/arch-x86/string/cache_wrapper.S
@@ -0,0 +1,35 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Values are optimized for Atom */
+#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
+#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
+#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
+#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/memcmp_wrapper.S
new file mode 100644
index 0000000..7e28c1e
--- /dev/null
+++ b/libc/arch-x86/string/memcmp_wrapper.S
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(USE_SSSE3)
+
+# define MEMCMP memcmp
+# include "ssse3-memcmp3.S"
+
+#else
+
+# include "memcmp.S"
+
+#endif
diff --git a/libc/arch-x86/string/memcpy_wrapper.S b/libc/arch-x86/string/memcpy_wrapper.S
new file mode 100644
index 0000000..7e765ea
--- /dev/null
+++ b/libc/arch-x86/string/memcpy_wrapper.S
@@ -0,0 +1,43 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(USE_SSSE3)
+
+# include "cache_wrapper.S"
+# undef __i686
+# define MEMCPY	memcpy
+# define USE_AS_MEMMOVE
+# include "ssse3-memcpy5.S"
+
+#else
+
+# include "memcpy.S"
+
+#endif
diff --git a/libc/arch-x86/string/memmove_wrapper.S b/libc/arch-x86/string/memmove_wrapper.S
new file mode 100644
index 0000000..7e83e27
--- /dev/null
+++ b/libc/arch-x86/string/memmove_wrapper.S
@@ -0,0 +1,43 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(USE_SSSE3)
+
+# include "cache_wrapper.S"
+# undef __i686
+# define MEMCPY memmove
+# define USE_AS_MEMMOVE
+# include "ssse3-memcpy5.S"
+
+#else
+
+# include "memmove.S"
+
+#endif
diff --git a/libc/arch-x86/string/memset_wrapper.S b/libc/arch-x86/string/memset_wrapper.S
new file mode 100644
index 0000000..d037a50
--- /dev/null
+++ b/libc/arch-x86/string/memset_wrapper.S
@@ -0,0 +1,42 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(USE_SSE2)
+
+# include "cache_wrapper.S"
+# undef __i686
+# define sse2_memset5_atom memset
+# include "sse2-memset5-atom.S"
+
+#else
+
+# include "memset.S"
+
+#endif
diff --git a/libc/arch-x86/string/sse2-memset5-atom.S b/libc/arch-x86/string/sse2-memset5-atom.S
new file mode 100644
index 0000000..59a598c
--- /dev/null
+++ b/libc/arch-x86/string/sse2-memset5-atom.S
@@ -0,0 +1,907 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)		.cfi_restore (reg)
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.   */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+	ALIGN (4)
+ENTRY (sse2_memset5_atom)
+	ENTRANCE
+
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	CHR(%esp), %eax
+	movb	%al, %ah
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
+	movl	DEST(%esp), %edx
+	cmp	$32, %ecx
+	jae	L(32bytesormore)
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX >= 32; EDX alignment is checked below.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	punpcklbw %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX >= 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %eax
+	add	%eax, %ecx
+	movd	%xmm0, %eax
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore):
+#ifdef SHARED_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$SHARED_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_shared_cache_size, %ebx
+# endif
+#endif
+	cmp	%ebx, %ecx
+	jae	L(128bytesormore_nt_start)
+
+
+#ifdef DATA_CACHE_SIZE
+	POP (%ebx)
+	cmp	$DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+	POP (%ebx)
+	cmp	__x86_data_cache_size, %ecx
+# endif
+#endif
+
+	jae	L(128bytes_L2_normal)
+	subl	$128, %ecx
+L(128bytesormore_normal):
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jae	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	lea	128(%ecx), %ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytes_L2_normal):
+	prefetcht0	0x380(%edx)
+	prefetcht0	0x3c0(%edx)
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
+	add	$128, %edx
+	cmp	$128, %ecx
+	jae	L(128bytes_L2_normal)
+
+L(128bytesless_L2_normal):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+L(128bytesormore_nt_start):
+	sub	%ebx, %ecx
+	ALIGN (4)
+L(128bytesormore_shared_cache_loop):
+	prefetcht0	0x3c0(%edx)
+	prefetcht0	0x380(%edx)
+	sub	$0x80, %ebx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ebx
+	jae	L(128bytesormore_shared_cache_loop)
+	cmp	$0x80, %ecx
+	jb	L(shared_cache_loop_end)
+	ALIGN (4)
+L(128bytesormore_nt):
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm0, 0x10(%edx)
+	movntdq	%xmm0, 0x20(%edx)
+	movntdq	%xmm0, 0x30(%edx)
+	movntdq	%xmm0, 0x40(%edx)
+	movntdq	%xmm0, 0x50(%edx)
+	movntdq	%xmm0, 0x60(%edx)
+	movntdq	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ecx
+	jae	L(128bytesormore_nt)
+	sfence
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (sse2_memset5_atom)
diff --git a/libc/arch-x86/string/ssse3-memcmp3.S b/libc/arch-x86/string/ssse3-memcmp3.S
new file mode 100644
index 0000000..a7ce819
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-memcmp3.S
@@ -0,0 +1,2027 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef MEMCMP	/* name of the exported entry point; overridable by defining MEMCMP first */
+# define MEMCMP		ssse3_memcmp3_new
+#endif
+
+#ifndef L	/* L(x) spells a label as .Lx */
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN	/* ALIGN(n) pads to a 2^n-byte boundary via .p2align */
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc	/* the cfi_* macros below forward to the assembler's .cfi_* directives */
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)		.cfi_restore (reg)
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY	/* ENTRY(name): global function symbol, 16-byte aligned, opens a CFI region */
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END	/* END(name): closes the CFI region and records the symbol size */
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)	/* unwind note for a 4-byte push: CFA offset +4, REG saved at the new stack top */
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)	/* unwind note for the matching pop */
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* push plus its unwind note */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* pop plus its unwind note */
+
+#define PARMS		4	/* %esp offset of the first argument at entry, before any push */
+#define BLK1		PARMS	/* first buffer pointer */
+#define BLK2		BLK1+4	/* second buffer pointer */
+#define LEN		BLK2+4	/* byte count */
+#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret	/* undo the pushes (made as ebx, esi, edi) and return */
+#define RETURN		RETURN_END; CFI_PUSH (%ebx); CFI_PUSH (%esi); \
+			CFI_PUSH (%edi)	/* after the ret, re-describe the frame for the following code in real push order: ebx, esi, edi */
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (MEMCMP)	/* entry: load (blk1, blk2, len) from the stack and dispatch on len */
+	movl	LEN(%esp), %ecx
+	movl	BLK1(%esp), %eax
+	cmp	$48, %ecx	/* flags survive the movl below and feed the jae */
+	movl	BLK2(%esp), %edx
+	jae	L(48bytesormore)	/* len >= 48 (unsigned): SSE path */
+	cmp	$1, %ecx
+	jbe	L(less1bytes)	/* len is 0 or 1 */
+	PUSH (%ebx)	/* len 2..47: L(less48bytes) is entered with only %ebx saved */
+	add	%ecx, %edx	/* both pointers now point one past the end of their buffer */
+	add	%ecx, %eax
+	jmp	L(less48bytes)
+
+	CFI_POP (%ebx)	/* CFI only: the code below is reached with nothing pushed */
+	ALIGN (4)
+L(less1bytes):
+	jb	L(zero)	/* flags still from cmp $1: len == 0 */
+	movb	(%eax), %cl
+	cmp	(%edx), %cl	/* unsigned compare of the single byte pair */
+	je	L(zero)
+	mov	$1, %eax	/* mov leaves the flags intact for the ja */
+	ja	L(1bytesend)	/* blk1 byte above blk2 byte: return 1 */
+	neg	%eax	/* otherwise return -1 */
+L(1bytesend):
+	ret
+
+	ALIGN (4)
+L(zero):	/* buffers equal (or len == 0): return 0 */
+	mov	$0, %eax
+	ret
+
+	ALIGN (4)
+L(48bytesormore):	/* len >= 48: test 16 bytes unaligned, align the blk1 cursor, then pick a handler by blk2's residual offset */
+	PUSH (%ebx)
+	PUSH (%esi)
+	PUSH (%edi)
+	movdqu    (%eax), %xmm3	/* first 16 bytes of each buffer, unaligned loads */
+	movdqu    (%edx), %xmm0
+	movl	%eax, %edi	/* from here %edi walks blk1 and %esi walks blk2 */
+	movl	%edx, %esi
+	pcmpeqb   %xmm0, %xmm3
+	pmovmskb  %xmm3, %edx	/* one mask bit per equal byte */
+	lea	16(%edi), %edi
+
+	sub      $0xffff, %edx	/* zero iff all 16 bytes matched */
+	lea	16(%esi), %esi	/* lea keeps the flags from the sub */
+	jnz	  L(less16bytes)	/* mismatch inside the first 16 bytes */
+	mov	%edi, %edx
+	and	$0xf, %edx	/* misalignment of the blk1 cursor */
+	xor	%edx, %edi	/* round %edi down to a 16-byte boundary */
+	sub	%edx, %esi	/* step blk2 back by the same amount */
+	add	%edx, %ecx	/* and add the stepped-back bytes to the count */
+	mov	%esi, %edx
+	and	$0xf, %edx	/* blk2 offset within its 16-byte block = shift amount */
+	jz	L(shr_0)	/* both cursors aligned */
+	xor	%edx, %esi	/* round %esi down; %edx keeps the shift (1..15) */
+
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)	/* never taken: a zero shift already left via the jz above */
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	ALIGN (4)
+L(next_unaligned_table):	/* shifts 8..15 */
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+
+	ALIGN (4)
+L(shr_0):	/* both cursors 16-byte aligned; an adjusted count below 80 takes one 32-byte step */
+	cmp	$80, %ecx
+	jae	L(shr_0_gobble)
+	lea	-48(%ecx), %ecx	/* bytes that remain after the 32 compared below */
+	xor	%eax, %eax	/* shift amount 0; presumably read by L(exit), which is not visible here */
+	movaps	(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+	movaps	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+	pand	%xmm1, %xmm2	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm2, %edx
+	add	$32, %edi
+	add	$32, %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)	/* %xmm1 still holds the first-half mask */
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	(%ecx, %esi,1), %edx
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_0_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	lea	-48(%ecx), %ecx
+	movdqa	(%esi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+	movdqa	32(%esi), %xmm0
+	movdqa	48(%esi), %xmm2
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	pcmpeqb	32(%edi), %xmm0
+	pcmpeqb	48(%edi), %xmm2
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	jz	L(shr_0_gobble_loop)	/* flags still from the sbb */
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %ecx
+	jge	L(shr_0_gobble_loop_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_0_gobble_loop_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm2, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_1):	/* blk2 data starts 1 byte past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_1_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+1, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+1 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	1(%ecx, %esi,1), %edx	/* re-apply the 1-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_1_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$1,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$1,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$1,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_1_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_1_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_1_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	1(%ecx, %esi,1), %edx	/* re-apply the 1-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_2):	/* blk2 data starts 2 bytes past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_2_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+2, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+2 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	2(%ecx, %esi,1), %edx	/* re-apply the 2-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_2_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$2,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$2,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$2,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_2_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_2_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_2_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	2(%ecx, %esi,1), %edx	/* re-apply the 2-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_3):	/* blk2 data starts 3 bytes past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_3_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+3, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+3 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	3(%ecx, %esi,1), %edx	/* re-apply the 3-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_3_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$3,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$3,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$3,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_3_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_3_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_3_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	3(%ecx, %esi,1), %edx	/* re-apply the 3-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_4):	/* blk2 data starts 4 bytes past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_4_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+4, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+4 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	4(%ecx, %esi,1), %edx	/* re-apply the 4-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_4_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$4,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$4,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$4,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_4_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_4_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_4_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	4(%ecx, %esi,1), %edx	/* re-apply the 4-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_5):	/* blk2 data starts 5 bytes past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_5_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+5, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+5 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	5(%ecx, %esi,1), %edx	/* re-apply the 5-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_5_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$5,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$5,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$5,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_5_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_5_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_5_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	5(%ecx, %esi,1), %edx	/* re-apply the 5-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_6):	/* blk2 data starts 6 bytes past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_6_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+6, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+6 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	6(%ecx, %esi,1), %edx	/* re-apply the 6-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_6_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$6,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$6,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$6,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_6_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_6_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_6_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	6(%ecx, %esi,1), %edx	/* re-apply the 6-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_7):	/* blk2 data starts 7 bytes past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_7_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$7,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+7, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+7 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	7(%ecx, %esi,1), %edx	/* re-apply the 7-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_7_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$7,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$7,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$7,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_7_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_7_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_7_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	7(%ecx, %esi,1), %edx	/* re-apply the 7-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_8):	/* blk2 data starts 8 bytes past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_8_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$8,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+8, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+8 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	8(%ecx, %esi,1), %edx	/* re-apply the 8-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_8_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$8,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$8,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$8,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_8_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_8_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_8_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	8(%ecx, %esi,1), %edx	/* re-apply the 8-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_9):	/* blk2 data starts 9 bytes past the aligned %esi; %edi (blk1) is aligned */
+	cmp	$80, %ecx	/* flags consumed by the jae below; lea and mov leave them alone */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* keep the shift amount in %eax */
+	jae	L(shr_9_gobble)	/* adjusted count >= 80: looped variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$9,(%esi), %xmm1	/* %xmm1 = the 16 bytes at %esi+9, spliced from two aligned blocks */
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,%xmm2, %xmm3	/* %xmm3 = the 16 bytes at %esi+16+9 */
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays 0xff only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes matched */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	9(%ecx, %esi,1), %edx	/* re-apply the 9-byte shift to the blk2 pointer */
+	POP (%edi)	/* L(less48bytes) is entered with only %ebx saved */
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)	/* CFI only: %esi/%edi are still pushed on the path below */
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_9_gobble):	/* looped variant: 32 bytes per iteration, next pair compared ahead */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$9,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx	/* CF = count underflow, folded in by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* save the first-half mask of the pair being tested */
+
+	movdqa	64(%esi), %xmm3
+	palignr	$9,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* zero iff all 32 bytes matched and no underflow */
+	movdqa	48(%esi), %xmm0
+	palignr	$9,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_9_gobble_loop)	/* flags still from the sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_9_gobble_next)	/* count not exhausted: the loop stopped on a mismatch */
+	inc	%edx	/* cancel the borrow the sbb subtracted */
+	add	$32, %ecx	/* cancel the last sub */
+L(shr_9_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* mismatch in the pair already tested */
+
+	pmovmskb %xmm3, %edx	/* now test the pair that was compared ahead */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* tail: end pointers = cursor + remaining count */
+	lea	9(%ecx, %esi,1), %edx	/* re-apply the 9-byte shift to the blk2 pointer */
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_10):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_10_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$10, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_10_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$10, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$10,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$10,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_10_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_10_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_10_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_11):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_11_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$11, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_11_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$11, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$11,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$11,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_11_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_11_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_11_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_12):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_12_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$12, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_12_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$12, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$12,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$12,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_12_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_12_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_12_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_13):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_13_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$13, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_13_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$13, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$13,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$13,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_13_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_13_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_14):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_14_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$14, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_14_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$14, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$14,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$14,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_14_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_14_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_15):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_15_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$15, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(shr_15_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$15, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$15,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$15,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_15_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_15_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(exit):
+	pmovmskb %xmm1, %ebx
+	sub	$0xffff, %ebx
+	jz	L(first16bytes)
+	lea	-16(%esi), %esi
+	lea	-16(%edi), %edi
+	mov	%ebx, %edx
+L(first16bytes):
+	add	%eax, %esi
+L(less16bytes):
+	test	%dl, %dl
+	jz	L(next_24_bytes)
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+L(Byte23):
+	movzbl	 -9(%edi), %eax
+	movzbl	 -9(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte16):
+	movzbl	 -16(%edi), %eax
+	movzbl	 -16(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte17):
+	movzbl	 -15(%edi), %eax
+	movzbl	 -15(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte18):
+	movzbl	 -14(%edi), %eax
+	movzbl	 -14(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte19):
+	movzbl	 -13(%edi), %eax
+	movzbl	 -13(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte20):
+	movzbl	 -12(%edi), %eax
+	movzbl	 -12(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte21):
+	movzbl	 -11(%edi), %eax
+	movzbl	 -11(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte22):
+	movzbl	 -10(%edi), %eax
+	movzbl	 -10(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(next_24_bytes):
+	lea	8(%edi), %edi
+	lea	8(%esi), %esi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	ALIGN (4)
+L(Byte31):
+	movzbl	 -9(%edi), %eax
+	movzbl	 -9(%esi), %edx
+	sub	%edx, %eax
+	RETURN_END
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(more8bytes):
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)
+
+	ALIGN (4)
+L(more16bytes):
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)
+
+	ALIGN (4)
+L(more24bytes):
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)
+
+	ALIGN (4)
+L(more32bytes):
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)
+
+	ALIGN (4)
+L(more40bytes):
+	cmp	$40, %ecx
+	je	L(40bytes)
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)
+
+	ALIGN (4)
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+
+
+	ALIGN (4)
+L(44bytes):
+	mov	-44(%eax), %ecx
+	mov	-44(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	mov	-40(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	mov	-36(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	mov	-32(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	mov	-28(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	mov	-24(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	mov	-20(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(45bytes):
+	mov	-45(%eax), %ecx
+	mov	-45(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(41bytes):
+	mov	-41(%eax), %ecx
+	mov	-41(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(37bytes):
+	mov	-37(%eax), %ecx
+	mov	-37(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(33bytes):
+	mov	-33(%eax), %ecx
+	mov	-33(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(29bytes):
+	mov	-29(%eax), %ecx
+	mov	-29(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(25bytes):
+	mov	-25(%eax), %ecx
+	mov	-25(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(21bytes):
+	mov	-21(%eax), %ecx
+	mov	-21(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(46bytes):
+	mov	-46(%eax), %ecx
+	mov	-46(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(42bytes):
+	mov	-42(%eax), %ecx
+	mov	-42(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(38bytes):
+	mov	-38(%eax), %ecx
+	mov	-38(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(34bytes):
+	mov	-34(%eax), %ecx
+	mov	-34(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(30bytes):
+	mov	-30(%eax), %ecx
+	mov	-30(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(26bytes):
+	mov	-26(%eax), %ecx
+	mov	-26(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(22bytes):
+	mov	-22(%eax), %ecx
+	mov	-22(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(47bytes):
+	movl	-47(%eax), %ecx
+	movl	-47(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%eax), %ecx
+	movl	-43(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%eax), %ecx
+	movl	-39(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%eax), %ecx
+	movl	-35(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%eax), %ecx
+	movl	-31(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%eax), %ecx
+	movl	-27(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%eax), %ecx
+	movl	-23(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(find_diff):
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP (%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+
+END (MEMCMP)
diff --git a/libc/arch-x86/string/ssse3-memcpy5.S b/libc/arch-x86/string/ssse3-memcpy5.S
new file mode 100644
index 0000000..6b90402
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-memcpy5.S
@@ -0,0 +1,1770 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef MEMCPY	/* Symbol name of the routine emitted below through ENTRY/END; predefine MEMCPY to override it.  */
+# define MEMCPY         ssse3_memcpy5
+#endif
+
+#ifndef L	/* L(foo) expands to the label name .Lfoo.  */
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN	/* ALIGN(n) emits .p2align n, i.e. pads to a 2**n-byte boundary.  */
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc	/* The cfi_* wrappers below map one-to-one onto the assembler's .cfi_* directives; each is defined only if not already provided.  */
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset	/* cfi_rel_offset(reg, off): reg was saved at offset off relative to the current CFA register.  */
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore	/* cfi_restore(reg): reg's unwind rule goes back to what it was at the start of the function.  */
+# define cfi_restore(reg)		.cfi_restore (reg)
+#endif
+
+#ifndef cfi_adjust_cfa_offset	/* cfi_adjust_cfa_offset(off): the CFA offset changes by off (relative adjustment).  */
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY	/* ENTRY(name): declare NAME as a global function symbol, align it with .p2align 4 (16 bytes), emit its label and open a CFI region.  */
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END	/* END(name): close the CFI region and set NAME's symbol size to the distance from its label to this point.  */
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+#ifdef USE_AS_BCOPY	/* Argument offsets, used as SRC(%esp) etc. after ENTRANCE; in this variant the source comes first, then the destination, then the length.  */
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else	/* Default argument order: destination, source, length.  PARMS is defined further down; macros expand lazily, so the order is fine.  */
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)	/* Unwind bookkeeping for a 4-byte push of REG: CFA offset grows by 4 and REG is recorded as saved at offset 0.  */
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)	/* Unwind bookkeeping for a 4-byte pop of REG: CFA offset shrinks by 4 and REG is marked restored.  */
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push REG and emit the matching unwind annotations.  */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* Pop REG and emit the matching unwind annotations.  */
+
+#ifdef SHARED	/* SHARED build: jump tables hold relative offsets and are reached PC-relatively through %ebx, which ENTRANCE saves.  */
+# define PARMS		8		/* Preserve EBX: arguments start 8 bytes above %esp (return address + saved EBX).  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)	/* Return, then re-declare %ebx as pushed for the unwind info of the code placed after the ret.  */
+# define JMPTBL(I, B)	I - B	/* Table entry: offset of I relative to B.  */
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx	
+			/* _VALUE and _TAIL split the sequence above without the thunk call: _VALUE adds (TABLE - .) to %ebx, which yields TABLE only if %ebx already holds the address of that addl -- presumably why it is used as the first instruction of labels reached through "jmp *%ebx"; _TAIL then indexes the table and jumps.  */
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx	/* Top of stack is the return address, i.e. the address of the instruction after the caller's call.  */
+	ret
+#else	/* Non-SHARED build: %ebx is not saved, arguments start 4 bytes above %esp (return address only).  */
+# define PARMS		4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute addresses.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			/* Expands to nothing: the table is addressed absolutely, so there is nothing to precompute.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$32, %ecx
+	jae	L(memmove_bwd)
+	jmp	L(bk_write_less32bytes_2)
+L(memmove_bwd):
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jb	L(bk_write)
+#endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+	mov	%eax, %edi
+	jae	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	POP (%esi)
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+
+	POP (%edi)
+	lea	-128(%ecx), %ecx
+	jae	L(shl_0_gobble_mem_loop)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_0_gobble_mem_loop):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x280(%eax)
+	prefetcht0 0x1c0(%edx)
+
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-1(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_1_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_1_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_1_loop)
+
+L(shl_1_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-2(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_2_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_2_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_2_loop)
+
+L(shl_2_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-3(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_3_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_3_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_3_loop)
+
+L(shl_3_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-4(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_4_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_4_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_4_loop)
+
+L(shl_4_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_5):	/* Forward copy for a source that is 5 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-5(%eax), %eax	/* Back the source pointer up by 5 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_5_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$5, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 5 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$5, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 5 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_5_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_5_loop)	/* No borrow: keep going.  */
+
+L(shl_5_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	5(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 5 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+
+	ALIGN (4)
+L(shl_6):	/* Forward copy for a source that is 6 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-6(%eax), %eax	/* Back the source pointer up by 6 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_6_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$6, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 6 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$6, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 6 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_6_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_6_loop)	/* No borrow: keep going.  */
+
+L(shl_6_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	6(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 6 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(shl_7):	/* Forward copy for a source that is 7 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-7(%eax), %eax	/* Back the source pointer up by 7 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_7_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$7, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 7 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$7, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 7 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_7_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_7_loop)	/* No borrow: keep going.  */
+
+L(shl_7_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	7(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 7 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(shl_8):	/* Forward copy for a source that is 8 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-8(%eax), %eax	/* Back the source pointer up by 8 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_8_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$8, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 8 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$8, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 8 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_8_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_8_loop)	/* No borrow: keep going.  */
+
+L(shl_8_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	8(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 8 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(shl_9):	/* Forward copy for a source that is 9 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-9(%eax), %eax	/* Back the source pointer up by 9 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_9_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$9, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 9 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$9, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 9 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_9_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_9_loop)	/* No borrow: keep going.  */
+
+L(shl_9_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	9(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 9 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(shl_10):	/* Forward copy for a source that is 10 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-10(%eax), %eax	/* Back the source pointer up by 10 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_10_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$10, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 10 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$10, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 10 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_10_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_10_loop)	/* No borrow: keep going.  */
+
+L(shl_10_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	10(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 10 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(shl_11):	/* Forward copy for a source that is 11 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-11(%eax), %eax	/* Back the source pointer up by 11 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_11_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$11, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 11 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$11, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 11 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_11_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_11_loop)	/* No borrow: keep going.  */
+
+L(shl_11_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	11(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 11 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(shl_12):	/* Forward copy for a source that is 12 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-12(%eax), %eax	/* Back the source pointer up by 12 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_12_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$12, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 12 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$12, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 12 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_12_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_12_loop)	/* No borrow: keep going.  */
+
+L(shl_12_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	12(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 12 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(shl_13):	/* Forward copy for a source that is 13 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-13(%eax), %eax	/* Back the source pointer up by 13 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_13_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$13, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 13 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$13, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 13 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_13_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_13_loop)	/* No borrow: keep going.  */
+
+L(shl_13_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	13(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 13 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(shl_14):	/* Forward copy for a source that is 14 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-14(%eax), %eax	/* Back the source pointer up by 14 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_14_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$14, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 14 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$14, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 14 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_14_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_14_loop)	/* No borrow: keep going.  */
+
+L(shl_14_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	14(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 14 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+
+	ALIGN (4)
+L(shl_15):	/* Forward copy for a source that is 15 bytes past a 16-byte boundary: aligned loads are spliced with palignr, 32 bytes per half-loop.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the dispatch at the end -- TODO confirm.  */
+	lea	-15(%eax), %eax	/* Back the source pointer up by 15 so the movaps below (which needs 16-byte alignment) can be used.  */
+	movaps	(%eax), %xmm1	/* First aligned source block; carried into the first palignr pair.  */
+	xor	%edi, %edi	/* EDI = byte offset into both buffers, starting at 0.  */
+	lea	-32(%ecx), %ecx	/* Bias the count by -32 so the loop exits once fewer than 32 bytes would remain.  */
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	POP (%esi)
+L(shl_15_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the jb below; the SSE moves, palignr and lea in between do not modify EFLAGS.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Save the newest block; it is the carry-in for the second half.  */
+	palignr	$15, %xmm2, %xmm3	/* XMM3 = 16 bytes starting 15 bytes into XMM2 and continuing into XMM3.  */
+	palignr	$15, %xmm1, %xmm2	/* XMM2 = 16 bytes starting 15 bytes into XMM1 and continuing into XMM2.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores; EDI was already advanced, hence the -32/-16 displacements.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_15_end)	/* Borrow from the sub above: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2	/* Second half: same steps with XMM1 and XMM4 swapped, so the carried block needs no extra move.  */
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_15_loop)	/* No borrow: keep going.  */
+
+L(shl_15_end):
+	lea	32(%ecx), %ecx	/* Add 32 back: ECX = entry count minus 32 per half-loop executed, i.e. the bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination just past the leftover bytes.  */
+	lea	15(%edi, %eax), %eax	/* EAX = source just past the leftover bytes (re-adding the 15 subtracted at entry).  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+
+	ALIGN (4)
+L(fwd_write_44bytes):	/* Forward tail for lengths divisible by 4: EAX/EDX point just past the bytes to copy; each label moves one dword and falls through to the next.  */
+	movl	-44(%eax), %ecx
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):	/* Nothing left to copy: only the return value remains to be set.  */
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: result is EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise reload the DEST stack slot (offset macro defined outside this chunk).  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes):	/* Forward tail for exactly 5 bytes: two dword moves that overlap by 3 bytes.  */
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax	/* EAX is reused as data scratch; both loads are done before either store.  */
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: result is EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise reload the DEST stack slot (offset macro defined outside this chunk).  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes):	/* Forward tail for lengths of the form 4k+1: EAX/EDX point just past the bytes to copy; each label moves one dword and falls through.  */
+	movl	-45(%eax), %ecx
+	movl	%ecx, -45(%edx)
+L(fwd_write_41bytes):
+	movl	-41(%eax), %ecx
+	movl	%ecx, -41(%edx)
+L(fwd_write_37bytes):
+	movl	-37(%eax), %ecx
+	movl	%ecx, -37(%edx)
+L(fwd_write_33bytes):
+	movl	-33(%eax), %ecx
+	movl	%ecx, -33(%edx)
+L(fwd_write_29bytes):
+	movl	-29(%eax), %ecx
+	movl	%ecx, -29(%edx)
+L(fwd_write_25bytes):
+	movl	-25(%eax), %ecx
+	movl	%ecx, -25(%edx)
+L(fwd_write_21bytes):
+	movl	-21(%eax), %ecx
+	movl	%ecx, -21(%edx)
+L(fwd_write_17bytes):
+	movl	-17(%eax), %ecx
+	movl	%ecx, -17(%edx)
+L(fwd_write_13bytes):
+	movl	-13(%eax), %ecx
+	movl	%ecx, -13(%edx)
+L(fwd_write_9bytes):
+	movl	-9(%eax), %ecx
+	movl	%ecx, -9(%edx)
+	movl	-5(%eax), %ecx	/* The dword at -5 is copied inline here; L(fwd_write_5bytes) is a separate stand-alone routine.  */
+	movl	%ecx, -5(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx	/* Final single byte.  */
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: result is EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise reload the DEST stack slot (offset macro defined outside this chunk).  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes):	/* Forward tail for lengths of the form 4k+2: EAX/EDX point just past the bytes to copy; each label moves one dword and falls through.  */
+	movl	-46(%eax), %ecx
+	movl	%ecx, -46(%edx)
+L(fwd_write_42bytes):
+	movl	-42(%eax), %ecx
+	movl	%ecx, -42(%edx)
+L(fwd_write_38bytes):
+	movl	-38(%eax), %ecx
+	movl	%ecx, -38(%edx)
+L(fwd_write_34bytes):
+	movl	-34(%eax), %ecx
+	movl	%ecx, -34(%edx)
+L(fwd_write_30bytes):
+	movl	-30(%eax), %ecx
+	movl	%ecx, -30(%edx)
+L(fwd_write_26bytes):
+	movl	-26(%eax), %ecx
+	movl	%ecx, -26(%edx)
+L(fwd_write_22bytes):
+	movl	-22(%eax), %ecx
+	movl	%ecx, -22(%edx)
+L(fwd_write_18bytes):
+	movl	-18(%eax), %ecx
+	movl	%ecx, -18(%edx)
+L(fwd_write_14bytes):
+	movl	-14(%eax), %ecx
+	movl	%ecx, -14(%edx)
+L(fwd_write_10bytes):
+	movl	-10(%eax), %ecx
+	movl	%ecx, -10(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx	/* Final 16-bit word.  */
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: result is EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise reload the DEST stack slot (offset macro defined outside this chunk).  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes):	/* Forward tail for lengths of the form 4k+3: EAX/EDX point just past the bytes to copy; each label moves one dword and falls through.  */
+	movl	-47(%eax), %ecx
+	movl	%ecx, -47(%edx)
+L(fwd_write_43bytes):
+	movl	-43(%eax), %ecx
+	movl	%ecx, -43(%edx)
+L(fwd_write_39bytes):
+	movl	-39(%eax), %ecx
+	movl	%ecx, -39(%edx)
+L(fwd_write_35bytes):
+	movl	-35(%eax), %ecx
+	movl	%ecx, -35(%edx)
+L(fwd_write_31bytes):
+	movl	-31(%eax), %ecx
+	movl	%ecx, -31(%edx)
+L(fwd_write_27bytes):
+	movl	-27(%eax), %ecx
+	movl	%ecx, -27(%edx)
+L(fwd_write_23bytes):
+	movl	-23(%eax), %ecx
+	movl	%ecx, -23(%edx)
+L(fwd_write_19bytes):
+	movl	-19(%eax), %ecx
+	movl	%ecx, -19(%edx)
+L(fwd_write_15bytes):
+	movl	-15(%eax), %ecx
+	movl	%ecx, -15(%edx)
+L(fwd_write_11bytes):
+	movl	-11(%eax), %ecx
+	movl	%ecx, -11(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx	/* Last 3 bytes: one 16-bit word plus one byte, both loaded before the stores.  */
+	movzbl	-1(%eax), %eax	/* EAX is reused as data scratch; the source pointer is not read again.  */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: result is EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise reload the DEST stack slot (offset macro defined outside this chunk).  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(large_page):	/* Forward copy using non-temporal stores (movntdq): 128 bytes per loop pass, then 64- and 32-byte steps, then the table tail.  */
+	movdqu	(%eax), %xmm1
+	lea	16(%eax), %eax
+	movdqu	%xmm0, (%esi)	/* Unaligned store of XMM0 to (ESI); both are set up before this label (not visible in this chunk).  */
+	movntdq	%xmm1, (%edx)	/* movntdq needs a 16-byte aligned destination, so EDX is expected to be aligned on entry.  */
+	lea	16(%edx), %edx
+	POP (%esi)
+	lea	-0x90(%ecx), %ecx	/* 0x90 = 0x80 loop bias + 0x10, presumably for the 16 bytes stored just above -- TODO confirm.  */
+	POP (%edi)
+L(large_page_loop):
+	movdqu	(%eax), %xmm0	/* Load 128 source bytes with unaligned loads.  */
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+
+	sub	$0x80, %ecx	/* Sets CF for the jae below; the stores and lea in between do not modify EFLAGS.  */
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)	/* No borrow: at least another 128 bytes remain.  */
+	cmp	$-0x40, %ecx	/* ECX is still biased by -0x80 here, so "less than -0x40" means fewer than 64 bytes are left.  */
+	lea	0x80(%ecx), %ecx	/* Remove the bias; lea leaves the cmp flags intact for the jl.  */
+	jl	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0	/* One 64-byte step.  */
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0	/* One 32-byte step.  */
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx	/* Advance both pointers past the leftover bytes; the fwd_write handlers use negative displacements.  */
+	add	%ecx, %eax
+	sfence	/* Order the non-temporal stores before the ordinary stores of the tail.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48bytes_fwd); macro defined outside this chunk.  */
+
+
+	ALIGN (4)
+L(bk_write_44bytes):	/* Backward tail for lengths divisible by 4: EAX/EDX point at the first remaining byte; dwords are moved from the highest offset down, falling through.  */
+	movl	40(%eax), %ecx
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):	/* Nothing left to copy: only the return value remains to be set.  */
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+	movl	DEST(%esp), %eax	/* Reload the DEST stack slot (offset macro defined outside this chunk).  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx	/* mempcpy build: result is DEST plus the LEN stack slot.  */
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_45bytes):	/* Backward tail for lengths of the form 4k+1: EAX/EDX point at the first remaining byte; dwords are moved from the highest offset down, falling through.  */
+	movl	41(%eax), %ecx
+	movl	%ecx, 41(%edx)
+L(bk_write_41bytes):
+	movl	37(%eax), %ecx
+	movl	%ecx, 37(%edx)
+L(bk_write_37bytes):
+	movl	33(%eax), %ecx
+	movl	%ecx, 33(%edx)
+L(bk_write_33bytes):
+	movl	29(%eax), %ecx
+	movl	%ecx, 29(%edx)
+L(bk_write_29bytes):
+	movl	25(%eax), %ecx
+	movl	%ecx, 25(%edx)
+L(bk_write_25bytes):
+	movl	21(%eax), %ecx
+	movl	%ecx, 21(%edx)
+L(bk_write_21bytes):
+	movl	17(%eax), %ecx
+	movl	%ecx, 17(%edx)
+L(bk_write_17bytes):
+	movl	13(%eax), %ecx
+	movl	%ecx, 13(%edx)
+L(bk_write_13bytes):
+	movl	9(%eax), %ecx
+	movl	%ecx, 9(%edx)
+L(bk_write_9bytes):
+	movl	5(%eax), %ecx
+	movl	%ecx, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx	/* Lowest-addressed byte goes last.  */
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+	movl	DEST(%esp), %eax	/* Reload the DEST stack slot (offset macro defined outside this chunk).  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx	/* mempcpy build: result is DEST plus the LEN stack slot.  */
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_46bytes):	/* Backward tail for lengths of the form 4k+2: EAX/EDX point at the first remaining byte; dwords are moved from the highest offset down, falling through.  */
+	movl	42(%eax), %ecx
+	movl	%ecx, 42(%edx)
+L(bk_write_42bytes):
+	movl	38(%eax), %ecx
+	movl	%ecx, 38(%edx)
+L(bk_write_38bytes):
+	movl	34(%eax), %ecx
+	movl	%ecx, 34(%edx)
+L(bk_write_34bytes):
+	movl	30(%eax), %ecx
+	movl	%ecx, 30(%edx)
+L(bk_write_30bytes):
+	movl	26(%eax), %ecx
+	movl	%ecx, 26(%edx)
+L(bk_write_26bytes):
+	movl	22(%eax), %ecx
+	movl	%ecx, 22(%edx)
+L(bk_write_22bytes):
+	movl	18(%eax), %ecx
+	movl	%ecx, 18(%edx)
+L(bk_write_18bytes):
+	movl	14(%eax), %ecx
+	movl	%ecx, 14(%edx)
+L(bk_write_14bytes):
+	movl	10(%eax), %ecx
+	movl	%ecx, 10(%edx)
+L(bk_write_10bytes):
+	movl	6(%eax), %ecx
+	movl	%ecx, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx	/* Lowest-addressed 16-bit word goes last.  */
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+	movl	DEST(%esp), %eax	/* Reload the DEST stack slot (offset macro defined outside this chunk).  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx	/* mempcpy build: result is DEST plus the LEN stack slot.  */
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_47bytes):	/* Backward tail for lengths of the form 4k+3: EAX/EDX point at the first remaining byte; dwords are moved from the highest offset down, falling through.  */
+	movl	43(%eax), %ecx
+	movl	%ecx, 43(%edx)
+L(bk_write_43bytes):
+	movl	39(%eax), %ecx
+	movl	%ecx, 39(%edx)
+L(bk_write_39bytes):
+	movl	35(%eax), %ecx
+	movl	%ecx, 35(%edx)
+L(bk_write_35bytes):
+	movl	31(%eax), %ecx
+	movl	%ecx, 31(%edx)
+L(bk_write_31bytes):
+	movl	27(%eax), %ecx
+	movl	%ecx, 27(%edx)
+L(bk_write_27bytes):
+	movl	23(%eax), %ecx
+	movl	%ecx, 23(%edx)
+L(bk_write_23bytes):
+	movl	19(%eax), %ecx
+	movl	%ecx, 19(%edx)
+L(bk_write_19bytes):
+	movl	15(%eax), %ecx
+	movl	%ecx, 15(%edx)
+L(bk_write_15bytes):
+	movl	11(%eax), %ecx
+	movl	%ecx, 11(%edx)
+L(bk_write_11bytes):
+	movl	7(%eax), %ecx
+	movl	%ecx, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx	/* Last 3 bytes: the word at offset 1 first, then the byte at offset 0.  */
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax	/* EAX is reused as data scratch; the source pointer is not read again.  */
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY	/* When USE_AS_BCOPY is defined, EAX is not loaded with a result.  */
+	movl	DEST(%esp), %eax	/* Reload the DEST stack slot (offset macro defined outside this chunk).  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx	/* mempcpy build: result is DEST plus the LEN stack slot.  */
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):	/* Forward tail dispatch table: 4-byte entry N (N = 0..47) refers to L(fwd_write_Nbytes); JMPTBL is defined outside this chunk.  */
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):	/* Dispatch table for the palignr copy loops: 4-byte entry N (N = 0..15) refers to L(shl_N); JMPTBL is defined outside this chunk.  */
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):	/* Backward tail dispatch table: 4-byte entry N (N = 0..47) refers to L(bk_write_Nbytes); JMPTBL is defined outside this chunk.  */
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):	/* Backward copy (USE_AS_MEMMOVE builds only): on entry EAX = source, EDX = destination, ECX = byte count.  */
+	PUSH (%esi)
+	movl	%eax, %esi	/* ESI takes over as the source pointer; EAX becomes data scratch.  */
+	lea	(%ecx,%edx,1),%edx	/* EDX = one past the end of the destination.  */
+	lea	(%ecx,%esi,1),%esi	/* ESI = one past the end of the source.  */
+	testl	$0x3, %edx
+	jnz	L(bk_align)	/* Destination end is not 4-byte aligned: align it first.  */
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movl	-4(%esi), %eax	/* Eight dword moves, highest address first.  */
+	movl	%eax, -4(%edx)
+	movl	-8(%esi), %eax
+	movl	%eax, -8(%edx)
+	movl	-12(%esi), %eax
+	movl	%eax, -12(%edx)
+	movl	-16(%esi), %eax
+	movl	%eax, -16(%edx)
+	movl	-20(%esi), %eax
+	movl	%eax, -20(%edx)
+	movl	-24(%esi), %eax
+	movl	%eax, -24(%edx)
+	movl	-28(%esi), %eax
+	movl	%eax, -28(%edx)
+	movl	-32(%esi), %eax
+	movl	%eax, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %esi
+
+L(bk_write_less32bytes):
+	movl	%esi, %eax	/* Turn the end pointers into start pointers for the remaining ECX bytes:  */
+	sub	%ecx, %edx	/* the bk_write handlers address with non-negative displacements from EAX/EDX.  */
+	sub	%ecx, %eax
+	POP (%esi)
+L(bk_write_less32bytes_2):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)	/* Dispatch on ECX (scale 4) via L(table_48_bytes_bwd); macro defined outside this chunk.  */
+
+	ALIGN (4)
+L(bk_align):	/* Bring the destination end pointer (EDX) to 4-byte alignment by copying 1 and/or 2 trailing bytes.  */
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)	/* 8 bytes or fewer: skip alignment and go straight to the tail dispatch.  */
+	testl	$1, %edx
+	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %esi	/* Odd address: copy one byte.  */
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%esi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)	/* Already 4-byte aligned after the single byte.  */
+
+L(bk_got2):
+	sub	$2, %esi	/* Copy one 16-bit word to finish the alignment.  */
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%esi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	ALIGN (4)
+L(bk_write_more64bytes):	/* Backward copy of 64 bytes or more: align the destination end to 16 bytes, then move 64 bytes per pass with SSE.  */
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):
+	sub	$4, %esi	/* Copy one dword at a time (three at most) until EDX is 16-byte aligned.  */
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi	/* Third dword: no re-test needed, a 4-byte aligned address is 16-byte aligned after at most three steps.  */
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx	/* Alignment may have consumed up to 12 bytes, so re-check for a full 64.  */
+	jb	L(bk_write_more32bytes)
+
+L(bk_ssse3_cpy):
+	sub	$64, %esi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%esi), %xmm3	/* Unaligned loads from the source, aligned stores to the destination, highest block first.  */
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%esi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%esi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%esi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)	/* Fewer than 64 bytes left: finish with the 32-byte step and the tail dispatch.  */
+
+#endif
+
+END (MEMCPY)
diff --git a/libc/arch-x86/string/ssse3-strcmp.S b/libc/arch-x86/string/ssse3-strcmp.S
new file mode 100644
index 0000000..cfb2e9f
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strcmp.S
@@ -0,0 +1,2265 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)		.cfi_restore (reg)
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef USE_AS_STRNCMP
+# define STR1		4	/* %esp offset of the first string argument */
+# define STR2		STR1+4	/* %esp offset of the second string argument */
+# define RETURN		ret
+
+# define UPDATE_STRNCMP_COUNTER	/* empty: strcmp keeps no length count */
+#else
+# define STR1		8	/* 4 bytes higher than for strcmp: %ebp is pushed on entry */
+# define STR2		STR1+4
+# define CNT		STR2+4	/* %esp offset of the length limit, kept in %ebp */
+# define RETURN		POP (%ebp); ret; CFI_PUSH (%ebp)
+
+# define UPDATE_STRNCMP_COUNTER				\
+	/* deduct (16 - %ecx) from the count in %ebp */	\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, %ebp;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, %ebp
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (ssse3_strcmp_latest)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	movl	STR1(%esp), %edx
+	movl	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	cmp	$16, %ebp
+	jb	L(less16bytes_sncmp)
+	jmp	L(more16bytes)
+#endif
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	add	$8, %edx
+	add	$8, %eax
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	lea	-8(%ebp), %ebp
+	je	L(eq)
+L(more16bytes):
+#endif
+	movl	%edx, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	pxor	%xmm0, %xmm0
+	movlpd	(%eax), %xmm1
+	movlpd	(%edx), %xmm2
+	movhpd	8(%eax), %xmm1
+	movhpd	8(%edx), %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %ecx
+	sub	$0xffff, %ecx
+	jnz	L(less16bytes)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %eax
+	add	$16, %edx
+
+L(crosspage):
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+
+	movl	%edx, %edi
+	movl	%eax, %ecx
+	and	$0xf, %ecx
+	and	$0xf, %edi
+	xor	%ecx, %eax
+	xor	%edi, %edx
+	xor	%ebx, %ebx
+	cmp	%edi, %ecx
+	je	L(ashr_0)
+	ja	L(bigger)
+	or	$0x20, %ebx
+	xchg	%edx, %eax
+	xchg	%ecx, %edi
+L(bigger):
+	lea	15(%edi), %edi
+	sub	%ecx, %edi
+	cmp	$8, %edi
+	jle	L(ashr_less_8)
+	cmp	$14, %edi
+	je	L(ashr_15)
+	cmp	$13, %edi
+	je	L(ashr_14)
+	cmp	$12, %edi
+	je	L(ashr_13)
+	cmp	$11, %edi
+	je	L(ashr_12)
+	cmp	$10, %edi
+	je	L(ashr_11)
+	cmp	$9, %edi
+	je	L(ashr_10)
+L(ashr_less_8):
+	je	L(ashr_9)
+	cmp	$7, %edi
+	je	L(ashr_8)
+	cmp	$6, %edi
+	je	L(ashr_7)
+	cmp	$5, %edi
+	je	L(ashr_6)
+	cmp	$4, %edi
+	je	L(ashr_5)
+	cmp	$3, %edi
+	je	L(ashr_4)
+	cmp	$2, %edi
+	je	L(ashr_3)
+	cmp	$1, %edi
+	je	L(ashr_2)
+	cmp	$0, %edi
+	je	L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0
+ *  ecx(offset of eax)  edi(offset of edx)  relative offset  corresponding case
+ *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+L(ashr_0):
+	mov	$0xffff, %esi
+	movdqa	(%eax), %xmm1
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	(%edx), %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	mov	%ecx, %edi
+	jne	L(less32bytes)
+	UPDATE_STRNCMP_COUNTER
+	mov	$0x10, %ebx
+	mov	$0x10, %ecx
+	pxor	%xmm0, %xmm0
+	.p2align 4
+L(loop_ashr_0):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	jmp	L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+L(ashr_1):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$15, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-15(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$1, %ebx
+	lea	1(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_1):
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_1)
+
+	.p2align 4
+L(nibble_ashr_1):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffe, %esi
+	jnz	L(ashr_1_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$15, %ebp
+	jbe	L(ashr_1_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_1)
+
+	.p2align 4
+L(ashr_1_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$1, %xmm0
+	psrldq	$1, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(14~15)            n -14            1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+L(ashr_2):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-14(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$2, %ebx
+	lea	2(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_2):
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_2)
+
+	.p2align 4
+L(nibble_ashr_2):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffc, %esi
+	jnz	L(ashr_2_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$14, %ebp
+	jbe	L(ashr_2_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_2)
+
+	.p2align 4
+L(ashr_2_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$2, %xmm0
+	psrldq	$2, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(13~15)            n -13            2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+L(ashr_3):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-13(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$3, %ebx
+	lea	3(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_3):
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_3)
+
+	.p2align 4
+L(nibble_ashr_3):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff8, %esi
+	jnz	L(ashr_3_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$13, %ebp
+	jbe	L(ashr_3_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_3)
+
+	.p2align 4
+L(ashr_3_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$3, %xmm0
+	psrldq	$3, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(12~15)            n -12            3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+L(ashr_4):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-12(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$4, %ebx
+	lea	4(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_4):
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_4)
+
+	.p2align 4
+L(nibble_ashr_4):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff0, %esi
+	jnz	L(ashr_4_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$12, %ebp
+	jbe	L(ashr_4_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_4)
+
+	.p2align 4
+L(ashr_4_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$4, %xmm0
+	psrldq	$4, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(11~15)            n -11            4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+L(ashr_5):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-11(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$5, %ebx
+	lea	5(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_5):
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_5)
+
+	.p2align 4
+L(nibble_ashr_5):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffe0, %esi
+	jnz	L(ashr_5_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$11, %ebp
+	jbe	L(ashr_5_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_5)
+
+	.p2align 4
+L(ashr_5_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$5, %xmm0
+	psrldq	$5, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(10~15)            n -10            5(15 +(n-10) - n)         ashr_6
+ */
+
+	.p2align 4
+L(ashr_6):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-10(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$6, %ebx
+	lea	6(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_6):
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_6)
+
+	.p2align 4
+L(nibble_ashr_6):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffc0, %esi
+	jnz	L(ashr_6_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$10, %ebp
+	jbe	L(ashr_6_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_6)
+
+	.p2align 4
+L(ashr_6_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$6, %xmm0
+	psrldq	$6, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(9~15)            n - 9            6(15 +(n-9) - n)         ashr_7
+ */
+
+	.p2align 4
+L(ashr_7):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0		/* xmm0: NUL bytes in the %eax block */
+	pslldq	$9, %xmm2		/* line the %edx block up with the %eax block */
+	pcmpeqb	%xmm1, %xmm2		/* xmm2: bytes that match */
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi		/* drop the bytes before the string start */
+	shr	%cl, %edi
+	sub	%edi, %esi		/* non-zero: mismatch or NUL in the first block */
+	lea	-9(%ecx), %edi		/* lea leaves the flags from sub intact */
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3		/* previous %edx block, consumed by palignr */
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$7, %ebx
+	lea	7(%edx), %edi		/* page tracker: shift amount, as in every other ashr_N */
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_7):
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_7)
+
+	.p2align 4
+L(nibble_ashr_7):
+	pcmpeqb	%xmm3, %xmm0		/* NUL positions in the carried-over %edx block */
+	pmovmskb %xmm0, %esi
+	test	$0xff80, %esi		/* bytes 7..15: the ones palignr $7 would carry on */
+	jnz	L(ashr_7_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$9, %ebp
+	jbe	L(ashr_7_exittail)
+#endif
+	/* No NUL in the carried bytes: clear the mask and rearm the page tracker.  */
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_7)
+
+	.p2align 4
+L(ashr_7_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$7, %xmm0
+	psrldq	$7, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(8~15)            n - 8            7(15 +(n-8) - n)         ashr_8
+ */
+	.p2align 4
+L(ashr_8):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-8(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$8, %ebx
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_8):
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_8)
+
+	.p2align 4
+L(nibble_ashr_8):
+	pcmpeqb	%xmm3, %xmm0		/* NUL positions in the carried-over %edx block */
+	pmovmskb %xmm0, %esi
+	test	$0xff00, %esi		/* bytes 8..15: the ones palignr $8 would carry on */
+	jnz	L(ashr_8_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	jbe	L(ashr_8_exittail)
+#endif
+	/* No NUL in the carried bytes: clear the mask and rearm the page tracker.  */
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_8)
+
+	.p2align 4
+L(ashr_8_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$8, %xmm0
+	psrldq	$8, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(7~15)            n - 7            8(15 +(n-7) - n)         ashr_9
+ */
+	.p2align 4
+L(ashr_9):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-7(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$9, %ebx
+	lea	9(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_9):
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_9)
+
+	.p2align 4
+L(nibble_ashr_9):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfe00, %esi
+	jnz	L(ashr_9_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	jbe	L(ashr_9_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_9)
+
+	.p2align 4
+L(ashr_9_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$9, %xmm0
+	psrldq	$9, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(6~15)            n - 6            9(15 +(n-6) - n)         ashr_10
+ */
+	.p2align 4
+L(ashr_10):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-6(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$10, %ebx
+	lea	10(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_10):
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_10)
+
+	.p2align 4
+L(nibble_ashr_10):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfc00, %esi
+	jnz	L(ashr_10_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	jbe	L(ashr_10_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_10)
+
+	.p2align 4
+L(ashr_10_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$10, %xmm0
+	psrldq	$10, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(5~15)            n - 5            10(15 +(n-5) - n)         ashr_11
+ */
+	.p2align 4
+L(ashr_11):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-5(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$11, %ebx
+	lea	11(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_11):
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_11)
+
+	.p2align 4
+L(nibble_ashr_11):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf800, %esi
+	jnz	L(ashr_11_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	jbe	L(ashr_11_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_11)
+
+	.p2align 4
+L(ashr_11_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$11, %xmm0
+	psrldq	$11, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(4~15)            n - 4            11(15 +(n-4) - n)         ashr_12
+ */
+	.p2align 4
+L(ashr_12):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-4(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$12, %ebx
+	lea	12(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_12):
+	add	$16, %edi		/* advance the page-crossing tracker */
+	jg	L(nibble_ashr_12)	/* next %edx block lies in a new page */
+
+L(gobble_ashr_12):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */
+
+	palignr	$12, %xmm3, %xmm2	/* realign the %edx stream against %eax */
+
+	pcmpeqb	%xmm1, %xmm0		/* xmm0: NUL bytes in the %eax block */
+	pcmpeqb	%xmm2, %xmm1		/* xmm1: bytes equal in both blocks */
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi		/* zero only if all 16 match and none is NUL */
+	jnz	L(exit)
+	/* strncmp: charge these 16 bytes before advancing, so the nibble path sees it.  */
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp		/* lea keeps the flags from cmp */
+	jbe	L(more8byteseq)		/* no more than 16 bytes were left */
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_12)
+
+	.p2align 4
+L(nibble_ashr_12):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf000, %esi
+	jnz	L(ashr_12_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(ashr_12_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_12)
+
+	.p2align 4
+L(ashr_12_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$12, %xmm0
+	psrldq	$12, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of eax)  edi(offset of edx)   relative offset   	corresponding case
+ *        n(3~15)            n - 3            12(15 +(n-3) - n)         ashr_13
+ */
+	.p2align 4
+L(ashr_13):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-3(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$13, %ebx
+	lea	13(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_13):
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_13)
+
+	.p2align 4
+L(nibble_ashr_13):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xe000, %esi
+	jnz	L(ashr_13_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	jbe	L(ashr_13_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_13)
+
+	.p2align 4
+L(ashr_13_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$13, %xmm0
+	psrldq	$13, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
+ *        n(2~15)            n - 2            13(15 +(n-2) - n)         ashr_14
+ */
+	.p2align 4
+L(ashr_14):	/* Compare the first 16-byte blocks with the (%edx) block shifted left by 2 bytes, then enter the palignr $14 loop. */
+	mov	$0xffff, %esi	/* expected mask: all 16 lanes equal and non-NUL */
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, the NUL comparand */
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0	/* xmm0 lane = 0xff where xmm1 holds a NUL byte */
+	pslldq	$2, %xmm2	/* shift the (%edx) block left by 2 bytes before comparing */
+	pcmpeqb	%xmm1, %xmm2	/* lane = 0xff where the bytes match */
+	psubb	%xmm0, %xmm2	/* sign bit survives only in lanes that match and are not NUL */
+	pmovmskb %xmm2, %edi	/* edi bit i = lane i matched and is not NUL */
+	shr	%cl, %esi	/* drop the low %cl lanes from the expected mask ... */
+	shr	%cl, %edi	/* ... and from the actual mask */
+	sub	%edi, %esi	/* non-zero: a remaining lane differs or holds NUL */
+	lea	-2(%ecx), %edi	/* edi = ecx - 2; lea keeps the flags set by sub */
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER	/* macro defined outside this chunk */
+
+	movdqa	(%edx), %xmm3	/* xmm3 = previous (%edx) block, low input of palignr */
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx	/* ecx = byte offset of the next 16-byte block */
+	or	$14, %ebx	/* record the shift in ebx; L(exit) reads it back via ebx & 0x1f */
+	lea	14(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi	/* edi = ((edx + 14) & 0xfff) - 0x1000: negative distance to the next 4096-byte boundary (presumably the page size) */
+
+	.p2align 4
+L(loop_ashr_14):	/* main loop, unrolled twice; 16 bytes per step */
+	add	$16, %edi
+	jg	L(nibble_ashr_14)	/* counter went positive: take the boundary-check path first */
+
+L(gobble_ashr_14):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4	/* keep the unshifted block for the next step */
+
+	palignr	$14, %xmm3, %xmm2	/* xmm2 = top 2 bytes of xmm3 followed by the low 14 bytes of xmm2 */
+
+	pcmpeqb	%xmm1, %xmm0	/* NUL lanes of xmm1 */
+	pcmpeqb	%xmm2, %xmm1	/* matching lanes */
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi	/* zero only if all 16 lanes matched and none was NUL */
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp	/* ebp: presumably the remaining strncmp byte count -- set up outside this chunk */
+	lea	-16(%ebp), %ebp	/* ebp -= 16 without disturbing the flags from cmp */
+	jbe	L(more8byteseq)	/* ebp was <= 16: return 0 */
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3	/* current block becomes the previous block */
+
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_14)
+
+	.p2align 4
+L(nibble_ashr_14):	/* reached when the next (%edx) block lies past the 4096-byte boundary tracked in edi */
+	pcmpeqb	%xmm3, %xmm0	/* flags NUL bytes of xmm3 (xmm0 is expected to be zero here) */
+	pmovmskb %xmm0, %esi
+	test	$0xc000, %esi	/* bits 14-15: the 2 not yet compared bytes of xmm3 */
+	jnz	L(ashr_14_exittail)	/* NUL among them: finish without loading the next (%edx) block */
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	jbe	L(ashr_14_exittail)	/* ebp <= 2: also finish in the tail path */
+#endif
+	pxor	%xmm0, %xmm0	/* reset the NUL mask */
+	sub	$0x1000, %edi	/* re-arm the boundary counter */
+	jmp	L(gobble_ashr_14)
+
+	.p2align 4
+L(ashr_14_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$14, %xmm0	/* move the NUL flags of the last 2 bytes down to lanes 0-1 */
+	psrldq	$14, %xmm3	/* move the last 2 data bytes down to lanes 0-1 */
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
+ *        n(1~15)            n - 1            14(15 +(n-1) - n)         ashr_15
+ */
+
+	.p2align 4
+L(ashr_15):	/* Compare the first 16-byte blocks with the (%edx) block shifted left by 1 byte, then enter the palignr $15 loop. */
+	mov	$0xffff, %esi	/* expected mask: all 16 lanes equal and non-NUL */
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, the NUL comparand */
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0	/* xmm0 lane = 0xff where xmm1 holds a NUL byte */
+	pslldq	$1, %xmm2	/* shift the (%edx) block left by 1 byte before comparing */
+	pcmpeqb	%xmm1, %xmm2	/* lane = 0xff where the bytes match */
+	psubb	%xmm0, %xmm2	/* sign bit survives only in lanes that match and are not NUL */
+	pmovmskb %xmm2, %edi	/* edi bit i = lane i matched and is not NUL */
+	shr	%cl, %esi	/* drop the low %cl lanes from the expected mask ... */
+	shr	%cl, %edi	/* ... and from the actual mask */
+	sub	%edi, %esi	/* non-zero: a remaining lane differs or holds NUL */
+	lea	-1(%ecx), %edi	/* edi = ecx - 1; lea keeps the flags set by sub */
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER	/* macro defined outside this chunk */
+
+	movdqa	(%edx), %xmm3	/* xmm3 = previous (%edx) block, low input of palignr */
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx	/* ecx = byte offset of the next 16-byte block */
+	or	$15, %ebx	/* record the shift in ebx; L(exit) reads it back via ebx & 0x1f */
+	lea	15(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi	/* edi = ((edx + 15) & 0xfff) - 0x1000: negative distance to the next 4096-byte boundary (presumably the page size) */
+
+	.p2align 4
+L(loop_ashr_15):	/* main loop, unrolled twice; 16 bytes per step */
+	add	$16, %edi
+	jg	L(nibble_ashr_15)	/* counter went positive: take the boundary-check path first */
+
+L(gobble_ashr_15):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4	/* keep the unshifted block for the next step */
+
+	palignr	$15, %xmm3, %xmm2	/* xmm2 = top byte of xmm3 followed by the low 15 bytes of xmm2 */
+
+	pcmpeqb	%xmm1, %xmm0	/* NUL lanes of xmm1 */
+	pcmpeqb	%xmm2, %xmm1	/* matching lanes */
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi	/* zero only if all 16 lanes matched and none was NUL */
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp	/* ebp: presumably the remaining strncmp byte count -- set up outside this chunk */
+	lea	-16(%ebp), %ebp	/* ebp -= 16 without disturbing the flags from cmp */
+	jbe	L(more8byteseq)	/* ebp was <= 16: return 0 */
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3	/* current block becomes the previous block */
+
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_15)
+
+	.p2align 4
+L(nibble_ashr_15):	/* reached when the next (%edx) block lies past the 4096-byte boundary tracked in edi */
+	pcmpeqb	%xmm3, %xmm0	/* flags NUL bytes of xmm3 (xmm0 is expected to be zero here) */
+	pmovmskb %xmm0, %esi
+	test	$0x8000, %esi	/* bit 15: the single not yet compared byte of xmm3 */
+	jnz	L(ashr_15_exittail)	/* it is NUL: finish without loading the next (%edx) block */
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	jbe	L(ashr_15_exittail)	/* ebp <= 1: also finish in the tail path */
+#endif
+	pxor	%xmm0, %xmm0	/* reset the NUL mask */
+	sub	$0x1000, %edi	/* re-arm the boundary counter */
+	jmp	L(gobble_ashr_15)
+
+	.p2align 4
+L(ashr_15_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$15, %xmm0	/* move the NUL flag of the last byte down to lane 0 */
+	psrldq	$15, %xmm3	/* move the last data byte down to lane 0 */
+	jmp	L(aftertail)
+
+	.p2align 4
+L(aftertail):	/* common end of the *_exittail paths: xmm3 = leftover bytes, xmm0 = their NUL flags, xmm1 = block at (%eax, %ecx) */
+	pcmpeqb	%xmm3, %xmm1	/* lane = 0xff where the bytes match */
+	psubb	%xmm0, %xmm1	/* subtract the NUL flags passed in xmm0 */
+	pmovmskb %xmm1, %esi
+	not	%esi	/* invert so that set bits mark the lanes that stop the comparison */
+L(exit):	/* esi != 0; its lowest set bit is decoded into a byte index below */
+	mov	%ebx, %edi
+	and	$0x1f, %edi	/* low 5 bits of ebx: the shift stored by the ashr_N setup */
+	lea	-16(%edi, %ecx), %edi	/* edi = shift + ecx - 16 */
+L(less32bytes):
+	add	%edi, %edx	/* advance both pointers to the 16-byte window described by esi */
+	add	%ecx, %eax
+	test	$0x20, %ebx	/* NOTE(review): bit 5 of ebx looks like an "operands were swapped" flag set outside this chunk -- confirm */
+	jz	L(ret2)
+	xchg	%eax, %edx
+
+	.p2align 4
+L(ret2):
+	mov	%esi, %ecx	/* keep the mask in ecx; esi is restored next */
+	POP	(%esi)	/* POP is a macro defined outside this chunk */
+	POP	(%edi)
+	POP	(%ebx)
+L(less16bytes):	/* cl covers lanes 0-7 of the mask, ch covers lanes 8-15 */
+	test	%cl, %cl
+	jz	L(2next_8_bytes)	/* nothing in lanes 0-7: look at lanes 8-15 */
+
+	test	$0x01, %cl
+	jnz	L(Byte0)
+
+	test	$0x02, %cl
+	jnz	L(Byte1)
+
+	test	$0x04, %cl
+	jnz	L(Byte2)
+
+	test	$0x08, %cl
+	jnz	L(Byte3)
+
+	test	$0x10, %cl
+	jnz	L(Byte4)
+
+	test	$0x20, %cl
+	jnz	L(Byte5)
+
+	test	$0x40, %cl
+	jnz	L(Byte6)
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp	/* ebp: presumably the remaining strncmp byte count -- set up outside this chunk */
+	jbe	L(eq)	/* ebp <= 7: return 0 instead of comparing byte 7 */
+#endif
+
+	movzx	7(%eax), %ecx	/* cl is non-zero with bits 0-6 clear, so byte 7 is the one */
+	movzx	7(%edx), %eax
+
+	sub	%ecx, %eax	/* result = byte at (%edx) side minus byte at (%eax) side */
+	RETURN	/* macro defined outside this chunk */
+
+	.p2align 4
+L(Byte0):	/* L(ByteN): return the difference of the bytes at offset N, (%edx) side minus (%eax) side */
+#ifdef USE_AS_STRNCMP
+	cmp	$0, %ebp	/* ebp: presumably the remaining strncmp byte count -- set up outside this chunk */
+	jbe	L(eq)	/* ebp == 0: return 0 */
+#endif
+	movzx	(%eax), %ecx
+	movzx	(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN	/* macro defined outside this chunk */
+
+	.p2align 4
+L(Byte1):
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	jbe	L(eq)	/* ebp <= 1: return 0 instead of comparing byte 1 */
+#endif
+	movzx	1(%eax), %ecx
+	movzx	1(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte2):
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	jbe	L(eq)	/* ebp <= 2: return 0 */
+#endif
+	movzx	2(%eax), %ecx
+	movzx	2(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte3):
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	jbe	L(eq)	/* ebp <= 3: return 0 */
+#endif
+	movzx	3(%eax), %ecx
+	movzx	3(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte4):
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(eq)	/* ebp <= 4: return 0 */
+#endif
+	movzx	4(%eax), %ecx
+	movzx	4(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte5):
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	jbe	L(eq)	/* ebp <= 5: return 0 */
+#endif
+	movzx	5(%eax), %ecx
+	movzx	5(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte6):
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	jbe	L(eq)	/* ebp <= 6: return 0 */
+#endif
+	movzx	6(%eax), %ecx
+	movzx	6(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(2next_8_bytes):	/* reached from L(less16bytes) when cl == 0: decode lanes 8-15 from ch */
+	add	$8, %eax	/* step both pointers by 8 so the L(ByteN) offsets can be reused */
+	add	$8, %edx
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp	/* ebp: presumably the remaining strncmp byte count -- set up outside this chunk */
+	lea	-8(%ebp), %ebp	/* ebp -= 8 without disturbing the flags from cmp */
+	jbe	L(eq)	/* ebp was <= 8: return 0 */
+#endif
+
+	test	$0x01, %ch
+	jnz	L(Byte0)
+
+	test	$0x02, %ch
+	jnz	L(Byte1)
+
+	test	$0x04, %ch
+	jnz	L(Byte2)
+
+	test	$0x08, %ch
+	jnz	L(Byte3)
+
+	test	$0x10, %ch
+	jnz	L(Byte4)
+
+	test	$0x20, %ch
+	jnz	L(Byte5)
+
+	test	$0x40, %ch
+	jnz	L(Byte6)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	jbe	L(eq)	/* ebp <= 7: return 0 instead of comparing byte 7 */
+#endif
+	movzx	7(%eax), %ecx	/* bits 0-6 of ch were clear: fall through to byte 7 */
+	movzx	7(%edx), %eax
+
+	sub	%ecx, %eax	/* result = byte at (%edx) side minus byte at (%eax) side */
+	RETURN	/* macro defined outside this chunk */
+
+	.p2align 4
+L(neq):	/* entered by jne right after a cmpb; returns +1 or -1 from that comparison's flags */
+	mov	$1, %eax	/* mov does not modify the flags */
+	ja	L(neq_bigger)	/* unsigned "above": keep +1 */
+	neg	%eax	/* otherwise return -1 */
+L(neq_bigger):
+	RETURN	/* macro defined outside this chunk */
+
+#ifdef USE_AS_STRNCMP
+	CFI_PUSH (%ebx)	/* CFI_PUSH is a macro defined outside this chunk; presumably unwind-info bookkeeping only -- confirm */
+	CFI_PUSH (%edi)
+	CFI_PUSH (%esi)
+
+	.p2align 4
+L(more8byteseq):	/* strncmp only: the ashr_N loops jump here when the count check fails (jbe after cmp $16, %ebp); restore registers and return 0 */
+	POP	(%esi)	/* POP is a macro defined outside this chunk */
+	POP	(%edi)
+	POP	(%ebx)
+#endif
+
+L(eq):	/* equal result: return 0 */
+
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)	/* the strncmp build also restores ebp here */
+#endif
+	xorl	%eax, %eax	/* return value 0 */
+	ret
+
+#ifdef USE_AS_STRNCMP
+	CFI_PUSH (%ebp)	/* CFI_PUSH is a macro defined outside this chunk */
+
+	.p2align 4
+L(less16bytes_sncmp):	/* strncmp only: unrolled byte-by-byte compare of up to 16 bytes; stops at a mismatch, a NUL, or once %ebp bytes were compared */
+	test	%ebp, %ebp
+	jz	L(eq)	/* ebp == 0: return 0 */
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)	/* L(neq) derives the sign from the flags of this cmpb */
+	test	%cl, %cl
+	je	L(eq)	/* bytes equal and NUL: return 0 */
+
+	cmp	$1, %ebp
+	je	L(eq)	/* ebp == 1: byte 0 was the last byte to compare */
+
+	movzbl	1(%eax), %ecx	/* the same three checks repeat for bytes 1..15 */
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$2, %ebp
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$3, %ebp
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$4, %ebp
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$5, %ebp
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$6, %ebp
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$7, %ebp
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+
+	cmp	$8, %ebp
+	je	L(eq)
+
+	movzbl	8(%eax), %ecx
+	cmpb	%cl, 8(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$9, %ebp
+	je	L(eq)
+
+	movzbl	9(%eax), %ecx
+	cmpb	%cl, 9(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$10, %ebp
+	je	L(eq)
+
+	movzbl	10(%eax), %ecx
+	cmpb	%cl, 10(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$11, %ebp
+	je	L(eq)
+
+	movzbl	11(%eax), %ecx
+	cmpb	%cl, 11(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+
+	cmp	$12, %ebp
+	je	L(eq)
+
+	movzbl	12(%eax), %ecx
+	cmpb	%cl, 12(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$13, %ebp
+	je	L(eq)
+
+	movzbl	13(%eax), %ecx
+	cmpb	%cl, 13(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$14, %ebp
+	je	L(eq)
+
+	movzbl	14(%eax), %ecx
+	cmpb	%cl, 14(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$15, %ebp
+	je	L(eq)
+
+	movzbl	15(%eax), %ecx
+	cmpb	%cl, 15(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	POP	(%ebp)	/* all 16 bytes matched: restore ebp and return 0 */
+	xor	%eax, %eax
+	ret
+#endif
+
+END (ssse3_strcmp_latest)
diff --git a/libc/arch-x86/string/strcmp_wrapper.S b/libc/arch-x86/string/strcmp_wrapper.S
new file mode 100644
index 0000000..69b7f0b
--- /dev/null
+++ b/libc/arch-x86/string/strcmp_wrapper.S
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(USE_SSSE3)	/* USE_SSSE3 is not defined in this file; presumably supplied by the build flags -- confirm */
+
+# define ssse3_strcmp_latest strcmp	/* ssse3-strcmp.S ends with END (ssse3_strcmp_latest), so its routine is emitted as strcmp */
+# include "ssse3-strcmp.S"
+
+#else	/* USE_SSSE3 not defined: use strcmp.S instead */
+
+# include "strcmp.S"
+
+#endif
diff --git a/libc/arch-x86/string/strncmp_wrapper.S b/libc/arch-x86/string/strncmp_wrapper.S
new file mode 100644
index 0000000..2050184
--- /dev/null
+++ b/libc/arch-x86/string/strncmp_wrapper.S
@@ -0,0 +1,42 @@
+/*
+Copyright (c) 2010, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(USE_SSSE3)	/* USE_SSSE3 is not defined in this file; presumably supplied by the build flags -- confirm */
+
+# define USE_AS_STRNCMP	/* enables the #ifdef USE_AS_STRNCMP sections of ssse3-strcmp.S */
+# define ssse3_strcmp_latest strncmp	/* ssse3-strcmp.S ends with END (ssse3_strcmp_latest), so its routine is emitted as strncmp */
+# include "ssse3-strcmp.S"
+
+#else	/* USE_SSSE3 not defined: use strncmp.S instead */
+
+# include "strncmp.S"
+
+#endif
+
diff --git a/libc/bionic/malloc_leak.c b/libc/bionic/malloc_leak.c
index 305f954..b21bc6a 100644
--- a/libc/bionic/malloc_leak.c
+++ b/libc/bionic/malloc_leak.c
@@ -198,7 +198,7 @@
     // debug_log("info = %p\n", info);
     if (*info == NULL) {
         *overallSize = 0;
-        goto done;
+        goto out_nomem_info;
     }
 
     // debug_log("sorting list...\n");
@@ -211,8 +211,7 @@
         size_t entrySize = (sizeof(size_t) * 2) + (sizeof(intptr_t) * entry->numEntries);
         if (entrySize < *infoSize) {
             /* we're writing less than a full entry, clear out the rest */
-            /* TODO: only clear out the part we're not overwriting? */
-            memset(head, 0, *infoSize);
+            memset(head + entrySize, 0, *infoSize - entrySize);
         } else {
             /* make sure the amount we're copying doesn't exceed the limit */
             entrySize = *infoSize;
@@ -221,6 +220,7 @@
         head += *infoSize;
     }
 
+out_nomem_info:
     dlfree(list);
 
 done:
@@ -323,6 +323,8 @@
     } else {
         // create a new entry
         entry = (HashEntry*)dlmalloc(sizeof(HashEntry) + numEntries*sizeof(intptr_t));
+        if (!entry)
+            return NULL;
         entry->allocations = 1;
         entry->slot = slot;
         entry->prev = NULL;
@@ -665,8 +667,9 @@
     }
 
     if (new_buffer) {
-        size_t size = (bytes < old_bytes)?(bytes):(old_bytes);
-        memcpy(new_buffer, mem, size);
+        if (bytes > old_bytes)
+            bytes = old_bytes;
+        memcpy(new_buffer, mem, bytes);
         chk_free(mem);
     }
 
diff --git a/libc/bionic/pthread.c b/libc/bionic/pthread.c
index 8171aac..7d4056d 100644
--- a/libc/bionic/pthread.c
+++ b/libc/bionic/pthread.c
@@ -597,13 +597,12 @@
 
     for (thread = gThreadList; thread != NULL; thread = thread->next)
         if (thread == (pthread_internal_t*)thid)
-            break;
+            goto FoundIt;
 
-    if (!thread) {
-        pthread_mutex_unlock(&gThreadListLock);
-        return ESRCH;
-    }
+    pthread_mutex_unlock(&gThreadListLock);
+    return ESRCH;
 
+FoundIt:
     if (thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) {
         pthread_mutex_unlock(&gThreadListLock);
         return EINVAL;
diff --git a/libc/bionic/pututline.c b/libc/bionic/pututline.c
index 2449068..c8427f7 100644
--- a/libc/bionic/pututline.c
+++ b/libc/bionic/pututline.c
@@ -34,7 +34,7 @@
 {
     FILE* f;
     struct utmp u;
-    int i;
+    long i;
 
     if (!(f = fopen(_PATH_UTMP, "w+")))
         return;
diff --git a/libc/bionic/ssp.c b/libc/bionic/ssp.c
index 20794f4..f83b2a4 100644
--- a/libc/bionic/ssp.c
+++ b/libc/bionic/ssp.c
@@ -76,9 +76,9 @@
     sigprocmask(SIG_BLOCK, &sigmask, NULL);
 
     /* Use /proc/self/exe link to obtain the program name for logging
-     * purposes. If it's not available, we set it to "unknown" */
+     * purposes. If it's not available, we set it to "<unknown>" */
     if ((count = readlink("/proc/self/exe", path, sizeof(path) - 1)) == -1) {
-        strlcpy(path, "unknown", sizeof(path));
+        strlcpy(path, "<unknown>", sizeof(path));
     } else {
         path[count] = '\0';
     }
diff --git a/libc/include/dlfcn.h b/libc/include/dlfcn.h
index 9582796..f84d1d1 100644
--- a/libc/include/dlfcn.h
+++ b/libc/include/dlfcn.h
@@ -32,10 +32,22 @@
 
 __BEGIN_DECLS
 
+typedef struct {
+    const char *dli_fname;  /* Pathname of shared object that
+                               contains address */
+    void       *dli_fbase;  /* Address at which shared object
+                               is loaded */
+    const char *dli_sname;  /* Name of nearest symbol with address
+                               lower than addr */
+    void       *dli_saddr;  /* Exact address of symbol named
+                               in dli_sname */
+} Dl_info;
+
 extern void*        dlopen(const char*  filename, int flag);
 extern int          dlclose(void*  handle);
 extern const char*  dlerror(void);
 extern void*        dlsym(void*  handle, const char*  symbol);
+extern int          dladdr(void* addr, Dl_info *info);
 
 enum {
   RTLD_NOW  = 0,
diff --git a/libc/kernel/common/linux/a1026.h b/libc/kernel/common/linux/a1026.h
new file mode 100644
index 0000000..2bf6190
--- /dev/null
+++ b/libc/kernel/common/linux/a1026.h
@@ -0,0 +1,67 @@
+/****************************************************************************
+ ****************************************************************************
+ ***
+ ***   This header was automatically generated from a Linux kernel header
+ ***   of the same name, to make information necessary for userspace to
+ ***   call into the kernel available to libc.  It contains only constants,
+ ***   structures, and macros generated from the original header, and thus,
+ ***   contains no copyrightable information.
+ ***
+ ****************************************************************************
+ ****************************************************************************/
+#ifndef __LINUX_A1026_H
+#define __LINUX_A1026_H
+
+#include <linux/ioctl.h>
+
+#define A1026_MAX_FW_SIZE (32*1024)
+struct a1026img {
+ unsigned char *buf;
+ unsigned img_size;
+};
+
+enum A1026_PathID {
+ A1026_PATH_SUSPEND,
+ A1026_PATH_INCALL_RECEIVER,
+ A1026_PATH_INCALL_HEADSET,
+ A1026_PATH_INCALL_SPEAKER,
+ A1026_PATH_INCALL_BT,
+ A1026_PATH_VR_NO_NS_RECEIVER,
+ A1026_PATH_VR_NO_NS_HEADSET,
+ A1026_PATH_VR_NO_NS_SPEAKER,
+ A1026_PATH_VR_NO_NS_BT,
+ A1026_PATH_VR_NS_RECEIVER,
+ A1026_PATH_VR_NS_HEADSET,
+ A1026_PATH_VR_NS_SPEAKER,
+ A1026_PATH_VR_NS_BT,
+ A1026_PATH_RECORD_RECEIVER,
+ A1026_PATH_RECORD_HEADSET,
+ A1026_PATH_RECORD_SPEAKER,
+ A1026_PATH_RECORD_BT,
+ A1026_PATH_CAMCORDER,
+ A1026_PATH_INCALL_TTY
+};
+
+enum A1026_NS_states {
+ A1026_NS_STATE_AUTO,
+ A1026_NS_STATE_OFF,
+ A1026_NS_STATE_CT,
+ A1026_NS_STATE_FT,
+ A1026_NS_NUM_STATES
+};
+
+#define A1026_IOCTL_MAGIC 'u'
+
+#define A1026_BOOTUP_INIT _IOW(A1026_IOCTL_MAGIC, 0x01, struct a1026img *)
+#define A1026_SET_CONFIG _IOW(A1026_IOCTL_MAGIC, 0x02, enum A1026_PathID)
+#define A1026_SET_NS_STATE _IOW(A1026_IOCTL_MAGIC, 0x03, enum A1026_NS_states)
+
+#define A1026_SET_MIC_ONOFF _IOW(A1026_IOCTL_MAGIC, 0x50, unsigned)
+#define A1026_SET_MICSEL_ONOFF _IOW(A1026_IOCTL_MAGIC, 0x51, unsigned)
+#define A1026_READ_DATA _IOR(A1026_IOCTL_MAGIC, 0x52, unsigned)
+#define A1026_WRITE_MSG _IOW(A1026_IOCTL_MAGIC, 0x53, unsigned)
+#define A1026_SYNC_CMD _IO(A1026_IOCTL_MAGIC, 0x54)
+#define A1026_SET_CMD_FILE _IOW(A1026_IOCTL_MAGIC, 0x55, unsigned)
+
+#endif
+
diff --git a/libc/kernel/common/linux/msm_kgsl.h b/libc/kernel/common/linux/msm_kgsl.h
index d717e57..740ba60 100644
--- a/libc/kernel/common/linux/msm_kgsl.h
+++ b/libc/kernel/common/linux/msm_kgsl.h
@@ -139,6 +139,8 @@
 struct kgsl_sharedmem_from_pmem {
  int pmem_fd;
  unsigned int gpuaddr;
+ unsigned int len;
+ unsigned int offset;
 };
 
 #define IOCTL_KGSL_SHAREDMEM_FROM_PMEM   _IOWR(KGSL_IOC_TYPE, 0x20, struct kgsl_sharedmem_from_pmem)
@@ -188,5 +190,12 @@
 
 #define IOCTL_KGSL_SHAREDMEM_FLUSH_CACHE   _IOW(KGSL_IOC_TYPE, 0x24, struct kgsl_sharedmem_free)
 
+struct kgsl_drawctxt_set_bin_base_offset {
+ unsigned int drawctxt_id;
+ unsigned int offset;
+};
+
+#define IOCTL_KGSL_DRAWCTXT_SET_BIN_BASE_OFFSET   _IOW(KGSL_IOC_TYPE, 0x25, struct kgsl_drawctxt_set_bin_base_offset)
+
 #endif
 
diff --git a/libc/kernel/common/linux/tpa2018d1.h b/libc/kernel/common/linux/tpa2018d1.h
new file mode 100644
index 0000000..4ae31fc
--- /dev/null
+++ b/libc/kernel/common/linux/tpa2018d1.h
@@ -0,0 +1,33 @@
+/****************************************************************************
+ ****************************************************************************
+ ***
+ ***   This header was automatically generated from a Linux kernel header
+ ***   of the same name, to make information necessary for userspace to
+ ***   call into the kernel available to libc.  It contains only constants,
+ ***   structures, and macros generated from the original header, and thus,
+ ***   contains no copyrightable information.
+ ***
+ ****************************************************************************
+ ****************************************************************************/
+#ifndef _LINUX_TPA2018D1_H
+#define _LINUX_TPA2018D1_H
+
+#include <linux/ioctl.h>
+
+enum tpa2018d1_mode {
+ TPA2018_MODE_OFF,
+ TPA2018_MODE_PLAYBACK,
+ TPA2018_MODE_RINGTONE,
+ TPA2018_MODE_VOICE_CALL,
+ TPA2018_NUM_MODES,
+};
+
+#define TPA2018_IOCTL_MAGIC 'a'
+#define TPA2018_SET_CONFIG _IOW(TPA2018_IOCTL_MAGIC, 1, unsigned)
+#define TPA2018_READ_CONFIG _IOR(TPA2018_IOCTL_MAGIC, 2, unsigned)
+#define TPA2018_SET_PARAM _IOW(TPA2018_IOCTL_MAGIC, 3, unsigned)
+#define TPA2018_SET_MODE _IOW(TPA2018_IOCTL_MAGIC, 4, unsigned)
+
+#endif
+
+
diff --git a/libc/stdio/asprintf.c b/libc/stdio/asprintf.c
index 7379140..1257c7f 100644
--- a/libc/stdio/asprintf.c
+++ b/libc/stdio/asprintf.c
@@ -39,7 +39,7 @@
 	f._bf._size = f._w = 127;		/* Leave room for the NUL */
 	va_start(ap, fmt);
 	ret = vfprintf(&f, fmt, ap);
-  va_end(ap);
+	va_end(ap);
 	if (ret == -1)
 		goto err;
 	*f._p = '\0';
@@ -50,10 +50,7 @@
 	return (ret);
 
 err:
-	if (f._bf._base) {
-		free(f._bf._base);
-		f._bf._base = NULL;
-	}
+	free(f._bf._base);
 	*str = NULL;
 	errno = ENOMEM;
 	return (-1);
diff --git a/libc/stdio/vasprintf.c b/libc/stdio/vasprintf.c
index c3280c9..54c46b3 100644
--- a/libc/stdio/vasprintf.c
+++ b/libc/stdio/vasprintf.c
@@ -48,10 +48,7 @@
 	return (ret);
 
 err:
-	if (f._bf._base) {
-		free(f._bf._base);
-		f._bf._base = NULL;
-	}
+	free(f._bf._base);
 	*str = NULL;
 	errno = ENOMEM;
 	return (-1);
diff --git a/libc/stdlib/assert.c b/libc/stdlib/assert.c
index b439d8e..816b050 100644
--- a/libc/stdlib/assert.c
+++ b/libc/stdlib/assert.c
@@ -49,6 +49,6 @@
 	(void)fprintf(stderr,
 	    "assertion \"%s\" failed: file \"%s\", line %d, function \"%s\"\n",
 	    failedexpr, file, line, func);
-  abort();
+	abort();
 	/* NOTREACHED */
 }
diff --git a/libc/stdlib/bsearch.c b/libc/stdlib/bsearch.c
index 8193d27..7eb6325 100644
--- a/libc/stdlib/bsearch.c
+++ b/libc/stdlib/bsearch.c
@@ -56,11 +56,11 @@
 	for (lim = nmemb; lim != 0; lim >>= 1) {
 		p = base + (lim >> 1) * size;
 		cmp = (*compar)(key, p);
-		if (cmp == 0)
-			return ((void *)p);
 		if (cmp > 0) {	/* key > p: move right */
 			base = (char *)p + size;
 			lim--;
+		} else if (cmp == 0) {
+			return ((void *)p);
 		} /* else move left */
 	}
 	return (NULL);
diff --git a/libc/stdlib/ctype_.c b/libc/stdlib/ctype_.c
index f59af3e..cf32f16 100644
--- a/libc/stdlib/ctype_.c
+++ b/libc/stdlib/ctype_.c
@@ -53,7 +53,7 @@
 	_P,	_L|_X,	_L|_X,	_L|_X,	_L|_X,	_L|_X,	_L|_X,	_L,
 	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L,
 	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L,
-/* determine printability based on the IS0 8859 8-bit standard */
+	/* determine printability based on the ISO 8859 8-bit standard */
 	_L,	_L,	_L,	_P,	_P,	_P,	_P,	_C,
 
 	_C,	_C,	_C,	_C,	_C,	_C,	_C,	_C, /* 80 */
diff --git a/libc/stdlib/getenv.c b/libc/stdlib/getenv.c
index 13abe30..72367b3 100644
--- a/libc/stdlib/getenv.c
+++ b/libc/stdlib/getenv.c
@@ -62,8 +62,8 @@
 		if (i == 0 && *cp++ == '=') {
 			*offset = p - environ;
 			return (cp);
-    }
-  }
+		}
+	}
 	return (NULL);
 }
 
diff --git a/libc/stdlib/putenv.c b/libc/stdlib/putenv.c
index c3bedae..54482f6 100644
--- a/libc/stdlib/putenv.c
+++ b/libc/stdlib/putenv.c
@@ -42,7 +42,7 @@
 	if ((equal = strchr(p, '=')) == NULL) {
 		(void)free(p);
 		return (-1);
-  }
+	}
 	*equal = '\0';
 	rval = setenv(p, equal + 1, 1);
 	(void)free(p);
diff --git a/libc/stdlib/qsort.c b/libc/stdlib/qsort.c
index cd66961..f6fc8e1 100644
--- a/libc/stdlib/qsort.c
+++ b/libc/stdlib/qsort.c
@@ -39,11 +39,11 @@
 /*
  * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
  */
-#define swapcode(TYPE, parmi, parmj, n) { 		\
-	long i = (n) / sizeof (TYPE); 			\
-	TYPE *pi = (TYPE *) (parmi); 			\
-	TYPE *pj = (TYPE *) (parmj); 			\
-	do { 						\
+#define swapcode(TYPE, parmi, parmj, n) {		\
+	long i = (n) / sizeof (TYPE);			\
+	TYPE *pi = (TYPE *) (parmi);			\
+	TYPE *pj = (TYPE *) (parmj);			\
+	do {						\
 		TYPE	t = *pi;			\
 		*pi++ = *pj;				\
 		*pj++ = t;				\
@@ -56,7 +56,7 @@
 static __inline void
 swapfunc(char *a, char *b, int n, int swaptype)
 {
-	if (swaptype <= 1) 
+	if (swaptype <= 1)
 		swapcode(long, a, b, n)
 	else
 		swapcode(char, a, b, n)
@@ -70,7 +70,7 @@
 	} else						\
 		swapfunc(a, b, es, swaptype)
 
-#define vecswap(a, b, n) 	if ((n) > 0) swapfunc(a, b, n, swaptype)
+#define vecswap(a, b, n)	if ((n) > 0) swapfunc(a, b, n, swaptype)
 
 static __inline char *
 med3(char *a, char *b, char *c, int (*cmp)(const void *, const void *))
@@ -110,7 +110,7 @@
 	}
 	swap(a, pm);
 	pa = pb = (char *)a + es;
-    
+
 	pc = pd = (char *)a + (n - 1) * es;
 	for (;;) {
 		while (pb <= pc && (r = cmp(pb, a)) <= 0) {
@@ -118,7 +118,7 @@
 				swap_cnt = 1;
 				swap(pa, pb);
 				pa += es;
-      }
+			}
 			pb += es;
 		}
 		while (pb <= pc && (r = cmp(pc, a)) >= 0) {
@@ -138,11 +138,11 @@
 	}
 	if (swap_cnt == 0) {  /* Switch to insertion sort */
 		for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
-			for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; 
+			for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
 			     pl -= es)
 				swap(pl, pl - es);
 		return;
-    }
+	}
 
 	pn = (char *)a + n * es;
 	r = min(pa - (char *)a, pb - pa);
@@ -151,11 +151,11 @@
 	vecswap(pb, pn - r, r);
 	if ((r = pb - pa) > (int)es)
 		qsort(a, r / es, es, cmp);
-	if ((r = pd - pc) > (int)es) { 
+	if ((r = pd - pc) > (int)es) {
 		/* Iterate rather than recurse to save stack space */
 		a = pn - r;
 		n = r / es;
 		goto loop;
 	}
-/*		qsort(pn - r, r / es, es, cmp);*/
+	/* qsort(pn - r, r / es, es, cmp); */
 }
diff --git a/libc/stdlib/seed48.c b/libc/stdlib/seed48.c
index afd5f54..583262f 100644
--- a/libc/stdlib/seed48.c
+++ b/libc/stdlib/seed48.c
@@ -22,7 +22,7 @@
 seed48(unsigned short xseed[3])
 {
 	static unsigned short sseed[3];
-  
+
 	sseed[0] = __rand48_seed[0];
 	sseed[1] = __rand48_seed[1];
 	sseed[2] = __rand48_seed[2];
diff --git a/libc/stdlib/sha1hash.c b/libc/stdlib/sha1hash.c
index 28e3399..1c7aaf3 100644
--- a/libc/stdlib/sha1hash.c
+++ b/libc/stdlib/sha1hash.c
@@ -4,7 +4,7 @@
 100% Public Domain
 
 -----------------
-Modified 7/98 
+Modified 7/98
 By James H. Brown <jbrown@burgoyne.com>
 Still 100% Public Domain
 
@@ -26,7 +26,7 @@
 be guaranteed to generate the wrong hash (e.g. Test Vector #3, a million
 "a"s).
 
-I also changed the declaration of variables i & j in SHA1Update to 
+I also changed the declaration of variables i & j in SHA1Update to
 unsigned long from unsigned int for the same reason.
 
 These changes should make no difference to any 32 bit implementations since
@@ -53,7 +53,7 @@
 Modified 4/01
 By Saul Kravitz <Saul.Kravitz@celera.com>
 Still 100% PD
-Modified to run on Compaq Alpha hardware.  
+Modified to run on Compaq Alpha hardware.
 
 -----------------
 Modified 2/03
@@ -116,7 +116,7 @@
 void SHAPrintContext(SHA1_CTX *context, char *msg){
   printf("%s (%d,%d) %x %x %x %x %x\n",
 	 msg,
-	 context->count[0], context->count[1], 
+	 context->count[0], context->count[1],
 	 context->state[0],
 	 context->state[1],
 	 context->state[2],
@@ -238,8 +238,7 @@
     while ((context->count[0] & 504) != 448) {
         SHA1Update(context, (unsigned char *)"\0", 1);
     }
-    SHA1Update(context, finalcount, 8);  /* Should cause a SHA1Transform()
-*/
+    SHA1Update(context, finalcount, 8);  /* Should cause a SHA1Transform() */
     for (i = 0; i < 20; i++) {
         digest[i] = (unsigned char)
          ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255);
@@ -254,7 +253,7 @@
     SHA1Transform(context->state, context->buffer);
 #endif
 }
-  
+
 /*************************************************************/
 
 /* This is not quite the MIME base64 algorithm: it uses _ instead of /,
@@ -302,7 +301,7 @@
       fputs("Unable to open file.", stderr);
       return(-1);
     }
-  } 
+  }
   SHA1Init(&context);
   while (!feof(file)) {  /* note: what if ferror(file) */
     i = fread(buffer, 1, 16384, file);
diff --git a/libc/stdlib/strtod.c b/libc/stdlib/strtod.c
index 7fb7112..d2582b1 100644
--- a/libc/stdlib/strtod.c
+++ b/libc/stdlib/strtod.c
@@ -364,7 +364,7 @@
 	struct Bigint *next;
 	int k, maxwds, sign, wds;
 	ULong x[1];
-	};
+};
 
  typedef struct Bigint Bigint;
 
@@ -393,19 +393,19 @@
 
 	if ((rv = freelist[k]) != NULL) {
 		freelist[k] = rv->next;
-		}
+	}
 	else {
 		x = 1 << k;
 		rv = (Bigint *)MALLOC(sizeof(Bigint) + (x-1)*sizeof(Long));
 		rv->k = k;
 		rv->maxwds = x;
-		}
+	}
 	rv->sign = rv->wds = 0;
 
 	mutex_unlock(&freelist_mutex);
 
 	return rv;
-	}
+}
 
  static void
 Bfree
@@ -422,8 +422,8 @@
 		freelist[v->k] = v;
 
 		mutex_unlock(&freelist_mutex);
-		}
 	}
+}
 
 #define Bcopy(x,y) memcpy(&x->sign, &y->sign, \
     y->wds*sizeof(Long) + 2*sizeof(int))
@@ -458,8 +458,8 @@
 		a = (int)(y >> 16);
 		*x++ = y & 0xffff;
 #endif
-		}
-		while(++i < wds);
+	}
+	while(++i < wds);
 	if (a) {
 		if (wds >= b->maxwds) {
 			b1 = Balloc(b->k+1);
@@ -469,9 +469,9 @@
 			}
 		b->x[wds++] = a;
 		b->wds = wds;
-		}
-	return b;
 	}
+	return b;
+}
 
  static Bigint *
 s2b
@@ -503,13 +503,13 @@
 		do b = multadd(b, 10, *s++ - '0');
 			while(++i < nd0);
 		s++;
-		}
+	}
 	else
 		s += 10;
 	for(; i < nd; i++)
 		b = multadd(b, 10, *s++ - '0');
 	return b;
-	}
+}
 
  static int
 hi0bits
@@ -524,26 +524,26 @@
 	if (!(x & 0xffff0000)) {
 		k = 16;
 		x <<= 16;
-		}
+	}
 	if (!(x & 0xff000000)) {
 		k += 8;
 		x <<= 8;
-		}
+	}
 	if (!(x & 0xf0000000)) {
 		k += 4;
 		x <<= 4;
-		}
+	}
 	if (!(x & 0xc0000000)) {
 		k += 2;
 		x <<= 2;
-		}
+	}
 	if (!(x & 0x80000000)) {
 		k++;
 		if (!(x & 0x40000000))
 			return 32;
-		}
-	return k;
 	}
+	return k;
+}
 
  static int
 lo0bits
@@ -565,33 +565,33 @@
 			}
 		*y = x >> 2;
 		return 2;
-		}
+	}
 	k = 0;
 	if (!(x & 0xffff)) {
 		k = 16;
 		x >>= 16;
-		}
+	}
 	if (!(x & 0xff)) {
 		k += 8;
 		x >>= 8;
-		}
+	}
 	if (!(x & 0xf)) {
 		k += 4;
 		x >>= 4;
-		}
+	}
 	if (!(x & 0x3)) {
 		k += 2;
 		x >>= 2;
-		}
+	}
 	if (!(x & 1)) {
 		k++;
 		x >>= 1;
 		if (!x & 1)
 			return 32;
-		}
+	}
 	*y = x;
 	return k;
-	}
+}
 
  static Bigint *
 i2b
@@ -607,7 +607,7 @@
 	b->x[0] = i;
 	b->wds = 1;
 	return b;
-	}
+}
 
  static Bigint *
 mult
@@ -629,7 +629,7 @@
 		c = a;
 		a = b;
 		b = c;
-		}
+	}
 	k = a->k;
 	wa = a->wds;
 	wb = b->wds;
@@ -656,10 +656,10 @@
 				z2 = (*x++ >> 16) * y + (*xc >> 16) + carry;
 				carry = z2 >> 16;
 				Storeinc(xc, z2, z);
-				}
-				while(x < xae);
-			*xc = carry;
 			}
+			while(x < xae);
+			*xc = carry;
+		}
 		if ((y = *xb >> 16) != 0) {
 			x = xa;
 			xc = xc0;
@@ -671,11 +671,11 @@
 				Storeinc(xc, z, z2);
 				z2 = (*x++ >> 16) * y + (*xc & 0xffff) + carry;
 				carry = z2 >> 16;
-				}
-				while(x < xae);
-			*xc = z2;
 			}
+			while(x < xae);
+			*xc = z2;
 		}
+	}
 #else
 	for(; xb < xbe; xc0++) {
 		if (y = *xb++) {
@@ -686,16 +686,16 @@
 				z = *x++ * y + *xc + carry;
 				carry = z >> 16;
 				*xc++ = z & 0xffff;
-				}
-				while(x < xae);
-			*xc = carry;
 			}
+			while(x < xae);
+			*xc = carry;
 		}
+	}
 #endif
 	for(xc0 = c->x, xc = xc0 + wc; wc > 0 && !*--xc; --wc) ;
 	c->wds = wc;
 	return c;
-	}
+}
 
  static Bigint *p5s;
 
@@ -720,23 +720,23 @@
 		/* first time */
 		p5 = p5s = i2b(625);
 		p5->next = 0;
-		}
+	}
 	for(;;) {
 		if (k & 1) {
 			b1 = mult(b, p5);
 			Bfree(b);
 			b = b1;
-			}
+		}
 		if (!(k = (unsigned int) k >> 1))
 			break;
 		if (!(p51 = p5->next)) {
 			p51 = p5->next = mult(p5,p5);
 			p51->next = 0;
-			}
-		p5 = p51;
 		}
-	return b;
+		p5 = p51;
 	}
+	return b;
+}
 
  static Bigint *
 lshift
@@ -772,11 +772,11 @@
 		do {
 			*x1++ = *x << k | z;
 			z = *x++ >> k1;
-			}
-			while(x < xe);
+		}
+		while(x < xe);
 		if ((*x1 = z) != 0)
 			++n1;
-		}
+	}
 #else
 	if (k &= 0xf) {
 		k1 = 16 - k;
@@ -784,11 +784,11 @@
 		do {
 			*x1++ = *x << k  & 0xffff | z;
 			z = *x++ >> k1;
-			}
-			while(x < xe);
+		}
+		while(x < xe);
 		if (*x1 = z)
 			++n1;
-		}
+	}
 #endif
 	else do
 		*x1++ = *x++;
@@ -796,7 +796,7 @@
 	b1->wds = n1 - 1;
 	Bfree(b);
 	return b1;
-	}
+}
 
  static int
 cmp
@@ -828,9 +828,9 @@
 			return *xa < *xb ? -1 : 1;
 		if (xa <= xa0)
 			break;
-		}
-	return 0;
 	}
+	return 0;
+}
 
  static Bigint *
 diff
@@ -854,13 +854,13 @@
 		c->wds = 1;
 		c->x[0] = 0;
 		return c;
-		}
+	}
 	if (i < 0) {
 		c = a;
 		a = b;
 		b = c;
 		i = 1;
-		}
+	}
 	else
 		i = 0;
 	c = Balloc(a->k);
@@ -882,8 +882,8 @@
 		borrow = (ULong)z >> 16;
 		Sign_Extend(borrow, z);
 		Storeinc(xc, z, y);
-		}
-		while(xb < xbe);
+	}
+	while(xb < xbe);
 	while(xa < xae) {
 		y = (*xa & 0xffff) + borrow;
 		borrow = (ULong)y >> 16;
@@ -892,27 +892,27 @@
 		borrow = (ULong)z >> 16;
 		Sign_Extend(borrow, z);
 		Storeinc(xc, z, y);
-		}
+	}
 #else
 	do {
 		y = *xa++ - *xb++ + borrow;
 		borrow = y >> 16;
 		Sign_Extend(borrow, y);
 		*xc++ = y & 0xffff;
-		}
-		while(xb < xbe);
+	}
+	while(xb < xbe);
 	while(xa < xae) {
 		y = *xa++ + borrow;
 		borrow = y >> 16;
 		Sign_Extend(borrow, y);
 		*xc++ = y & 0xffff;
-		}
+	}
 #endif
 	while(!*--xc)
 		wa--;
 	c->wds = wa;
 	return c;
-	}
+}
 
  static double
 ulp
@@ -937,22 +937,22 @@
 		word0(a) = L;
 		word1(a) = 0;
 #ifndef Sudden_Underflow
-		}
+	}
 	else {
 		L = (ULong)-L >> Exp_shift;
 		if (L < Exp_shift) {
 			word0(a) = 0x80000 >> L;
 			word1(a) = 0;
-			}
+		}
 		else {
 			word0(a) = 0;
 			L -= Exp_shift;
 			word1(a) = L >= 31 ? 1 : 1 << (31 - L);
-			}
 		}
+	}
 #endif
 	return value(a);
-	}
+}
 
  static double
 b2d
@@ -986,17 +986,17 @@
 		w = xa > xa0 ? *--xa : 0;
 		d1 = y << ((32-Ebits) + k) | w >> (Ebits - k);
 		goto ret_d;
-		}
+	}
 	z = xa > xa0 ? *--xa : 0;
 	if (k -= Ebits) {
 		d0 = Exp_1 | y << k | z >> (32 - k);
 		y = xa > xa0 ? *--xa : 0;
 		d1 = z << k | y >> (32 - k);
-		}
+	}
 	else {
 		d0 = Exp_1 | y;
 		d1 = z;
-		}
+	}
 #else
 	if (k < Ebits + 16) {
 		z = xa > xa0 ? *--xa : 0;
@@ -1005,7 +1005,7 @@
 		y = xa > xa0 ? *--xa : 0;
 		d1 = z << k + 16 - Ebits | w << k - Ebits | y >> 16 + Ebits - k;
 		goto ret_d;
-		}
+	}
 	z = xa > xa0 ? *--xa : 0;
 	w = xa > xa0 ? *--xa : 0;
 	k -= Ebits + 16;
@@ -1022,7 +1022,7 @@
 #undef d1
 #endif
 	return value(d);
-	}
+}
 
  static Bigint *
 d2b
@@ -1072,11 +1072,11 @@
 		if ((k = lo0bits(&y)) != 0) {
 			x[0] = y | z << (32 - k);
 			z >>= k;
-			}
+		}
 		else
 			x[0] = y;
 		i = b->wds = (x[1] = z) ? 2 : 1;
-		}
+	}
 	else {
 #ifdef DEBUG
 		if (!z)
@@ -1086,7 +1086,7 @@
 		x[0] = z;
 		i = b->wds = 1;
 		k += 32;
-		}
+	}
 #else
 	if (y = d1) {
 		if (k = lo0bits(&y))
@@ -1095,22 +1095,22 @@
 				x[1] = z >> k - 16 & 0xffff;
 				x[2] = z >> k;
 				i = 2;
-				}
+			}
 			else {
 				x[0] = y & 0xffff;
 				x[1] = y >> 16 | z << 16 - k & 0xffff;
 				x[2] = z >> k & 0xffff;
 				x[3] = z >> k+16;
 				i = 3;
-				}
+			}
 		else {
 			x[0] = y & 0xffff;
 			x[1] = y >> 16;
 			x[2] = z & 0xffff;
 			x[3] = z >> 16;
 			i = 3;
-			}
 		}
+	}
 	else {
 #ifdef DEBUG
 		if (!z)
@@ -1120,14 +1120,14 @@
 		if (k >= 16) {
 			x[0] = z;
 			i = 0;
-			}
+		}
 		else {
 			x[0] = z & 0xffff;
 			x[1] = z >> 16;
 			i = 1;
-			}
-		k += 32;
 		}
+		k += 32;
+	}
 	while(!x[i])
 		--i;
 	b->wds = i + 1;
@@ -1143,7 +1143,7 @@
 		*bits = P - k;
 #endif
 #ifndef Sudden_Underflow
-		}
+	}
 	else {
 		*e = de - Bias - (P-1) + 1 + k;
 #ifdef Pack_32
@@ -1154,7 +1154,7 @@
 		}
 #endif
 	return b;
-	}
+}
 #undef d0
 #undef d1
 
@@ -1181,23 +1181,23 @@
 		word0(da) += (k >> 2)*Exp_msk1;
 		if (k &= 3)
 			da *= 1 << k;
-		}
+	}
 	else {
 		k = -k;
 		word0(db) += (k >> 2)*Exp_msk1;
 		if (k &= 3)
 			db *= 1 << k;
-		}
+	}
 #else
 	if (k > 0)
 		word0(da) += k*Exp_msk1;
 	else {
 		k = -k;
 		word0(db) += k*Exp_msk1;
-		}
+	}
 #endif
 	return value(da) / value(db);
-	}
+}
 
 static CONST double
 tens[] = {
@@ -1207,7 +1207,7 @@
 #ifdef VAX
 		, 1e23, 1e24
 #endif
-		};
+};
 
 #ifdef IEEE_Arith
 static CONST double bigtens[] = { 1e16, 1e32, 1e64, 1e128, 1e256 };
@@ -1312,7 +1312,7 @@
 		while(*++s == '0') ;
 		if (!*s)
 			goto ret;
-		}
+	}
 	s0 = s;
 	y = z = 0;
 	for(nd = nf = 0; (c = *s) >= '0' && c <= '9'; nd++, s++)
@@ -1333,7 +1333,7 @@
 				goto have_dig;
 				}
 			goto dig_done;
-			}
+		}
 		for(; c >= '0' && c <= '9'; c = *++s) {
  have_dig:
 			nz++;
@@ -1349,16 +1349,16 @@
 				else if (nd <= DBL_DIG + 1)
 					z = 10*z + c;
 				nz = 0;
-				}
 			}
 		}
+	}
  dig_done:
 	e = 0;
 	if (c == 'e' || c == 'E') {
 		if (!nd && !nz && !nz0) {
 			s = s00;
 			goto ret;
-			}
+		}
 		s00 = s;
 		esign = 0;
 		switch(c = *++s) {
@@ -1367,7 +1367,7 @@
 				/* FALLTHROUGH */
 			case '+':
 				c = *++s;
-			}
+		}
 		if (c >= '0' && c <= '9') {
 			while(c == '0')
 				c = *++s;
@@ -1385,18 +1385,18 @@
 					e = (int)L;
 				if (esign)
 					e = -e;
-				}
+			}
 			else
 				e = 0;
-			}
+		}
 		else
 			s = s00;
-		}
+	}
 	if (!nd) {
 		if (!nz && !nz0)
 			s = s00;
 		goto ret;
-		}
+	}
 	e1 = e -= nf;
 
 	/* Now we have nd0 digits, starting at s0, followed by a
@@ -1415,7 +1415,7 @@
 #ifndef RND_PRODQUOT
 		&& FLT_ROUNDS == 1
 #endif
-			) {
+		) {
 		if (!e)
 			goto ret;
 		if (e > 0) {
@@ -1427,7 +1427,7 @@
 				    tens[e]);
 				goto ret;
 #endif
-				}
+			}
 			i = DBL_DIG - nd;
 			if (e <= Ten_pmax + i) {
 				/* A fancier test would sometimes let us do
@@ -1452,16 +1452,16 @@
 				    tens[e]);
 #endif
 				goto ret;
-				}
 			}
+		}
 #ifndef Inaccurate_Divide
 		else if (e >= -Ten_pmax) {
 			/* value(rv) = */ rounded_quotient(value(rv),
 			    tens[-e]);
 			goto ret;
-			}
-#endif
 		}
+#endif
+	}
 	e1 += nd - k;
 
 	/* Get starting approximation = rv * 10**e1 */
@@ -1477,7 +1477,7 @@
 				if (bd0)
 					goto retfree;
 				goto ret;
-				}
+			}
 			if ((e1 = (unsigned int)e1 >> 4) != 0) {
 				for(j = 0; e1 > 1; j++,
 				    e1 = (unsigned int)e1 >> 1)
@@ -1497,10 +1497,9 @@
 					}
 				else
 					word0(rv) += P*Exp_msk1;
-				}
-
 			}
 		}
+	}
 	else if (e1 < 0) {
 		e1 = -e1;
 		if ((i = e1 & 15) != 0)
@@ -1526,15 +1525,15 @@
 					if (bd0)
 						goto retfree;
 					goto ret;
-					}
+				}
 				word0(rv) = Tiny0;
 				word1(rv) = Tiny1;
 				/* The refinement below will clean
 				 * this approximation up.
 				 */
-				}
 			}
 		}
+	}
 
 	/* Now the hard part -- adjusting rv to the correct value.*/
 
@@ -1551,11 +1550,11 @@
 		if (e >= 0) {
 			bb2 = bb5 = 0;
 			bd2 = bd5 = e;
-			}
+		}
 		else {
 			bb2 = bb5 = -e;
 			bd2 = bd5 = 0;
-			}
+		}
 		if (bbe >= 0)
 			bb2 += bbe;
 		else
@@ -1583,13 +1582,13 @@
 			bb2 -= i;
 			bd2 -= i;
 			bs2 -= i;
-			}
+		}
 		if (bb5 > 0) {
 			bs = pow5mult(bs, bb5);
 			bb1 = mult(bs, bb);
 			Bfree(bb);
 			bb = bb1;
-			}
+		}
 		if (bb2 > 0)
 			bb = lshift(bb, bb2);
 		if (bd5 > 0)
@@ -1612,7 +1611,7 @@
 			if (cmp(delta, bs) > 0)
 				goto drop_down;
 			break;
-			}
+		}
 		if (i == 0) {
 			/* exactly half-way between */
 			if (dsign) {
@@ -1627,8 +1626,8 @@
 						;
 					word1(rv) = 0;
 					break;
-					}
 				}
+			}
 			else if (!(word0(rv) & Bndry_mask) && !word1(rv)) {
  drop_down:
 				/* boundary case -- decrement exponent */
@@ -1651,7 +1650,7 @@
 #else
 				break;
 #endif
-				}
+			}
 #ifndef ROUND_BIASED
 			if (!(word1(rv) & LSB))
 				break;
@@ -1665,10 +1664,10 @@
 				if (!value(rv))
 					goto undfl;
 #endif
-				}
+			}
 #endif
 			break;
-			}
+		}
 		if ((aadj = ratio(delta, bs)) <= 2.) {
 			if (dsign)
 				aadj = aadj1 = 1.;
@@ -1679,7 +1678,7 @@
 #endif
 				aadj = 1.;
 				aadj1 = -1.;
-				}
+			}
 			else {
 				/* special case -- power of FLT_RADIX to be */
 				/* rounded down... */
@@ -1690,7 +1689,7 @@
 					aadj *= 0.5;
 				aadj1 = -aadj;
 				}
-			}
+		}
 		else {
 			aadj *= 0.5;
 			aadj1 = dsign ? aadj : -aadj;
@@ -1702,12 +1701,12 @@
 				case 0: /* towards 0 */
 				case 3: /* towards -infinity */
 					aadj1 += 0.5;
-				}
+			}
 #else
 			if (FLT_ROUNDS == 0)
 				aadj1 += 0.5;
 #endif
-			}
+		}
 		y = word0(rv) & Exp_mask;
 
 		/* Check for overflow */
@@ -1724,10 +1723,10 @@
 				word0(rv) = Big0;
 				word1(rv) = Big1;
 				goto cont;
-				}
+			}
 			else
 				word0(rv) += P*Exp_msk1;
-			}
+		}
 		else {
 #ifdef Sudden_Underflow
 			if ((word0(rv) & Exp_mask) <= P*Exp_msk1) {
@@ -1740,21 +1739,21 @@
 #else
 				if ((word0(rv) & Exp_mask) <= P*Exp_msk1)
 #endif
-					{
+				{
 					if (word0(rv0) == Tiny0
 					 && word1(rv0) == Tiny1)
 						goto undfl;
 					word0(rv) = Tiny0;
 					word1(rv) = Tiny1;
 					goto cont;
-					}
+				}
 				else
 					word0(rv) -= P*Exp_msk1;
 				}
 			else {
 				adj = aadj1 * ulp(value(rv));
 				value(rv) += adj;
-				}
+			}
 #else
 			/* Compute adj so that the IEEE rounding rules will
 			 * correctly round rv + adj in some half-way cases.
@@ -1767,11 +1766,11 @@
 				aadj1 = (double)(int)(aadj + 0.5);
 				if (!dsign)
 					aadj1 = -aadj1;
-				}
+			}
 			adj = aadj1 * ulp(value(rv));
 			value(rv) += adj;
 #endif
-			}
+		}
 		z = word0(rv) & Exp_mask;
 		if (y == z) {
 			/* Can we stop now? */
@@ -1781,16 +1780,16 @@
 			if (dsign || word1(rv) || word0(rv) & Bndry_mask) {
 				if (aadj < .4999999 || aadj > .5000001)
 					break;
-				}
+			}
 			else if (aadj < .4999999/FLT_RADIX)
 				break;
-			}
+		}
  cont:
 		Bfree(bb);
 		Bfree(bd);
 		Bfree(bs);
 		Bfree(delta);
-		}
+	}
  retfree:
 	Bfree(bb);
 	Bfree(bd);
@@ -1802,7 +1801,7 @@
 		/* LINTED interface specification */
 		*se = (char *)s;
 	return sign ? -value(rv) : value(rv);
-	}
+}
 
  static int
 quorem
@@ -1861,15 +1860,15 @@
 			Sign_Extend(borrow, y);
 			*bx++ = y & 0xffff;
 #endif
-			}
-			while(sx <= sxe);
+		}
+		while(sx <= sxe);
 		if (!*bxe) {
 			bx = b->x;
 			while(--bxe > bx && !*bxe)
 				--n;
 			b->wds = n;
-			}
 		}
+	}
 	if (cmp(b, S) >= 0) {
 		q++;
 		borrow = 0;
@@ -1897,18 +1896,18 @@
 			Sign_Extend(borrow, y);
 			*bx++ = y & 0xffff;
 #endif
-			}
-			while(sx <= sxe);
+		}
+		while(sx <= sxe);
 		bx = b->x;
 		bxe = bx + n;
 		if (!*bxe) {
 			while(--bxe > bx && !*bxe)
 				--n;
 			b->wds = n;
-			}
 		}
-	return q;
 	}
+	return q;
+}
 
 /* freedtoa(s) must be used to free values s returned by dtoa
  * when MULTIPLE_THREADS is #defined.  It should be used in all cases,
@@ -2028,7 +2027,7 @@
 		/* set sign for everything, including 0's and NaNs */
 		*sign = 1;
 		word0(d) &= ~Sign_bit;	/* clear sign bit */
-		}
+	}
 	else
 		*sign = 0;
 
@@ -2038,7 +2037,7 @@
 #else
 	if (word0(d)  == 0x8000)
 #endif
-		{
+	{
 		/* Infinity or NaN */
 		*decpt = 9999;
 		s =
@@ -2046,30 +2045,30 @@
 			!word1(d) && !(word0(d) & 0xfffff) ? "Infinity" :
 #endif
 				"NaN";
-        result = Balloc(strlen(s)+1);
-        s0 = (char *)(void *)result;
-        strcpy(s0, s);
-        if (rve)
-            *rve =
+		result = Balloc(strlen(s)+1);
+		s0 = (char *)(void *)result;
+		strcpy(s0, s);
+		if (rve)
+			*rve =
 #ifdef IEEE_Arith
-                s0[3] ? s0 + 8 :
+				s0[3] ? s0 + 8 :
 #endif
-                        s0 + 3;
+				s0 + 3;
 		return s0;
-		}
+	}
 #endif
 #ifdef IBM
 	value(d) += 0; /* normalize */
 #endif
 	if (!value(d)) {
 		*decpt = 1;
-        result = Balloc(2);
-        s0 = (char *)(void *)result;
-        strcpy(s0, "0");
-        if (rve)
-            *rve = s0 + 1;
-        return s0;
-		}
+		result = Balloc(2);
+		s0 = (char *)(void *)result;
+		strcpy(s0, "0");
+		if (rve)
+			*rve = s0 + 1;
+		return s0;
+	}
 
 	b = d2b(value(d), &be, &bbits);
 #ifdef Sudden_Underflow
@@ -2114,7 +2113,7 @@
 #endif
 #ifndef Sudden_Underflow
 		denorm = 0;
-		}
+	}
 	else {
 		/* d is denormalized */
 
@@ -2125,7 +2124,7 @@
 		word0(d2) -= 31*Exp_msk1; /* adjust exponent */
 		i -= (Bias + (P-1) - 1) + 1;
 		denorm = 1;
-		}
+	}
 #endif
 	ds = (value(d2)-1.5)*0.289529654602168 + 0.1760912590558 +
 	    i*0.301029995663981;
@@ -2137,33 +2136,33 @@
 		if (value(d) < tens[k])
 			k--;
 		k_check = 0;
-		}
+	}
 	j = bbits - i - 1;
 	if (j >= 0) {
 		b2 = 0;
 		s2 = j;
-		}
+	}
 	else {
 		b2 = -j;
 		s2 = 0;
-		}
+	}
 	if (k >= 0) {
 		b5 = 0;
 		s5 = k;
 		s2 += k;
-		}
+	}
 	else {
 		b2 -= k;
 		b5 = -k;
 		s5 = 0;
-		}
+	}
 	if (mode < 0 || mode > 9)
 		mode = 0;
 	try_quick = 1;
 	if (mode > 5) {
 		mode -= 4;
 		try_quick = 0;
-		}
+	}
 	leftright = 1;
 	switch(mode) {
 		case 0:
@@ -2189,7 +2188,7 @@
 			ilim1 = i - 1;
 			if (i <= 0)
 				i = 1;
-		}
+	}
 	j = sizeof(ULong);
         for(result_k = 0; (int)(sizeof(Bigint) - sizeof(ULong)) + j <= i;
 		j <<= 1) result_k++;
@@ -2225,7 +2224,7 @@
 					ds *= bigtens[i];
 					}
 			value(d) /= ds;
-			}
+		}
 		else if ((jj1 = -k) != 0) {
 			value(d) *= tens[jj1 & 0xf];
 			for(j = (unsigned int)jj1 >> 4; j;
@@ -2233,8 +2232,8 @@
 				if (j & 1) {
 					ieps++;
 					value(d) *= bigtens[i];
-					}
-			}
+				}
+		}
 		if (k_check && value(d) < 1. && ilim > 0) {
 			if (ilim1 <= 0)
 				goto fast_failed;
@@ -2242,7 +2241,7 @@
 			k--;
 			value(d) *= 10.;
 			ieps++;
-			}
+		}
 		value(eps) = ieps*value(d) + 7.;
 		word0(eps) -= (P-1)*Exp_msk1;
 		if (ilim == 0) {
@@ -2253,7 +2252,7 @@
 			if (value(d) < -value(eps))
 				goto no_digits;
 			goto fast_failed;
-			}
+		}
 #ifndef No_leftright
 		if (leftright) {
 			/* Use Steele & White method of only
@@ -2273,7 +2272,7 @@
 				value(eps) *= 10.;
 				value(d) *= 10.;
 				}
-			}
+		}
 		else {
 #endif
 			/* Generate ilim digits, then fix them up. */
@@ -2291,17 +2290,17 @@
 						goto ret1;
 						}
 					break;
-					}
 				}
-#ifndef No_leftright
 			}
+#ifndef No_leftright
+		}
 #endif
  fast_failed:
 		s = s0;
 		value(d) = value(d2);
 		k = k0;
 		ilim = ilim0;
-		}
+	}
 
 	/* Do we have a "small" integer? */
 
@@ -2313,7 +2312,7 @@
 			if (ilim < 0 || value(d) <= 5*ds)
 				goto no_digits;
 			goto one_digit;
-			}
+		}
 		for(i = 1;; i++) {
 			L = value(d) / ds;
 			value(d) -= L*ds;
@@ -2322,7 +2321,7 @@
 			if (value(d) < 0) {
 				L--;
 				value(d) += ds;
-				}
+			}
 #endif
 			*s++ = '0' + (int)L;
 			if (i == ilim) {
@@ -2334,16 +2333,16 @@
 							k++;
 							*s = '0';
 							break;
-							}
+						}
 					++*s++;
-					}
-				break;
 				}
+				break;
+			}
 			if (!(value(d) *= 10.))
 				break;
 			}
 		goto ret1;
-		}
+	}
 
 	m2 = b2;
 	m5 = b5;
@@ -2359,7 +2358,7 @@
 #else
 				1 + P - bbits;
 #endif
-			}
+		}
 		else {
 			j = ilim - 1;
 			if (m5 >= j)
@@ -2368,22 +2367,22 @@
 				s5 += j -= m5;
 				b5 += j;
 				m5 = 0;
-				}
+			}
 			if ((i = ilim) < 0) {
 				m2 -= i;
 				i = 0;
-				}
 			}
+		}
 		b2 += i;
 		s2 += i;
 		mhi = i2b(1);
-		}
+	}
 	if (m2 > 0 && s2 > 0) {
 		i = m2 < s2 ? m2 : s2;
 		b2 -= i;
 		m2 -= i;
 		s2 -= i;
-		}
+	}
 	if (b5 > 0) {
 		if (leftright) {
 			if (m5 > 0) {
@@ -2391,13 +2390,13 @@
 				b1 = mult(mhi, b);
 				Bfree(b);
 				b = b1;
-				}
+			}
 			if ((j = b5 - m5) != 0)
 				b = pow5mult(b, j);
 			}
 		else
 			b = pow5mult(b, b5);
-		}
+	}
 	S = i2b(1);
 	if (s5 > 0)
 		S = pow5mult(S, s5);
@@ -2417,7 +2416,7 @@
 			}
 		else
 			spec_case = 0;
-		}
+	}
 
 	/* Arrange for convenient computation of quotients:
 	 * shift left if necessary so divisor has 4 leading 0 bits.
@@ -2438,13 +2437,13 @@
 		b2 += i;
 		m2 += i;
 		s2 += i;
-		}
+	}
 	else if (i < 4) {
 		i += 28;
 		b2 += i;
 		m2 += i;
 		s2 += i;
-		}
+	}
 	if (b2 > 0)
 		b = lshift(b, b2);
 	if (s2 > 0)
@@ -2457,19 +2456,19 @@
 				mhi = multadd(mhi, 10, 0);
 			ilim = ilim1;
 			}
-		}
+	}
 	if (ilim <= 0 && mode > 2) {
 		if (ilim < 0 || cmp(b,S = multadd(S,5,0)) <= 0) {
 			/* no digits, fcvt style */
  no_digits:
 			k = -1 - ndigits;
 			goto ret;
-			}
+		}
  one_digit:
 		*s++ = '1';
 		k++;
 		goto ret;
-		}
+	}
 	if (leftright) {
 		if (m2 > 0)
 			mhi = lshift(mhi, m2);
@@ -2483,7 +2482,7 @@
 			mhi = Balloc(mhi->k);
 			Bcopy(mhi, mlo);
 			mhi = lshift(mhi, Log2P);
-			}
+		}
 
 		for(i = 1;;i++) {
 			dig = quorem(b,S) + '0';
@@ -2502,7 +2501,7 @@
 					dig++;
 				*s++ = dig;
 				goto ret;
-				}
+			}
 #endif
 			if (j < 0 || (j == 0 && !mode
 #ifndef ROUND_BIASED
@@ -2518,7 +2517,7 @@
 					}
 				*s++ = dig;
 				goto ret;
-				}
+			}
 			if (jj1 > 0) {
 				if (dig == '9') { /* possible if i == 1 */
  round_9_up:
@@ -2527,7 +2526,7 @@
 					}
 				*s++ = dig + 1;
 				goto ret;
-				}
+			}
 			*s++ = dig;
 			if (i == ilim)
 				break;
@@ -2537,16 +2536,16 @@
 			else {
 				mlo = multadd(mlo, 10, 0);
 				mhi = multadd(mhi, 10, 0);
-				}
 			}
 		}
+	}
 	else
 		for(i = 1;; i++) {
 			*s++ = dig = quorem(b,S) + '0';
 			if (i >= ilim)
 				break;
 			b = multadd(b, 10, 0);
-			}
+		}
 
 	/* Round off last digit */
 
@@ -2561,18 +2560,18 @@
 				goto ret;
 				}
 		++*s++;
-		}
+	}
 	else {
 		while(*--s == '0');
 		s++;
-		}
+	}
  ret:
 	Bfree(S);
 	if (mhi) {
 		if (mlo && mlo != mhi)
 			Bfree(mlo);
 		Bfree(mhi);
-		}
+	}
  ret1:
 	Bfree(b);
 	if (s == s0) {				/* don't return empty string */
@@ -2584,7 +2583,7 @@
 	if (rve)
 		*rve = s;
 	return s0;
-	}
+}
 #ifdef __cplusplus
 }
 #endif
diff --git a/libc/stdlib/strtoimax.c b/libc/stdlib/strtoimax.c
index a742eb9..0b4323d 100644
--- a/libc/stdlib/strtoimax.c
+++ b/libc/stdlib/strtoimax.c
@@ -103,7 +103,7 @@
 		    cutoff = INTMAX_MAX / x; \
 		 }; \
 		 break
-		 
+
 	switch (base) {
             case 4:
                 if (neg) {
@@ -118,13 +118,13 @@
 	    CASE_BASE(8);
 	    CASE_BASE(10);
 	    CASE_BASE(16);
-	    default:  
+	    default:
 	              cutoff  = neg ? INTMAX_MIN : INTMAX_MAX;
 		      cutlim  = cutoff % base;
 	              cutoff /= base;
 	}
 #undef CASE_BASE
-	
+
 	if (neg) {
 		if (cutlim > 0) {
 			cutlim -= base;
diff --git a/libc/stdlib/strtoumax.c b/libc/stdlib/strtoumax.c
index ec45377..e1ff623 100644
--- a/libc/stdlib/strtoumax.c
+++ b/libc/stdlib/strtoumax.c
@@ -57,7 +57,7 @@
 	if (c == '-') {
 		neg = 1;
 		c = *s++;
-	} else { 
+	} else {
 		neg = 0;
 		if (c == '+')
 			c = *s++;
@@ -76,7 +76,7 @@
             case x: cutoff = UINTMAX_MAX / x;    \
 	            cutlim = UINTMAX_MAX % x;    \
 		    break
-		    	
+
         switch (base) {
         CASE_BASE(8);
 	CASE_BASE(10);
@@ -85,7 +85,7 @@
 	    cutoff = UINTMAX_MAX / base;
 	    cutlim = UINTMAX_MAX % base;
 	}
-	
+
 	for (acc = 0, any = 0;; c = (unsigned char) *s++) {
 		if (isdigit(c))
 			c -= '0';
diff --git a/libc/stdlib/wchar.c b/libc/stdlib/wchar.c
index d805333..0bbdaa9 100644
--- a/libc/stdlib/wchar.c
+++ b/libc/stdlib/wchar.c
@@ -387,7 +387,7 @@
 
 int wctob(wint_t c)
 {
-  return c;
+    return c;
 }
 
 wctype_t wctype(const char *property)
diff --git a/libc/string/memcpy.c b/libc/string/memcpy.c
index 4cd4a80..dea78b2 100644
--- a/libc/string/memcpy.c
+++ b/libc/string/memcpy.c
@@ -25,5 +25,5 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
-#define MEM_COPY
+#define MEMCOPY
 #include "bcopy.c"
diff --git a/libc/string/memset.c b/libc/string/memset.c
index ed9cdd7..41dafb2 100644
--- a/libc/string/memset.c
+++ b/libc/string/memset.c
@@ -34,10 +34,10 @@
     char*  end = q + n;
 
     for (;;) {
-        if (q < end) break; *q++ = (char) c;
-        if (q < end) break; *q++ = (char) c;
-        if (q < end) break; *q++ = (char) c;
-        if (q < end) break; *q++ = (char) c;
+        if (q >= end) break; *q++ = (char) c;
+        if (q >= end) break; *q++ = (char) c;
+        if (q >= end) break; *q++ = (char) c;
+        if (q >= end) break; *q++ = (char) c;
     }
 
   return dst;
diff --git a/libc/string/strcasecmp.c b/libc/string/strcasecmp.c
index 12f3a09..2be0913 100644
--- a/libc/string/strcasecmp.c
+++ b/libc/string/strcasecmp.c
@@ -98,8 +98,8 @@
 			if (cm[*us1] != cm[*us2++])
 				return (cm[*us1] - cm[*--us2]);
 			if (*us1++ == '\0')
-      break;
+				break;
 		} while (--n != 0);
-  }
+	}
 	return (0);
 }
diff --git a/libc/string/strchr.c b/libc/string/strchr.c
index e33694c..31ba4e2 100644
--- a/libc/string/strchr.c
+++ b/libc/string/strchr.c
@@ -38,6 +38,6 @@
 			return((char *)p);
 		if (!*p)
 			return((char *)NULL);
-  }
+	}
 	/* NOTREACHED */
 }
diff --git a/libc/string/strcoll.c b/libc/string/strcoll.c
index 365cad5..e3b1ec3 100755
--- a/libc/string/strcoll.c
+++ b/libc/string/strcoll.c
@@ -36,5 +36,5 @@
 int
 strcoll(const char *s1, const char *s2)
 {
-	return strcmp (s1, s2);
+	return strcmp(s1, s2);
 }
diff --git a/libc/string/strlcat.c b/libc/string/strlcat.c
index ad2215b..ceab094 100644
--- a/libc/string/strlcat.c
+++ b/libc/string/strlcat.c
@@ -46,9 +46,9 @@
 		if (n != 1) {
 			*d++ = *s;
 			n--;
-  }
+		}
 		s++;
-  }
+	}
 	*d = '\0';
 
 	return(dlen + (s - src));	/* count does not include NUL */
diff --git a/libc/string/strlcpy.c b/libc/string/strlcpy.c
index 38277eb..d32b659 100644
--- a/libc/string/strlcpy.c
+++ b/libc/string/strlcpy.c
@@ -37,7 +37,7 @@
 			if ((*d++ = *s++) == '\0')
 				break;
 		}
-  }
+	}
 
 	/* Not enough room in dst, add NUL and traverse rest of src */
 	if (n == 0) {
diff --git a/libc/string/strncat.c b/libc/string/strncat.c
index 1cb9405..c4df4f2 100644
--- a/libc/string/strncat.c
+++ b/libc/string/strncat.c
@@ -52,6 +52,6 @@
 			d++;
 		} while (--n != 0);
 		*d = 0;
-  }
+	}
 	return (dst);
 }
diff --git a/libc/string/strncmp.c b/libc/string/strncmp.c
index 9da41ab..1768808 100644
--- a/libc/string/strncmp.c
+++ b/libc/string/strncmp.c
@@ -38,14 +38,13 @@
 int
 strncmp(const char *s1, const char *s2, size_t n)
 {
-
 	if (n == 0)
 		return (0);
 	do {
 		if (*s1 != *s2++)
 			return (*(unsigned char *)s1 - *(unsigned char *)--s2);
 		if (*s1++ == 0)
-      break;
+			break;
 	} while (--n != 0);
 	return (0);
 }
diff --git a/libc/string/strncpy.c b/libc/string/strncpy.c
index b91091b..4426cbe 100644
--- a/libc/string/strncpy.c
+++ b/libc/string/strncpy.c
@@ -54,8 +54,8 @@
 				/* NUL pad the remaining n-1 bytes */
 				while (--n != 0)
 					*d++ = 0;
-      break;
-  }
+				break;
+			}
 		} while (--n != 0);
 	}
 	return (dst);
diff --git a/libc/string/strpbrk.c b/libc/string/strpbrk.c
index 6ba3796..cd3b71c 100644
--- a/libc/string/strpbrk.c
+++ b/libc/string/strpbrk.c
@@ -38,7 +38,7 @@
 {
 	const char *scanp;
 	int c, sc;
-  
+
 	while ((c = *s1++) != 0) {
 		for (scanp = s2; (sc = *scanp++) != 0;)
 			if (sc == c)
diff --git a/libc/string/strrchr.c b/libc/string/strrchr.c
index 2800781..4918f82 100644
--- a/libc/string/strrchr.c
+++ b/libc/string/strrchr.c
@@ -34,12 +34,12 @@
 strrchr(const char *p, int ch)
 {
 	char *save;
-  
+
 	for (save = NULL;; ++p) {
 		if (*p == ch)
 			save = (char *)p;
 		if (!*p)
 			return(save);
-  }
+	}
 	/* NOTREACHED */
 }
diff --git a/libc/string/strsep.c b/libc/string/strsep.c
index bcca681..c44bc5b 100644
--- a/libc/string/strsep.c
+++ b/libc/string/strsep.c
@@ -34,7 +34,7 @@
 
 /*
  * Get next token from string *stringp, where tokens are possibly-empty
- * strings separated by characters from delim.  
+ * strings separated by characters from delim.
  *
  * Writes NULs into the string at *stringp to end tokens.
  * delim need not remain constant from call to call.
diff --git a/libc/string/strstr.c b/libc/string/strstr.c
index debe96c..95a865b 100644
--- a/libc/string/strstr.c
+++ b/libc/string/strstr.c
@@ -51,6 +51,6 @@
 			} while (sc != c);
 		} while (strncmp(s, find, len) != 0);
 		s--;
-    }
+	}
 	return ((char *)s);
 }
diff --git a/libc/string/strxfrm.c b/libc/string/strxfrm.c
index f1843b5..3c4d707 100755
--- a/libc/string/strxfrm.c
+++ b/libc/string/strxfrm.c
@@ -29,7 +29,7 @@
 
 /*
  * Transform string s2 to string s1 using the current locale so that
- * strcmp of transformed strings yields the same result as strcoll. 
+ * strcmp of transformed strings yields the same result as strcoll.
  * Since Bionic really does not support locales, we assume we always use
  * the C locale.
  *
diff --git a/libc/unistd/pread.c b/libc/unistd/pread.c
index d2f71f7..b55623e 100644
--- a/libc/unistd/pread.c
+++ b/libc/unistd/pread.c
@@ -25,10 +25,10 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-extern int __pread64(int fd, void *buf, size_t nbytes, off_t lo, off_t hi);
+extern int __pread64(int fd, void *buf, size_t nbytes, loff_t offset);
 
 ssize_t pread(int fd, void *buf, size_t nbytes, off_t offset)
 {
-    return __pread64(fd, buf, nbytes, offset, 0);
+    return __pread64(fd, buf, nbytes, offset);
 }
 
diff --git a/libc/unistd/pwrite.c b/libc/unistd/pwrite.c
index 5adf40a..ea080d2 100644
--- a/libc/unistd/pwrite.c
+++ b/libc/unistd/pwrite.c
@@ -28,10 +28,10 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-extern int __pwrite64(int fd, void *buf, size_t nbytes, off_t lo, off_t hi);
+extern int __pwrite64(int fd, void *buf, size_t nbytes, loff_t offset);
 
 ssize_t pwrite(int fd, void *buf, size_t nbytes, off_t offset)
 {
-    return __pwrite64(fd, buf, nbytes, offset, 0);
+    return __pwrite64(fd, buf, nbytes, offset);
 }
 
diff --git a/libc/zoneinfo/zoneinfo.dat b/libc/zoneinfo/zoneinfo.dat
index e5bf25a..c9f0b6f 100644
--- a/libc/zoneinfo/zoneinfo.dat
+++ b/libc/zoneinfo/zoneinfo.dat
Binary files differ
diff --git a/libc/zoneinfo/zoneinfo.idx b/libc/zoneinfo/zoneinfo.idx
index 78a3650..cb560db 100644
--- a/libc/zoneinfo/zoneinfo.idx
+++ b/libc/zoneinfo/zoneinfo.idx
Binary files differ
diff --git a/libc/zoneinfo/zoneinfo.version b/libc/zoneinfo/zoneinfo.version
index 289c5d1..57a3708 100644
--- a/libc/zoneinfo/zoneinfo.version
+++ b/libc/zoneinfo/zoneinfo.version
@@ -1 +1 @@
-2007h
+2009s
diff --git a/libdl/libdl.c b/libdl/libdl.c
index 7971942..b36af16 100644
--- a/libdl/libdl.c
+++ b/libdl/libdl.c
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
+#include <dlfcn.h>
 /* These are stubs for functions that are actually defined
  * in the dynamic linker (dlfcn.c), and hijacked at runtime.
  */
 void *dlopen(const char *filename, int flag) { return 0; }
-char *dlerror(void) { return 0; }
+const char *dlerror(void) { return 0; }
 void *dlsym(void *handle, const char *symbol) { return 0; }
+int dladdr(void *addr, Dl_info *info) { return 0; }
 int dlclose(void *handle) { return 0; }
 
 #ifdef __arm__
diff --git a/libstdc++/src/new.cpp b/libstdc++/src/new.cpp
index 8189159..a9c92d4 100644
--- a/libstdc++/src/new.cpp
+++ b/libstdc++/src/new.cpp
@@ -23,16 +23,12 @@
 
 void  operator delete(void* ptr)
 {
-    if (ptr) {
-        free(ptr);
-    }
+    free(ptr);
 }
 
 void  operator delete[](void* ptr)
 {
-    if (ptr) {
-        free(ptr);
-    }
+    free(ptr);
 }
 
 void* operator new(std::size_t size, const std::nothrow_t&)
@@ -47,16 +43,12 @@
 
 void  operator delete(void* ptr, const std::nothrow_t&)
 {
-    if (ptr) {
-        free(ptr);
-    }
+    free(ptr);
 }
 
 void  operator delete[](void* ptr, const std::nothrow_t&)
 {
-    if (ptr) {
-        free(ptr);
-    }
+    free(ptr);
 }
 
 
diff --git a/linker/dlfcn.c b/linker/dlfcn.c
index 039926c..30f5f4c 100644
--- a/linker/dlfcn.c
+++ b/linker/dlfcn.c
@@ -90,9 +90,15 @@
     }
 
     if(handle == RTLD_DEFAULT) {
-        sym = lookup(symbol, &found);
+        sym = lookup(symbol, &found, NULL);
     } else if(handle == RTLD_NEXT) {
-        sym = lookup(symbol, &found);
+        void *ret_addr = __builtin_return_address(0);
+        soinfo *si = find_containing_library(ret_addr);
+
+        sym = NULL;
+        if(si && si->next) {
+            sym = lookup(symbol, &found, si->next);
+        }
     } else {
         found = (soinfo*)handle;
         sym = lookup_in_library(found, symbol);
@@ -117,6 +123,37 @@
     return 0;
 }
 
+int dladdr(void *addr, Dl_info *info)
+{
+    int ret = 0; /* stays 0 unless addr falls inside a library found below */
+    /* dladdr: describe in *info the library (and symbol, if any) containing addr. */
+    pthread_mutex_lock(&dl_lock);
+    /* dl_lock is held across both lookups and released just before returning. */
+    /* Determine if this address can be found in any library currently mapped */
+    soinfo *si = find_containing_library(addr);
+
+    if(si) {
+        memset(info, 0, sizeof(Dl_info)); /* dli_sname/dli_saddr remain zero if no symbol matches */
+        /* info is dereferenced without a NULL check; *info is untouched when si is NULL. */
+        info->dli_fname = si->name;
+        info->dli_fbase = (void*)si->base;
+
+        /* Determine if any symbol in the library contains the specified address */
+        Elf32_Sym *sym = find_containing_symbol(addr, si);
+
+        if(sym != NULL) {
+            info->dli_sname = si->strtab + sym->st_name; /* st_name is used as an offset into strtab */
+            info->dli_saddr = (void*)(si->base + sym->st_value); /* st_value is added to the library base */
+        }
+
+        ret = 1; /* library found, whether or not a symbol matched */
+    }
+
+    pthread_mutex_unlock(&dl_lock);
+
+    return ret;
+}
+
 int dlclose(void *handle)
 {
     pthread_mutex_lock(&dl_lock);
@@ -126,22 +163,22 @@
 }
 
 #if defined(ANDROID_ARM_LINKER)
-//                     0000000 00011111 111112 22222222 233333333334444444444
-//                     0123456 78901234 567890 12345678 901234567890123456789
+//                     0000000 00011111 111112 22222222 2333333 333344444444445555555
+//                     0123456 78901234 567890 12345678 9012345 678901234567890123456
 #define ANDROID_LIBDL_STRTAB \
-                      "dlopen\0dlclose\0dlsym\0dlerror\0dl_unwind_find_exidx\0"
+                      "dlopen\0dlclose\0dlsym\0dlerror\0dladdr\0dl_unwind_find_exidx\0"
 
 #elif defined(ANDROID_X86_LINKER)
-//                     0000000 00011111 111112 22222222 2333333333344444
-//                     0123456 78901234 567890 12345678 9012345678901234
+//                     0000000 00011111 111112 22222222 2333333 3333444444444455
+//                     0123456 78901234 567890 12345678 9012345 6789012345678901
 #define ANDROID_LIBDL_STRTAB \
-                      "dlopen\0dlclose\0dlsym\0dlerror\0dl_iterate_phdr\0"
+                      "dlopen\0dlclose\0dlsym\0dlerror\0dladdr\0dl_iterate_phdr\0"
 
 #elif defined(ANDROID_SH_LINKER)
-//                     0000000 00011111 111112 22222222 2333333333344444
-//                     0123456 78901234 567890 12345678 9012345678901234
+//                     0000000 00011111 111112 22222222 2333333 3333444444444455
+//                     0123456 78901234 567890 12345678 9012345 6789012345678901
 #define ANDROID_LIBDL_STRTAB \
-                      "dlopen\0dlclose\0dlsym\0dlerror\0dl_iterate_phdr\0"
+                      "dlopen\0dlclose\0dlsym\0dlerror\0dladdr\0dl_iterate_phdr\0"
 
 #else /* !defined(ANDROID_ARM_LINKER) && !defined(ANDROID_X86_LINKER) */
 #error Unsupported architecture. Only ARM and x86 are presently supported.
@@ -175,20 +212,25 @@
       st_info: STB_GLOBAL << 4,
       st_shndx: 1,
     },
-#ifdef ANDROID_ARM_LINKER
     { st_name: 29,
+      st_value: (Elf32_Addr) &dladdr,
+      st_info: STB_GLOBAL << 4,
+      st_shndx: 1,
+    },
+#ifdef ANDROID_ARM_LINKER
+    { st_name: 36,
       st_value: (Elf32_Addr) &dl_unwind_find_exidx,
       st_info: STB_GLOBAL << 4,
       st_shndx: 1,
     },
 #elif defined(ANDROID_X86_LINKER)
-    { st_name: 29,
+    { st_name: 36,
       st_value: (Elf32_Addr) &dl_iterate_phdr,
       st_info: STB_GLOBAL << 4,
       st_shndx: 1,
     },
 #elif defined(ANDROID_SH_LINKER)
-    { st_name: 29,
+    { st_name: 36,
       st_value: (Elf32_Addr) &dl_iterate_phdr,
       st_info: STB_GLOBAL << 4,
       st_shndx: 1,
@@ -216,7 +258,7 @@
  * stubbing them out in libdl.
  */
 static unsigned libdl_buckets[1] = { 1 };
-static unsigned libdl_chains[6] = { 0, 2, 3, 4, 5, 0 };
+static unsigned libdl_chains[7] = { 0, 2, 3, 4, 5, 6, 0 };
 
 soinfo libdl_info = {
     name: "libdl.so",
@@ -226,7 +268,7 @@
     symtab: libdl_symtab,
 
     nbucket: 1,
-    nchain: 6,
+    nchain: 7,
     bucket: libdl_buckets,
     chain: libdl_chains,
 };
diff --git a/linker/linker.c b/linker/linker.c
index 9779290..451e96c 100644
--- a/linker/linker.c
+++ b/linker/linker.c
@@ -471,7 +471,7 @@
             DEBUG("%5d %s: looking up %s in %s\n",
                   pid, si->name, name, lsi->name);
             s = _do_lookup_in_so(lsi, name, &elf_hash);
-            if(s != NULL)
+            if ((s != NULL) && (s->st_shndx != SHN_UNDEF))
                 goto done;
         }
     }
@@ -512,13 +512,17 @@
 
 /* This is used by dl_sym().  It performs a global symbol lookup.
  */
-Elf32_Sym *lookup(const char *name, soinfo **found)
+Elf32_Sym *lookup(const char *name, soinfo **found, soinfo *start)
 {
     unsigned elf_hash = 0;
     Elf32_Sym *s = NULL;
     soinfo *si;
 
-    for(si = solist; (s == NULL) && (si != NULL); si = si->next)
+    if(start == NULL) {
+        start = solist;
+    }
+
+    for(si = start; (s == NULL) && (si != NULL); si = si->next)
     {
         if(si->flags & FLAG_ERROR)
             continue;
@@ -538,6 +542,40 @@
     return 0;
 }
 
+soinfo *find_containing_library(void *addr)
+{
+    soinfo *si;
+    /* Walk solist and return the first entry whose [base, base + size) range holds addr. */
+    for(si = solist; si != NULL; si = si->next)
+    {
+        if((unsigned)addr >= si->base && (unsigned)addr - si->base < si->size) {
+            return si;
+        }
+    }
+    /* No entry on the list covers addr. */
+    return NULL;
+}
+
+Elf32_Sym *find_containing_symbol(void *addr, soinfo *si)
+{
+    unsigned int i;
+    unsigned soaddr = (unsigned)addr - si->base; /* offset of addr from si->base */
+    /* Scan si->symtab entries 0 .. nchain-1 (nchain is used as the loop bound)
+     * and return the first defined symbol (st_shndx != SHN_UNDEF) whose range
+     * [st_value, st_value + st_size) contains soaddr; NULL if none matches. */
+    for(i=0; i<si->nchain; i++) {
+        Elf32_Sym *sym = &si->symtab[i];
+        /* An entry with st_size == 0 can never satisfy the range test below. */
+        if(sym->st_shndx != SHN_UNDEF &&
+           soaddr >= sym->st_value &&
+           soaddr < sym->st_value + sym->st_size) {
+            return sym;
+        }
+    }
+
+    return NULL;
+}
+
 #if 0
 static void dump(soinfo *si)
 {
@@ -1295,6 +1333,13 @@
                        reloc, sym_addr, sym_name);
             *((unsigned*)reloc) += sym_addr;
             break;
+        case R_ARM_REL32:
+            COUNT_RELOC(RELOC_RELATIVE);
+            MARK(rel->r_offset);
+            TRACE_TYPE(RELO, "%5d RELO REL32 %08x <- %08x - %08x %s\n", pid,
+                       reloc, sym_addr, rel->r_offset, sym_name);
+            *((unsigned*)reloc) += sym_addr - rel->r_offset;
+            break;
 #elif defined(ANDROID_X86_LINKER)
         case R_386_JUMP_SLOT:
             COUNT_RELOC(RELOC_ABSOLUTE);
diff --git a/linker/linker.h b/linker/linker.h
index 2e51338..68ac275 100644
--- a/linker/linker.h
+++ b/linker/linker.h
@@ -172,6 +172,13 @@
 #define R_ARM_JUMP_SLOT  22
 #define R_ARM_RELATIVE   23
 
+/* According to the AAPCS specification, we only
+ * need the above relocations. However, in practice,
+ * the following ones turn up from time to time.
+ */
+#define R_ARM_ABS32      2
+#define R_ARM_REL32      3
+
 #elif defined(ANDROID_X86_LINKER)
 
 #define R_386_32         1
@@ -214,16 +221,12 @@
 #define DT_PREINIT_ARRAYSZ 33
 #endif
 
-/* in theory we only need the above relative relocations,
-   but in practice the following one turns up from time
-   to time.  fushigi na.
-*/
-#define R_ARM_ABS32      2
-
 soinfo *find_library(const char *name);
 unsigned unload_library(soinfo *si);
 Elf32_Sym *lookup_in_library(soinfo *si, const char *name);
-Elf32_Sym *lookup(const char *name, soinfo **found);
+Elf32_Sym *lookup(const char *name, soinfo **found, soinfo *start);
+soinfo *find_containing_library(void *addr);
+Elf32_Sym *find_containing_symbol(void *addr, soinfo *si);
 const char *linker_get_error(void);
 
 #ifdef ANDROID_ARM_LINKER