Merge "Revert "Update to v6.11 kernel headers."" into main
diff --git a/libc/Android.bp b/libc/Android.bp
index a4f2c69..eeea728 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1181,12 +1181,9 @@
                 "arch-x86/bionic/vfork.S",
                 "arch-x86/bionic/__x86.get_pc_thunk.S",
 
-                "arch-x86/generic/string/memcmp.S",
-
                 "arch-x86/string/sse2-memchr-atom.S",
                 "arch-x86/string/sse2-memmove-slm.S",
                 "arch-x86/string/sse2-memrchr-atom.S",
-                "arch-x86/string/sse2-memset-atom.S",
                 "arch-x86/string/sse2-memset-slm.S",
                 "arch-x86/string/sse2-stpcpy-slm.S",
                 "arch-x86/string/sse2-stpncpy-slm.S",
@@ -1200,18 +1197,14 @@
                 "arch-x86/string/sse2-wcsrchr-atom.S",
                 "arch-x86/string/sse2-wcslen-atom.S",
                 "arch-x86/string/sse2-wcscmp-atom.S",
-                "arch-x86/string/sse2-strlen-atom.S",
 
                 "arch-x86/string/ssse3-memcmp-atom.S",
-                "arch-x86/string/ssse3-memmove-atom.S",
                 "arch-x86/string/ssse3-strcat-atom.S",
                 "arch-x86/string/ssse3-strcmp-atom.S",
-                "arch-x86/string/ssse3-strcpy-atom.S",
                 "arch-x86/string/ssse3-strlcat-atom.S",
                 "arch-x86/string/ssse3-strlcpy-atom.S",
                 "arch-x86/string/ssse3-strncat-atom.S",
                 "arch-x86/string/ssse3-strncmp-atom.S",
-                "arch-x86/string/ssse3-strncpy-atom.S",
                 "arch-x86/string/ssse3-wcscat-atom.S",
                 "arch-x86/string/ssse3-wcscpy-atom.S",
                 "arch-x86/string/ssse3-wmemcmp-atom.S",
diff --git a/libc/arch-x86/dynamic_function_dispatch.cpp b/libc/arch-x86/dynamic_function_dispatch.cpp
index e6cc5fb..98d7ec2 100644
--- a/libc/arch-x86/dynamic_function_dispatch.cpp
+++ b/libc/arch-x86/dynamic_function_dispatch.cpp
@@ -33,57 +33,11 @@
 
 DEFINE_IFUNC_FOR(memcmp) {
   __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(memcmp_func_t, memcmp_atom);
   if (__builtin_cpu_supports("sse4.1")) RETURN_FUNC(memcmp_func_t, memcmp_sse4);
-  RETURN_FUNC(memcmp_func_t, memcmp_generic);
+  RETURN_FUNC(memcmp_func_t, memcmp_atom);
 }
 MEMCMP_SHIM()
 
-DEFINE_IFUNC_FOR(memset) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(memset_func_t, memset_atom);
-  RETURN_FUNC(memset_func_t, memset_generic);
-}
-MEMSET_SHIM()
-
-DEFINE_IFUNC_FOR(__memset_chk) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(__memset_chk_func_t, __memset_chk_atom);
-  RETURN_FUNC(__memset_chk_func_t, __memset_chk_generic);
-}
-__MEMSET_CHK_SHIM()
-
-DEFINE_IFUNC_FOR(memmove) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(memmove_func_t, memmove_atom);
-  RETURN_FUNC(memmove_func_t, memmove_generic);
-}
-MEMMOVE_SHIM()
-
-DEFINE_IFUNC_FOR(memcpy) { return memmove_resolver(); }
-MEMCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strcpy) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(strcpy_func_t, strcpy_atom);
-  RETURN_FUNC(strcpy_func_t, strcpy_generic);
-}
-STRCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strncpy) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(strncpy_func_t, strncpy_atom);
-  RETURN_FUNC(strncpy_func_t, strncpy_generic);
-}
-STRNCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strlen) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(strlen_func_t, strlen_atom);
-  RETURN_FUNC(strlen_func_t, strlen_generic);
-}
-STRLEN_SHIM()
-
 typedef int wmemcmp_func_t(const wchar_t*, const wchar_t*, size_t);
 DEFINE_IFUNC_FOR(wmemcmp) {
   __builtin_cpu_init();
diff --git a/libc/arch-x86/generic/string/memcmp.S b/libc/arch-x86/generic/string/memcmp.S
deleted file mode 100644
index 1d327c7..0000000
--- a/libc/arch-x86/generic/string/memcmp.S
+++ /dev/null
@@ -1,44 +0,0 @@
-/*	$OpenBSD: memcmp.S,v 1.4 2005/08/07 11:30:38 espie Exp $ */
-/*
- * Written by J.T. Conklin <jtc@netbsd.org>.
- * Public domain.
- */
-
-#include <private/bionic_asm.h>
-
-ENTRY(memcmp_generic)
-	pushl	%edi
-	pushl	%esi
-	movl	12(%esp),%edi
-	movl	16(%esp),%esi
-	cld				/* set compare direction forward */
-
-	movl	20(%esp),%ecx		/* compare by words */
-	shrl	$2,%ecx
-	repe
-	cmpsl
-	jne	L5			/* do we match so far? */
-
-	movl	20(%esp),%ecx		/* compare remainder by bytes */
-	andl	$3,%ecx
-	repe
-	cmpsb
-	jne	L6			/* do we match? */
-
-	xorl	%eax,%eax		/* we match, return zero	*/
-	popl	%esi
-	popl	%edi
-	ret
-
-L5:	movl	$4,%ecx			/* We know that one of the next	*/
-	subl	%ecx,%edi		/* four pairs of bytes do not	*/
-	subl	%ecx,%esi		/* match.			*/
-	repe
-	cmpsb
-L6:	movzbl  -1(%edi),%eax		/* Perform unsigned comparison	*/
-	movzbl	-1(%esi),%edx
-	subl	%edx,%eax
-	popl	%esi
-	popl	%edi
-	ret
-END(memcmp_generic)
diff --git a/libc/arch-x86/string/sse2-memmove-slm.S b/libc/arch-x86/string/sse2-memmove-slm.S
index 7f42374..2ed4e7b 100644
--- a/libc/arch-x86/string/sse2-memmove-slm.S
+++ b/libc/arch-x86/string/sse2-memmove-slm.S
@@ -31,7 +31,7 @@
 #define FOR_SILVERMONT
 
 #ifndef MEMMOVE
-# define MEMMOVE	memmove_generic
+# define MEMMOVE	memmove
 #endif
 
 #ifndef L
@@ -551,3 +551,9 @@
 	jmp	L(mm_recalc_len)
 
 END (MEMMOVE)
+
+// N.B., `private/bionic_asm.h` provides ALIAS_SYMBOL, but that file provides
+// conflicting definitions for some macros in this file. Since ALIAS_SYMBOL is
+// small, inline it here.
+.globl memcpy;
+.equ memcpy, MEMMOVE
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/string/sse2-memset-atom.S
deleted file mode 100644
index e43ead0..0000000
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#include <private/bionic_asm.h>
-
-#define FOR_ATOM
-
-#ifndef L
-# define L(label)	.L##label
-#endif
-
-#ifndef ALIGN
-# define ALIGN(n)	.p2align n
-#endif
-
-#define CFI_PUSH(REG)						\
-  .cfi_adjust_cfa_offset 4;					\
-  .cfi_rel_offset REG, 0
-
-#define CFI_POP(REG)						\
-  .cfi_adjust_cfa_offset -4;					\
-  .cfi_restore REG
-
-#define PUSH(REG)	pushl REG; CFI_PUSH(REG)
-#define POP(REG)	popl REG; CFI_POP(REG)
-
-#define PARMS 8  /* Preserve EBX. */
-#define DST PARMS
-#define CHR (DST+4)
-#define LEN (CHR+4)
-#define CHK_DST_LEN (LEN+4)
-#define SETRTNVAL	movl DST(%esp), %eax
-
-#define ENTRANCE	PUSH(%ebx);
-#define RETURN_END	POP(%ebx); ret
-#define RETURN		RETURN_END; CFI_PUSH(%ebx)
-#define JMPTBL(I, B)	I - B
-
-#define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
-
-/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.   */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
-    /* We first load PC into EBX.  */				\
-    call	__x86.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    add		$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    add		(%ebx,%ecx,4), %ebx;				\
-    add		%ecx, %edx;					\
-    /* We loaded the jump table and adjusted EDX. Go.  */	\
-    jmp		*%ebx
-
-ENTRY(__memset_chk_atom)
-  ENTRANCE
-
-  movl LEN(%esp), %ecx
-  cmpl CHK_DST_LEN(%esp), %ecx
-  jna L(memset_length_loaded)
-
-  POP(%ebx) // Undo ENTRANCE without returning.
-  jmp __memset_chk_fail
-END(__memset_chk_atom)
-
-	.section .text.sse2,"ax",@progbits
-	ALIGN(4)
-ENTRY(memset_atom)
-	ENTRANCE
-
-	movl	LEN(%esp), %ecx
-L(memset_length_loaded):
-	movzbl	CHR(%esp), %eax
-	movb	%al, %ah
-	/* Fill the whole EAX with pattern.  */
-	movl	%eax, %edx
-	shl	$16, %eax
-	or	%edx, %eax
-	movl	DST(%esp), %edx
-	cmp	$32, %ecx
-	jae	L(32bytesormore)
-
-L(write_less32bytes):
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_32bytes))
-
-
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN(2)
-L(table_less_32bytes):
-	.int	JMPTBL(L(write_0bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_1bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_2bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_3bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_4bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_5bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_6bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_7bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_8bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_9bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_10bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_11bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_12bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_13bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_14bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_15bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_16bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_17bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_18bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_19bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_20bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_21bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_22bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_23bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_24bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_25bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_26bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_27bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_28bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_29bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_30bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_31bytes), L(table_less_32bytes))
-	.popsection
-
-	ALIGN(4)
-L(write_28bytes):
-	movl	%eax, -28(%edx)
-L(write_24bytes):
-	movl	%eax, -24(%edx)
-L(write_20bytes):
-	movl	%eax, -20(%edx)
-L(write_16bytes):
-	movl	%eax, -16(%edx)
-L(write_12bytes):
-	movl	%eax, -12(%edx)
-L(write_8bytes):
-	movl	%eax, -8(%edx)
-L(write_4bytes):
-	movl	%eax, -4(%edx)
-L(write_0bytes):
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(write_29bytes):
-	movl	%eax, -29(%edx)
-L(write_25bytes):
-	movl	%eax, -25(%edx)
-L(write_21bytes):
-	movl	%eax, -21(%edx)
-L(write_17bytes):
-	movl	%eax, -17(%edx)
-L(write_13bytes):
-	movl	%eax, -13(%edx)
-L(write_9bytes):
-	movl	%eax, -9(%edx)
-L(write_5bytes):
-	movl	%eax, -5(%edx)
-L(write_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(write_30bytes):
-	movl	%eax, -30(%edx)
-L(write_26bytes):
-	movl	%eax, -26(%edx)
-L(write_22bytes):
-	movl	%eax, -22(%edx)
-L(write_18bytes):
-	movl	%eax, -18(%edx)
-L(write_14bytes):
-	movl	%eax, -14(%edx)
-L(write_10bytes):
-	movl	%eax, -10(%edx)
-L(write_6bytes):
-	movl	%eax, -6(%edx)
-L(write_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(write_31bytes):
-	movl	%eax, -31(%edx)
-L(write_27bytes):
-	movl	%eax, -27(%edx)
-L(write_23bytes):
-	movl	%eax, -23(%edx)
-L(write_19bytes):
-	movl	%eax, -19(%edx)
-L(write_15bytes):
-	movl	%eax, -15(%edx)
-L(write_11bytes):
-	movl	%eax, -11(%edx)
-L(write_7bytes):
-	movl	%eax, -7(%edx)
-L(write_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
-L(32bytesormore):
-	/* Fill xmm0 with the pattern.  */
-	movd	%eax, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-	testl	$0xf, %edx
-	jz	L(aligned_16)
-/* ECX > 32 and EDX is not 16 byte aligned.  */
-L(not_aligned_16):
-	movdqu	%xmm0, (%edx)
-	movl	%edx, %eax
-	and	$-16, %edx
-	add	$16, %edx
-	sub	%edx, %eax
-	add	%eax, %ecx
-	movd	%xmm0, %eax
-
-	ALIGN(4)
-L(aligned_16):
-	cmp	$128, %ecx
-	jae	L(128bytesormore)
-
-L(aligned_16_less128bytes):
-	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-	ALIGN(4)
-L(128bytesormore):
-	PUSH(%ebx)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
-	cmp	%ebx, %ecx
-	jae	L(128bytesormore_nt_start)
-
-
-	POP(%ebx)
-# define RESTORE_EBX_STATE CFI_PUSH(%ebx)
-	PUSH(%ebx)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
-	POP(%ebx)
-
-	jae	L(128bytes_L2_normal)
-	subl	$128, %ecx
-L(128bytesormore_normal):
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	lea	128(%edx), %edx
-	jb	L(128bytesless_normal)
-
-
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	lea	128(%edx), %edx
-	jae	L(128bytesormore_normal)
-
-L(128bytesless_normal):
-	add	$128, %ecx
-	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-	ALIGN(4)
-L(128bytes_L2_normal):
-	prefetcht0	0x380(%edx)
-	prefetcht0	0x3c0(%edx)
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movaps	%xmm0, 0x10(%edx)
-	movaps	%xmm0, 0x20(%edx)
-	movaps	%xmm0, 0x30(%edx)
-	movaps	%xmm0, 0x40(%edx)
-	movaps	%xmm0, 0x50(%edx)
-	movaps	%xmm0, 0x60(%edx)
-	movaps	%xmm0, 0x70(%edx)
-	add	$128, %edx
-	cmp	$128, %ecx
-	jae	L(128bytes_L2_normal)
-
-L(128bytesless_L2_normal):
-	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-	RESTORE_EBX_STATE
-L(128bytesormore_nt_start):
-	sub	%ebx, %ecx
-	mov	%ebx, %eax
-	and	$0x7f, %eax
-	add	%eax, %ecx
-	movd	%xmm0, %eax
-	ALIGN(4)
-L(128bytesormore_shared_cache_loop):
-	prefetcht0	0x3c0(%edx)
-	prefetcht0	0x380(%edx)
-	sub	$0x80, %ebx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	add	$0x80, %edx
-	cmp	$0x80, %ebx
-	jae	L(128bytesormore_shared_cache_loop)
-	cmp	$0x80, %ecx
-	jb	L(shared_cache_loop_end)
-	ALIGN(4)
-L(128bytesormore_nt):
-	sub	$0x80, %ecx
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm0, 0x10(%edx)
-	movntdq	%xmm0, 0x20(%edx)
-	movntdq	%xmm0, 0x30(%edx)
-	movntdq	%xmm0, 0x40(%edx)
-	movntdq	%xmm0, 0x50(%edx)
-	movntdq	%xmm0, 0x60(%edx)
-	movntdq	%xmm0, 0x70(%edx)
-	add	$0x80, %edx
-	cmp	$0x80, %ecx
-	jae	L(128bytesormore_nt)
-	sfence
-L(shared_cache_loop_end):
-	POP(%ebx)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN(2)
-L(table_16_128bytes):
-	.int	JMPTBL(L(aligned_16_0bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_1bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_2bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_3bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_4bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_5bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_6bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_7bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_8bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_9bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_10bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_11bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_12bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_13bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_14bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_15bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_16bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_17bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_18bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_19bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_20bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_21bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_22bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_23bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_24bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_25bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_26bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_27bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_28bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_29bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_30bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_31bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_32bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_33bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_34bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_35bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_36bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_37bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_38bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_39bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_40bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_41bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_42bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_43bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_44bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_45bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_46bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_47bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_48bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_49bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_50bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_51bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_52bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_53bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_54bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_55bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_56bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_57bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_58bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_59bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_60bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_61bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_62bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_63bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_64bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_65bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_66bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_67bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_68bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_69bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_70bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_71bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_72bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_73bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_74bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_75bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_76bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_77bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_78bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_79bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_80bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_81bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_82bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_83bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_84bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_85bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_86bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_87bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_88bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_89bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_90bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_91bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_92bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_93bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_94bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_95bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_96bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_97bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_98bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_99bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_100bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_101bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_102bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_103bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_104bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_105bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_106bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_107bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_108bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_109bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_110bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_111bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_112bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_113bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_114bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_115bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_116bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_117bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_118bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_119bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_120bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_121bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_122bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_123bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_124bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_125bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_126bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_127bytes), L(table_16_128bytes))
-	.popsection
-
-	ALIGN(4)
-L(aligned_16_112bytes):
-	movdqa	%xmm0, -112(%edx)
-L(aligned_16_96bytes):
-	movdqa	%xmm0, -96(%edx)
-L(aligned_16_80bytes):
-	movdqa	%xmm0, -80(%edx)
-L(aligned_16_64bytes):
-	movdqa	%xmm0, -64(%edx)
-L(aligned_16_48bytes):
-	movdqa	%xmm0, -48(%edx)
-L(aligned_16_32bytes):
-	movdqa	%xmm0, -32(%edx)
-L(aligned_16_16bytes):
-	movdqa	%xmm0, -16(%edx)
-L(aligned_16_0bytes):
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_113bytes):
-	movdqa	%xmm0, -113(%edx)
-L(aligned_16_97bytes):
-	movdqa	%xmm0, -97(%edx)
-L(aligned_16_81bytes):
-	movdqa	%xmm0, -81(%edx)
-L(aligned_16_65bytes):
-	movdqa	%xmm0, -65(%edx)
-L(aligned_16_49bytes):
-	movdqa	%xmm0, -49(%edx)
-L(aligned_16_33bytes):
-	movdqa	%xmm0, -33(%edx)
-L(aligned_16_17bytes):
-	movdqa	%xmm0, -17(%edx)
-L(aligned_16_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_114bytes):
-	movdqa	%xmm0, -114(%edx)
-L(aligned_16_98bytes):
-	movdqa	%xmm0, -98(%edx)
-L(aligned_16_82bytes):
-	movdqa	%xmm0, -82(%edx)
-L(aligned_16_66bytes):
-	movdqa	%xmm0, -66(%edx)
-L(aligned_16_50bytes):
-	movdqa	%xmm0, -50(%edx)
-L(aligned_16_34bytes):
-	movdqa	%xmm0, -34(%edx)
-L(aligned_16_18bytes):
-	movdqa	%xmm0, -18(%edx)
-L(aligned_16_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_115bytes):
-	movdqa	%xmm0, -115(%edx)
-L(aligned_16_99bytes):
-	movdqa	%xmm0, -99(%edx)
-L(aligned_16_83bytes):
-	movdqa	%xmm0, -83(%edx)
-L(aligned_16_67bytes):
-	movdqa	%xmm0, -67(%edx)
-L(aligned_16_51bytes):
-	movdqa	%xmm0, -51(%edx)
-L(aligned_16_35bytes):
-	movdqa	%xmm0, -35(%edx)
-L(aligned_16_19bytes):
-	movdqa	%xmm0, -19(%edx)
-L(aligned_16_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_116bytes):
-	movdqa	%xmm0, -116(%edx)
-L(aligned_16_100bytes):
-	movdqa	%xmm0, -100(%edx)
-L(aligned_16_84bytes):
-	movdqa	%xmm0, -84(%edx)
-L(aligned_16_68bytes):
-	movdqa	%xmm0, -68(%edx)
-L(aligned_16_52bytes):
-	movdqa	%xmm0, -52(%edx)
-L(aligned_16_36bytes):
-	movdqa	%xmm0, -36(%edx)
-L(aligned_16_20bytes):
-	movdqa	%xmm0, -20(%edx)
-L(aligned_16_4bytes):
-	movl	%eax, -4(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_117bytes):
-	movdqa	%xmm0, -117(%edx)
-L(aligned_16_101bytes):
-	movdqa	%xmm0, -101(%edx)
-L(aligned_16_85bytes):
-	movdqa	%xmm0, -85(%edx)
-L(aligned_16_69bytes):
-	movdqa	%xmm0, -69(%edx)
-L(aligned_16_53bytes):
-	movdqa	%xmm0, -53(%edx)
-L(aligned_16_37bytes):
-	movdqa	%xmm0, -37(%edx)
-L(aligned_16_21bytes):
-	movdqa	%xmm0, -21(%edx)
-L(aligned_16_5bytes):
-	movl	%eax, -5(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_118bytes):
-	movdqa	%xmm0, -118(%edx)
-L(aligned_16_102bytes):
-	movdqa	%xmm0, -102(%edx)
-L(aligned_16_86bytes):
-	movdqa	%xmm0, -86(%edx)
-L(aligned_16_70bytes):
-	movdqa	%xmm0, -70(%edx)
-L(aligned_16_54bytes):
-	movdqa	%xmm0, -54(%edx)
-L(aligned_16_38bytes):
-	movdqa	%xmm0, -38(%edx)
-L(aligned_16_22bytes):
-	movdqa	%xmm0, -22(%edx)
-L(aligned_16_6bytes):
-	movl	%eax, -6(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_119bytes):
-	movdqa	%xmm0, -119(%edx)
-L(aligned_16_103bytes):
-	movdqa	%xmm0, -103(%edx)
-L(aligned_16_87bytes):
-	movdqa	%xmm0, -87(%edx)
-L(aligned_16_71bytes):
-	movdqa	%xmm0, -71(%edx)
-L(aligned_16_55bytes):
-	movdqa	%xmm0, -55(%edx)
-L(aligned_16_39bytes):
-	movdqa	%xmm0, -39(%edx)
-L(aligned_16_23bytes):
-	movdqa	%xmm0, -23(%edx)
-L(aligned_16_7bytes):
-	movl	%eax, -7(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_120bytes):
-	movdqa	%xmm0, -120(%edx)
-L(aligned_16_104bytes):
-	movdqa	%xmm0, -104(%edx)
-L(aligned_16_88bytes):
-	movdqa	%xmm0, -88(%edx)
-L(aligned_16_72bytes):
-	movdqa	%xmm0, -72(%edx)
-L(aligned_16_56bytes):
-	movdqa	%xmm0, -56(%edx)
-L(aligned_16_40bytes):
-	movdqa	%xmm0, -40(%edx)
-L(aligned_16_24bytes):
-	movdqa	%xmm0, -24(%edx)
-L(aligned_16_8bytes):
-	movq	%xmm0, -8(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_121bytes):
-	movdqa	%xmm0, -121(%edx)
-L(aligned_16_105bytes):
-	movdqa	%xmm0, -105(%edx)
-L(aligned_16_89bytes):
-	movdqa	%xmm0, -89(%edx)
-L(aligned_16_73bytes):
-	movdqa	%xmm0, -73(%edx)
-L(aligned_16_57bytes):
-	movdqa	%xmm0, -57(%edx)
-L(aligned_16_41bytes):
-	movdqa	%xmm0, -41(%edx)
-L(aligned_16_25bytes):
-	movdqa	%xmm0, -25(%edx)
-L(aligned_16_9bytes):
-	movq	%xmm0, -9(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_122bytes):
-	movdqa	%xmm0, -122(%edx)
-L(aligned_16_106bytes):
-	movdqa	%xmm0, -106(%edx)
-L(aligned_16_90bytes):
-	movdqa	%xmm0, -90(%edx)
-L(aligned_16_74bytes):
-	movdqa	%xmm0, -74(%edx)
-L(aligned_16_58bytes):
-	movdqa	%xmm0, -58(%edx)
-L(aligned_16_42bytes):
-	movdqa	%xmm0, -42(%edx)
-L(aligned_16_26bytes):
-	movdqa	%xmm0, -26(%edx)
-L(aligned_16_10bytes):
-	movq	%xmm0, -10(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_123bytes):
-	movdqa	%xmm0, -123(%edx)
-L(aligned_16_107bytes):
-	movdqa	%xmm0, -107(%edx)
-L(aligned_16_91bytes):
-	movdqa	%xmm0, -91(%edx)
-L(aligned_16_75bytes):
-	movdqa	%xmm0, -75(%edx)
-L(aligned_16_59bytes):
-	movdqa	%xmm0, -59(%edx)
-L(aligned_16_43bytes):
-	movdqa	%xmm0, -43(%edx)
-L(aligned_16_27bytes):
-	movdqa	%xmm0, -27(%edx)
-L(aligned_16_11bytes):
-	movq	%xmm0, -11(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_124bytes):
-	movdqa	%xmm0, -124(%edx)
-L(aligned_16_108bytes):
-	movdqa	%xmm0, -108(%edx)
-L(aligned_16_92bytes):
-	movdqa	%xmm0, -92(%edx)
-L(aligned_16_76bytes):
-	movdqa	%xmm0, -76(%edx)
-L(aligned_16_60bytes):
-	movdqa	%xmm0, -60(%edx)
-L(aligned_16_44bytes):
-	movdqa	%xmm0, -44(%edx)
-L(aligned_16_28bytes):
-	movdqa	%xmm0, -28(%edx)
-L(aligned_16_12bytes):
-	movq	%xmm0, -12(%edx)
-	movl	%eax, -4(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_125bytes):
-	movdqa	%xmm0, -125(%edx)
-L(aligned_16_109bytes):
-	movdqa	%xmm0, -109(%edx)
-L(aligned_16_93bytes):
-	movdqa	%xmm0, -93(%edx)
-L(aligned_16_77bytes):
-	movdqa	%xmm0, -77(%edx)
-L(aligned_16_61bytes):
-	movdqa	%xmm0, -61(%edx)
-L(aligned_16_45bytes):
-	movdqa	%xmm0, -45(%edx)
-L(aligned_16_29bytes):
-	movdqa	%xmm0, -29(%edx)
-L(aligned_16_13bytes):
-	movq	%xmm0, -13(%edx)
-	movl	%eax, -5(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_126bytes):
-	movdqa	%xmm0, -126(%edx)
-L(aligned_16_110bytes):
-	movdqa	%xmm0, -110(%edx)
-L(aligned_16_94bytes):
-	movdqa	%xmm0, -94(%edx)
-L(aligned_16_78bytes):
-	movdqa	%xmm0, -78(%edx)
-L(aligned_16_62bytes):
-	movdqa	%xmm0, -62(%edx)
-L(aligned_16_46bytes):
-	movdqa	%xmm0, -46(%edx)
-L(aligned_16_30bytes):
-	movdqa	%xmm0, -30(%edx)
-L(aligned_16_14bytes):
-	movq	%xmm0, -14(%edx)
-	movl	%eax, -6(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_127bytes):
-	movdqa	%xmm0, -127(%edx)
-L(aligned_16_111bytes):
-	movdqa	%xmm0, -111(%edx)
-L(aligned_16_95bytes):
-	movdqa	%xmm0, -95(%edx)
-L(aligned_16_79bytes):
-	movdqa	%xmm0, -79(%edx)
-L(aligned_16_63bytes):
-	movdqa	%xmm0, -63(%edx)
-L(aligned_16_47bytes):
-	movdqa	%xmm0, -47(%edx)
-L(aligned_16_31bytes):
-	movdqa	%xmm0, -31(%edx)
-L(aligned_16_15bytes):
-	movq	%xmm0, -15(%edx)
-	movl	%eax, -7(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN_END
-
-END(memset_atom)
diff --git a/libc/arch-x86/string/sse2-memset-slm.S b/libc/arch-x86/string/sse2-memset-slm.S
index e4c8fa1..ec2ee52 100644
--- a/libc/arch-x86/string/sse2-memset-slm.S
+++ b/libc/arch-x86/string/sse2-memset-slm.S
@@ -79,7 +79,7 @@
     /* We loaded the jump table and adjusted EDX. Go.  */	\
     jmp		*%ebx
 
-ENTRY(__memset_chk_generic)
+ENTRY(__memset_chk)
   ENTRANCE
 
   movl LEN(%esp), %ecx
@@ -88,11 +88,11 @@
 
   POP(%ebx) // Undo ENTRANCE without returning.
   jmp __memset_chk_fail
-END(__memset_chk_generic)
+END(__memset_chk)
 
 	.section .text.sse2,"ax",@progbits
 	ALIGN(4)
-ENTRY(memset_generic)
+ENTRY(memset)
 	ENTRANCE
 
 	movl	LEN(%esp), %ecx
@@ -755,4 +755,4 @@
 	SETRTNVAL
 	RETURN_END
 
-END(memset_generic)
+END(memset)
diff --git a/libc/arch-x86/string/sse2-strcpy-slm.S b/libc/arch-x86/string/sse2-strcpy-slm.S
index 22ceeab..b5d84b5 100644
--- a/libc/arch-x86/string/sse2-strcpy-slm.S
+++ b/libc/arch-x86/string/sse2-strcpy-slm.S
@@ -79,7 +79,7 @@
 #define POP(REG) popl REG; CFI_POP (REG)
 
 #ifndef STRCPY
-# define STRCPY  strcpy_generic
+# define STRCPY  strcpy
 #endif
 
 #ifdef USE_AS_STPNCPY
diff --git a/libc/arch-x86/string/sse2-strlen-slm.S b/libc/arch-x86/string/sse2-strlen-slm.S
index b805ad6..27cc025 100644
--- a/libc/arch-x86/string/sse2-strlen-slm.S
+++ b/libc/arch-x86/string/sse2-strlen-slm.S
@@ -29,7 +29,7 @@
 */
 
 #ifndef STRLEN
-# define STRLEN strlen_generic
+# define STRLEN strlen
 #endif
 
 #ifndef L
diff --git a/libc/arch-x86/string/sse2-strncpy-slm.S b/libc/arch-x86/string/sse2-strncpy-slm.S
index aff7fb9..591419f 100644
--- a/libc/arch-x86/string/sse2-strncpy-slm.S
+++ b/libc/arch-x86/string/sse2-strncpy-slm.S
@@ -29,5 +29,5 @@
 */

 

 #define USE_AS_STRNCPY

-#define STRCPY strncpy_generic

+#define STRCPY strncpy

 #include "sse2-strcpy-slm.S"

diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/string/ssse3-memcpy-atom.S
deleted file mode 100644
index 83e1985..0000000
--- a/libc/arch-x86/string/ssse3-memcpy-atom.S
+++ /dev/null
@@ -1,3124 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define FOR_ATOM
-
-#ifndef MEMCPY
-# define MEMCPY	memcpy_atom
-#endif
-
-#ifndef L
-# define L(label)	.L##label
-#endif
-
-#ifndef cfi_startproc
-# define cfi_startproc	.cfi_startproc
-#endif
-
-#ifndef cfi_endproc
-# define cfi_endproc	.cfi_endproc
-#endif
-
-#ifndef cfi_rel_offset
-# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
-#endif
-
-#ifndef cfi_restore
-# define cfi_restore(reg)	.cfi_restore reg
-#endif
-
-#ifndef cfi_adjust_cfa_offset
-# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
-#endif
-
-#ifndef ENTRY
-# define ENTRY(name)		\
-	.type name,  @function;		\
-	.globl name;		\
-	.p2align 4;		\
-name:		\
-	cfi_startproc
-#endif
-
-#ifndef END
-# define END(name)		\
-	cfi_endproc;		\
-	.size name, .-name
-#endif
-
-#define DEST		PARMS
-#define SRC		DEST+4
-#define LEN		SRC+4
-
-#define CFI_PUSH(REG)		\
-  cfi_adjust_cfa_offset (4);		\
-  cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG)		\
-  cfi_adjust_cfa_offset (-4);		\
-  cfi_restore (REG)
-
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
-
-#if (defined SHARED || defined __PIC__)
-# define PARMS		8		/* Preserve EBX.  */
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN		RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B)	I - B
-
-# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
-
-/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-	jump table with relative offsets.  INDEX is a register contains the
-	index into the jump table.   SCALE is the scale of INDEX. */
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */		\
-	SETUP_PIC_REG(bx);		\
-    /* Get the address of the jump table.  */		\
-	addl	$(TABLE - .), %ebx;		\
-    /* Get the entry and convert the relative offset to the		\
-	absolute	address.  */		\
-	addl	(%ebx, INDEX, SCALE), %ebx;		\
-    /* We loaded the jump table.  Go.  */		\
-	jmp	*%ebx
-#else
-
-# define PARMS		4
-# define ENTRANCE
-# define RETURN_END	ret
-# define RETURN		RETURN_END
-# define JMPTBL(I, B)	I
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-	absolute offsets.  INDEX is a register contains the index into the
-	jump table.  SCALE is the scale of INDEX. */
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-	jmp	*TABLE(, INDEX, SCALE)
-#endif
-
-	.section .text.ssse3,"ax",@progbits
-ENTRY (MEMCPY)
-	ENTRANCE
-	movl	LEN(%esp), %ecx
-	movl	SRC(%esp), %eax
-	movl	DEST(%esp), %edx
-
-#ifdef USE_AS_MEMMOVE
-	cmp	%eax, %edx
-	jb	L(copy_forward)
-	je	L(fwd_write_0bytes)
-	cmp	$32, %ecx
-	jae	L(memmove_bwd)
-	jmp	L(bk_write_less32bytes_2)
-
-	.p2align 4
-L(memmove_bwd):
-	add	%ecx, %eax
-	cmp	%eax, %edx
-	movl	SRC(%esp), %eax
-	jb	L(copy_backward)
-
-L(copy_forward):
-#endif
-	cmp	$48, %ecx
-	jae	L(48bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
-	cmp	%dl, %al
-	jb	L(bk_write)
-#endif
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
-	.p2align 4
-L(bk_write):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
-
-	.p2align 4
-L(48bytesormore):
-#ifndef USE_AS_MEMMOVE
-	movlpd	(%eax), %xmm0
-	movlpd	8(%eax), %xmm1
-	movlpd	%xmm0, (%edx)
-	movlpd	%xmm1, 8(%edx)
-#else
-	movdqu	(%eax), %xmm0
-#endif
-	PUSH (%edi)
-	movl	%edx, %edi
-	and	$-16, %edx
-	add	$16, %edx
-	sub	%edx, %edi
-	add	%edi, %ecx
-	sub	%edi, %eax
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_shared_cache_size_half, %ecx
-# endif
-#endif
-
-	mov	%eax, %edi
-	jae	L(large_page)
-	and	$0xf, %edi
-	jz	L(shl_0)
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
-
-	.p2align 4
-L(shl_0):
-#ifdef USE_AS_MEMMOVE
-	movl	DEST+4(%esp), %edi
-	movdqu	%xmm0, (%edi)
-#endif
-	xor	%edi, %edi
-	cmp	$127, %ecx
-	ja	L(shl_0_gobble)
-	lea	-32(%ecx), %ecx
-
-	.p2align 4
-L(shl_0_loop):
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jb	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jb	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jb	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-
-L(shl_0_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	add	%edi, %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	POP	(%edi)
-	lea	-128(%ecx), %ecx
-	jae	L(shl_0_gobble_mem_loop)
-
-	.p2align 4
-L(shl_0_gobble_cache_loop):
-	movdqa	(%eax), %xmm0
-	movdqa	0x10(%eax), %xmm1
-	movdqa	0x20(%eax), %xmm2
-	movdqa	0x30(%eax), %xmm3
-	movdqa	0x40(%eax), %xmm4
-	movdqa	0x50(%eax), %xmm5
-	movdqa	0x60(%eax), %xmm6
-	movdqa	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	movdqa	%xmm2, 0x20(%edx)
-	movdqa	%xmm3, 0x30(%edx)
-	movdqa	%xmm4, 0x40(%edx)
-	movdqa	%xmm5, 0x50(%edx)
-	movdqa	%xmm6, 0x60(%edx)
-	movdqa	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-
-	jae	L(shl_0_gobble_cache_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_cache_less_64bytes)
-
-	movdqa	(%eax), %xmm0
-	sub	$0x40, %ecx
-	movdqa	0x10(%eax), %xmm1
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	movdqa	0x20(%eax), %xmm0
-	movdqa	0x30(%eax), %xmm1
-	add	$0x40, %eax
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm1, 0x30(%edx)
-	add	$0x40, %edx
-
-L(shl_0_cache_less_64bytes):
-	cmp	$0x20, %ecx
-	jb	L(shl_0_cache_less_32bytes)
-	movdqa	(%eax), %xmm0
-	sub	$0x20, %ecx
-	movdqa	0x10(%eax), %xmm1
-	add	$0x20, %eax
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	add	$0x20, %edx
-
-L(shl_0_cache_less_32bytes):
-	cmp	$0x10, %ecx
-	jb	L(shl_0_cache_less_16bytes)
-	sub	$0x10, %ecx
-	movdqa	(%eax), %xmm0
-	add	$0x10, %eax
-	movdqa	%xmm0, (%edx)
-	add	$0x10, %edx
-
-L(shl_0_cache_less_16bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-	.p2align 4
-L(shl_0_gobble_mem_loop):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x280(%eax)
-	prefetcht0 0x1c0(%edx)
-
-	movdqa	(%eax), %xmm0
-	movdqa	0x10(%eax), %xmm1
-	movdqa	0x20(%eax), %xmm2
-	movdqa	0x30(%eax), %xmm3
-	movdqa	0x40(%eax), %xmm4
-	movdqa	0x50(%eax), %xmm5
-	movdqa	0x60(%eax), %xmm6
-	movdqa	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	sub	$0x80, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	movdqa	%xmm2, 0x20(%edx)
-	movdqa	%xmm3, 0x30(%edx)
-	movdqa	%xmm4, 0x40(%edx)
-	movdqa	%xmm5, 0x50(%edx)
-	movdqa	%xmm6, 0x60(%edx)
-	movdqa	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-
-	jae	L(shl_0_gobble_mem_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_mem_less_64bytes)
-
-	movdqa	(%eax), %xmm0
-	sub	$0x40, %ecx
-	movdqa	0x10(%eax), %xmm1
-
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-
-	movdqa	0x20(%eax), %xmm0
-	movdqa	0x30(%eax), %xmm1
-	add	$0x40, %eax
-
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm1, 0x30(%edx)
-	add	$0x40, %edx
-
-L(shl_0_mem_less_64bytes):
-	cmp	$0x20, %ecx
-	jb	L(shl_0_mem_less_32bytes)
-	movdqa	(%eax), %xmm0
-	sub	$0x20, %ecx
-	movdqa	0x10(%eax), %xmm1
-	add	$0x20, %eax
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	add	$0x20, %edx
-
-L(shl_0_mem_less_32bytes):
-	cmp	$0x10, %ecx
-	jb	L(shl_0_mem_less_16bytes)
-	sub	$0x10, %ecx
-	movdqa	(%eax), %xmm0
-	add	$0x10, %eax
-	movdqa	%xmm0, (%edx)
-	add	$0x10, %edx
-
-L(shl_0_mem_less_16bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
-
-	.p2align 4
-L(shl_1):
-#ifndef USE_AS_MEMMOVE
-	movaps	-1(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-1(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_1_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl1LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	15(%eax), %xmm2
-	movaps	31(%eax), %xmm3
-	movaps	47(%eax), %xmm4
-	movaps	63(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$1, %xmm4, %xmm5
-	palignr	$1, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$1, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl1LoopStart)
-
-L(Shl1LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	15(%eax), %xmm2
-	movaps	31(%eax), %xmm3
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_1_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-1(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_1_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_1_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_1_no_prefetch_loop)
-
-L(sh_1_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	1(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_2):
-#ifndef USE_AS_MEMMOVE
-	movaps	-2(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-2(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_2_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl2LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	14(%eax), %xmm2
-	movaps	30(%eax), %xmm3
-	movaps	46(%eax), %xmm4
-	movaps	62(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$2, %xmm4, %xmm5
-	palignr	$2, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$2, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl2LoopStart)
-
-L(Shl2LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	14(%eax), %xmm2
-	movaps	30(%eax), %xmm3
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_2_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-2(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_2_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_2_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_2_no_prefetch_loop)
-
-L(sh_2_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	2(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_3):
-#ifndef USE_AS_MEMMOVE
-	movaps	-3(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-3(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_3_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl3LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	13(%eax), %xmm2
-	movaps	29(%eax), %xmm3
-	movaps	45(%eax), %xmm4
-	movaps	61(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$3, %xmm4, %xmm5
-	palignr	$3, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$3, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl3LoopStart)
-
-L(Shl3LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	13(%eax), %xmm2
-	movaps	29(%eax), %xmm3
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_3_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-3(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_3_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(sh_3_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(sh_3_no_prefetch_loop)
-
-L(sh_3_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	3(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_4):
-#ifndef USE_AS_MEMMOVE
-	movaps	-4(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-4(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_4_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl4LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	12(%eax), %xmm2
-	movaps	28(%eax), %xmm3
-	movaps	44(%eax), %xmm4
-	movaps	60(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$4, %xmm4, %xmm5
-	palignr	$4, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$4, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl4LoopStart)
-
-L(Shl4LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	12(%eax), %xmm2
-	movaps	28(%eax), %xmm3
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_4_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-4(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_4_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(sh_4_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(sh_4_no_prefetch_loop)
-
-L(sh_4_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	4(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_5):
-#ifndef USE_AS_MEMMOVE
-	movaps	-5(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-5(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_5_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl5LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	11(%eax), %xmm2
-	movaps	27(%eax), %xmm3
-	movaps	43(%eax), %xmm4
-	movaps	59(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$5, %xmm4, %xmm5
-	palignr	$5, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$5, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl5LoopStart)
-
-L(Shl5LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	11(%eax), %xmm2
-	movaps	27(%eax), %xmm3
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_5_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-5(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_5_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(sh_5_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(sh_5_no_prefetch_loop)
-
-L(sh_5_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	5(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_6):
-#ifndef USE_AS_MEMMOVE
-	movaps	-6(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-6(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_6_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl6LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	10(%eax), %xmm2
-	movaps	26(%eax), %xmm3
-	movaps	42(%eax), %xmm4
-	movaps	58(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$6, %xmm4, %xmm5
-	palignr	$6, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$6, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl6LoopStart)
-
-L(Shl6LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	10(%eax), %xmm2
-	movaps	26(%eax), %xmm3
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_6_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-6(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_6_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(sh_6_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(sh_6_no_prefetch_loop)
-
-L(sh_6_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	6(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_7):
-#ifndef USE_AS_MEMMOVE
-	movaps	-7(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-7(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_7_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl7LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	9(%eax), %xmm2
-	movaps	25(%eax), %xmm3
-	movaps	41(%eax), %xmm4
-	movaps	57(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$7, %xmm4, %xmm5
-	palignr	$7, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$7, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl7LoopStart)
-
-L(Shl7LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	9(%eax), %xmm2
-	movaps	25(%eax), %xmm3
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_7_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-7(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_7_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_7_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_7_no_prefetch_loop)
-
-L(sh_7_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	7(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_8):
-#ifndef USE_AS_MEMMOVE
-	movaps	-8(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-8(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_8_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl8LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	8(%eax), %xmm2
-	movaps	24(%eax), %xmm3
-	movaps	40(%eax), %xmm4
-	movaps	56(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$8, %xmm4, %xmm5
-	palignr	$8, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$8, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl8LoopStart)
-
-L(LoopLeave8):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	8(%eax), %xmm2
-	movaps	24(%eax), %xmm3
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_8_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-8(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_8_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_8_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_8_no_prefetch_loop)
-
-L(sh_8_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	8(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_9):
-#ifndef USE_AS_MEMMOVE
-	movaps	-9(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-9(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_9_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl9LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	7(%eax), %xmm2
-	movaps	23(%eax), %xmm3
-	movaps	39(%eax), %xmm4
-	movaps	55(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$9, %xmm4, %xmm5
-	palignr	$9, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$9, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl9LoopStart)
-
-L(Shl9LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	7(%eax), %xmm2
-	movaps	23(%eax), %xmm3
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_9_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-9(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_9_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_9_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_9_no_prefetch_loop)
-
-L(sh_9_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	9(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_10):
-#ifndef USE_AS_MEMMOVE
-	movaps	-10(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-10(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_10_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl10LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	6(%eax), %xmm2
-	movaps	22(%eax), %xmm3
-	movaps	38(%eax), %xmm4
-	movaps	54(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$10, %xmm4, %xmm5
-	palignr	$10, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$10, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl10LoopStart)
-
-L(Shl10LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	6(%eax), %xmm2
-	movaps	22(%eax), %xmm3
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_10_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-10(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_10_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_10_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_10_no_prefetch_loop)
-
-L(sh_10_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	10(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_11):
-#ifndef USE_AS_MEMMOVE
-	movaps	-11(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-11(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_11_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl11LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	5(%eax), %xmm2
-	movaps	21(%eax), %xmm3
-	movaps	37(%eax), %xmm4
-	movaps	53(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$11, %xmm4, %xmm5
-	palignr	$11, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$11, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl11LoopStart)
-
-L(Shl11LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	5(%eax), %xmm2
-	movaps	21(%eax), %xmm3
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_11_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-11(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_11_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_11_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_11_no_prefetch_loop)
-
-L(sh_11_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	11(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_12):
-#ifndef USE_AS_MEMMOVE
-	movaps	-12(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-12(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_12_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl12LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	4(%eax), %xmm2
-	movaps	20(%eax), %xmm3
-	movaps	36(%eax), %xmm4
-	movaps	52(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$12, %xmm4, %xmm5
-	palignr	$12, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$12, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl12LoopStart)
-
-L(Shl12LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	4(%eax), %xmm2
-	movaps	20(%eax), %xmm3
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_12_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-12(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_12_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_12_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_12_no_prefetch_loop)
-
-L(sh_12_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	12(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_13):
-#ifndef USE_AS_MEMMOVE
-	movaps	-13(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-13(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_13_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl13LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	3(%eax), %xmm2
-	movaps	19(%eax), %xmm3
-	movaps	35(%eax), %xmm4
-	movaps	51(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$13, %xmm4, %xmm5
-	palignr	$13, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$13, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl13LoopStart)
-
-L(Shl13LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	3(%eax), %xmm2
-	movaps	19(%eax), %xmm3
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_13_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-13(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_13_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_13_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_13_no_prefetch_loop)
-
-L(sh_13_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	13(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_14):
-#ifndef USE_AS_MEMMOVE
-	movaps	-14(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-14(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_14_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl14LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	2(%eax), %xmm2
-	movaps	18(%eax), %xmm3
-	movaps	34(%eax), %xmm4
-	movaps	50(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$14, %xmm4, %xmm5
-	palignr	$14, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$14, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl14LoopStart)
-
-L(Shl14LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	2(%eax), %xmm2
-	movaps	18(%eax), %xmm3
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_14_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-14(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_14_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_14_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_14_no_prefetch_loop)
-
-L(sh_14_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	14(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_15):
-#ifndef USE_AS_MEMMOVE
-	movaps	-15(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-15(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_15_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl15LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	1(%eax), %xmm2
-	movaps	17(%eax), %xmm3
-	movaps	33(%eax), %xmm4
-	movaps	49(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$15, %xmm4, %xmm5
-	palignr	$15, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$15, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl15LoopStart)
-
-L(Shl15LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	1(%eax), %xmm2
-	movaps	17(%eax), %xmm3
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_15_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-15(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_15_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_15_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_15_no_prefetch_loop)
-
-L(sh_15_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	15(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_end_0):
-	lea	32(%ecx), %ecx
-	lea	(%edx, %ecx), %edx
-	lea	(%eax, %ecx), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	.p2align 4
-L(fwd_write_44bytes):
-	movq	-44(%eax), %xmm0
-	movq	%xmm0, -44(%edx)
-L(fwd_write_36bytes):
-	movq	-36(%eax), %xmm0
-	movq	%xmm0, -36(%edx)
-L(fwd_write_28bytes):
-	movq	-28(%eax), %xmm0
-	movq	%xmm0, -28(%edx)
-L(fwd_write_20bytes):
-	movq	-20(%eax), %xmm0
-	movq	%xmm0, -20(%edx)
-L(fwd_write_12bytes):
-	movq	-12(%eax), %xmm0
-	movq	%xmm0, -12(%edx)
-L(fwd_write_4bytes):
-	movl	-4(%eax), %ecx
-	movl	%ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_40bytes):
-	movq	-40(%eax), %xmm0
-	movq	%xmm0, -40(%edx)
-L(fwd_write_32bytes):
-	movq	-32(%eax), %xmm0
-	movq	%xmm0, -32(%edx)
-L(fwd_write_24bytes):
-	movq	-24(%eax), %xmm0
-	movq	%xmm0, -24(%edx)
-L(fwd_write_16bytes):
-	movq	-16(%eax), %xmm0
-	movq	%xmm0, -16(%edx)
-L(fwd_write_8bytes):
-	movq	-8(%eax), %xmm0
-	movq	%xmm0, -8(%edx)
-L(fwd_write_0bytes):
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_5bytes):
-	movl	-5(%eax), %ecx
-	movl	-4(%eax), %eax
-	movl	%ecx, -5(%edx)
-	movl	%eax, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_45bytes):
-	movq	-45(%eax), %xmm0
-	movq	%xmm0, -45(%edx)
-L(fwd_write_37bytes):
-	movq	-37(%eax), %xmm0
-	movq	%xmm0, -37(%edx)
-L(fwd_write_29bytes):
-	movq	-29(%eax), %xmm0
-	movq	%xmm0, -29(%edx)
-L(fwd_write_21bytes):
-	movq	-21(%eax), %xmm0
-	movq	%xmm0, -21(%edx)
-L(fwd_write_13bytes):
-	movq	-13(%eax), %xmm0
-	movq	%xmm0, -13(%edx)
-	movl	-5(%eax), %ecx
-	movl	%ecx, -5(%edx)
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_41bytes):
-	movq	-41(%eax), %xmm0
-	movq	%xmm0, -41(%edx)
-L(fwd_write_33bytes):
-	movq	-33(%eax), %xmm0
-	movq	%xmm0, -33(%edx)
-L(fwd_write_25bytes):
-	movq	-25(%eax), %xmm0
-	movq	%xmm0, -25(%edx)
-L(fwd_write_17bytes):
-	movq	-17(%eax), %xmm0
-	movq	%xmm0, -17(%edx)
-L(fwd_write_9bytes):
-	movq	-9(%eax), %xmm0
-	movq	%xmm0, -9(%edx)
-L(fwd_write_1bytes):
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_46bytes):
-	movq	-46(%eax), %xmm0
-	movq	%xmm0, -46(%edx)
-L(fwd_write_38bytes):
-	movq	-38(%eax), %xmm0
-	movq	%xmm0, -38(%edx)
-L(fwd_write_30bytes):
-	movq	-30(%eax), %xmm0
-	movq	%xmm0, -30(%edx)
-L(fwd_write_22bytes):
-	movq	-22(%eax), %xmm0
-	movq	%xmm0, -22(%edx)
-L(fwd_write_14bytes):
-	movq	-14(%eax), %xmm0
-	movq	%xmm0, -14(%edx)
-L(fwd_write_6bytes):
-	movl	-6(%eax), %ecx
-	movl	%ecx, -6(%edx)
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_42bytes):
-	movq	-42(%eax), %xmm0
-	movq	%xmm0, -42(%edx)
-L(fwd_write_34bytes):
-	movq	-34(%eax), %xmm0
-	movq	%xmm0, -34(%edx)
-L(fwd_write_26bytes):
-	movq	-26(%eax), %xmm0
-	movq	%xmm0, -26(%edx)
-L(fwd_write_18bytes):
-	movq	-18(%eax), %xmm0
-	movq	%xmm0, -18(%edx)
-L(fwd_write_10bytes):
-	movq	-10(%eax), %xmm0
-	movq	%xmm0, -10(%edx)
-L(fwd_write_2bytes):
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_47bytes):
-	movq	-47(%eax), %xmm0
-	movq	%xmm0, -47(%edx)
-L(fwd_write_39bytes):
-	movq	-39(%eax), %xmm0
-	movq	%xmm0, -39(%edx)
-L(fwd_write_31bytes):
-	movq	-31(%eax), %xmm0
-	movq	%xmm0, -31(%edx)
-L(fwd_write_23bytes):
-	movq	-23(%eax), %xmm0
-	movq	%xmm0, -23(%edx)
-L(fwd_write_15bytes):
-	movq	-15(%eax), %xmm0
-	movq	%xmm0, -15(%edx)
-L(fwd_write_7bytes):
-	movl	-7(%eax), %ecx
-	movl	%ecx, -7(%edx)
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_43bytes):
-	movq	-43(%eax), %xmm0
-	movq	%xmm0, -43(%edx)
-L(fwd_write_35bytes):
-	movq	-35(%eax), %xmm0
-	movq	%xmm0, -35(%edx)
-L(fwd_write_27bytes):
-	movq	-27(%eax), %xmm0
-	movq	%xmm0, -27(%edx)
-L(fwd_write_19bytes):
-	movq	-19(%eax), %xmm0
-	movq	%xmm0, -19(%edx)
-L(fwd_write_11bytes):
-	movq	-11(%eax), %xmm0
-	movq	%xmm0, -11(%edx)
-L(fwd_write_3bytes):
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_40bytes_align):
-	movdqa	-40(%eax), %xmm0
-	movdqa	%xmm0, -40(%edx)
-L(fwd_write_24bytes_align):
-	movdqa	-24(%eax), %xmm0
-	movdqa	%xmm0, -24(%edx)
-L(fwd_write_8bytes_align):
-	movq	-8(%eax), %xmm0
-	movq	%xmm0, -8(%edx)
-L(fwd_write_0bytes_align):
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_32bytes_align):
-	movdqa	-32(%eax), %xmm0
-	movdqa	%xmm0, -32(%edx)
-L(fwd_write_16bytes_align):
-	movdqa	-16(%eax), %xmm0
-	movdqa	%xmm0, -16(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_5bytes_align):
-	movl	-5(%eax), %ecx
-	movl	-4(%eax), %eax
-	movl	%ecx, -5(%edx)
-	movl	%eax, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_45bytes_align):
-	movdqa	-45(%eax), %xmm0
-	movdqa	%xmm0, -45(%edx)
-L(fwd_write_29bytes_align):
-	movdqa	-29(%eax), %xmm0
-	movdqa	%xmm0, -29(%edx)
-L(fwd_write_13bytes_align):
-	movq	-13(%eax), %xmm0
-	movq	%xmm0, -13(%edx)
-	movl	-5(%eax), %ecx
-	movl	%ecx, -5(%edx)
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_37bytes_align):
-	movdqa	-37(%eax), %xmm0
-	movdqa	%xmm0, -37(%edx)
-L(fwd_write_21bytes_align):
-	movdqa	-21(%eax), %xmm0
-	movdqa	%xmm0, -21(%edx)
-	movl	-5(%eax), %ecx
-	movl	%ecx, -5(%edx)
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_41bytes_align):
-	movdqa	-41(%eax), %xmm0
-	movdqa	%xmm0, -41(%edx)
-L(fwd_write_25bytes_align):
-	movdqa	-25(%eax), %xmm0
-	movdqa	%xmm0, -25(%edx)
-L(fwd_write_9bytes_align):
-	movq	-9(%eax), %xmm0
-	movq	%xmm0, -9(%edx)
-L(fwd_write_1bytes_align):
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_33bytes_align):
-	movdqa	-33(%eax), %xmm0
-	movdqa	%xmm0, -33(%edx)
-L(fwd_write_17bytes_align):
-	movdqa	-17(%eax), %xmm0
-	movdqa	%xmm0, -17(%edx)
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_46bytes_align):
-	movdqa	-46(%eax), %xmm0
-	movdqa	%xmm0, -46(%edx)
-L(fwd_write_30bytes_align):
-	movdqa	-30(%eax), %xmm0
-	movdqa	%xmm0, -30(%edx)
-L(fwd_write_14bytes_align):
-	movq	-14(%eax), %xmm0
-	movq	%xmm0, -14(%edx)
-L(fwd_write_6bytes_align):
-	movl	-6(%eax), %ecx
-	movl	%ecx, -6(%edx)
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_38bytes_align):
-	movdqa	-38(%eax), %xmm0
-	movdqa	%xmm0, -38(%edx)
-L(fwd_write_22bytes_align):
-	movdqa	-22(%eax), %xmm0
-	movdqa	%xmm0, -22(%edx)
-	movl	-6(%eax), %ecx
-	movl	%ecx, -6(%edx)
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_42bytes_align):
-	movdqa	-42(%eax), %xmm0
-	movdqa	%xmm0, -42(%edx)
-L(fwd_write_26bytes_align):
-	movdqa	-26(%eax), %xmm0
-	movdqa	%xmm0, -26(%edx)
-L(fwd_write_10bytes_align):
-	movq	-10(%eax), %xmm0
-	movq	%xmm0, -10(%edx)
-L(fwd_write_2bytes_align):
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_34bytes_align):
-	movdqa	-34(%eax), %xmm0
-	movdqa	%xmm0, -34(%edx)
-L(fwd_write_18bytes_align):
-	movdqa	-18(%eax), %xmm0
-	movdqa	%xmm0, -18(%edx)
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_47bytes_align):
-	movdqa	-47(%eax), %xmm0
-	movdqa	%xmm0, -47(%edx)
-L(fwd_write_31bytes_align):
-	movdqa	-31(%eax), %xmm0
-	movdqa	%xmm0, -31(%edx)
-L(fwd_write_15bytes_align):
-	movq	-15(%eax), %xmm0
-	movq	%xmm0, -15(%edx)
-L(fwd_write_7bytes_align):
-	movl	-7(%eax), %ecx
-	movl	%ecx, -7(%edx)
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_39bytes_align):
-	movdqa	-39(%eax), %xmm0
-	movdqa	%xmm0, -39(%edx)
-L(fwd_write_23bytes_align):
-	movdqa	-23(%eax), %xmm0
-	movdqa	%xmm0, -23(%edx)
-	movl	-7(%eax), %ecx
-	movl	%ecx, -7(%edx)
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_43bytes_align):
-	movdqa	-43(%eax), %xmm0
-	movdqa	%xmm0, -43(%edx)
-L(fwd_write_27bytes_align):
-	movdqa	-27(%eax), %xmm0
-	movdqa	%xmm0, -27(%edx)
-L(fwd_write_11bytes_align):
-	movq	-11(%eax), %xmm0
-	movq	%xmm0, -11(%edx)
-L(fwd_write_3bytes_align):
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_35bytes_align):
-	movdqa	-35(%eax), %xmm0
-	movdqa	%xmm0, -35(%edx)
-L(fwd_write_19bytes_align):
-	movdqa	-19(%eax), %xmm0
-	movdqa	%xmm0, -19(%edx)
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_44bytes_align):
-	movdqa	-44(%eax), %xmm0
-	movdqa	%xmm0, -44(%edx)
-L(fwd_write_28bytes_align):
-	movdqa	-28(%eax), %xmm0
-	movdqa	%xmm0, -28(%edx)
-L(fwd_write_12bytes_align):
-	movq	-12(%eax), %xmm0
-	movq	%xmm0, -12(%edx)
-L(fwd_write_4bytes_align):
-	movl	-4(%eax), %ecx
-	movl	%ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_36bytes_align):
-	movdqa	-36(%eax), %xmm0
-	movdqa	%xmm0, -36(%edx)
-L(fwd_write_20bytes_align):
-	movdqa	-20(%eax), %xmm0
-	movdqa	%xmm0, -20(%edx)
-	movl	-4(%eax), %ecx
-	movl	%ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN_END
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(large_page):
-	movdqu	(%eax), %xmm1
-#ifdef USE_AS_MEMMOVE
-	movl	DEST+4(%esp), %edi
-	movdqu	%xmm0, (%edi)
-#endif
-	lea	16(%eax), %eax
-	movntdq	%xmm1, (%edx)
-	lea	16(%edx), %edx
-	lea	-0x90(%ecx), %ecx
-	POP (%edi)
-
-	.p2align 4
-L(large_page_loop):
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	movdqu	0x20(%eax), %xmm2
-	movdqu	0x30(%eax), %xmm3
-	movdqu	0x40(%eax), %xmm4
-	movdqu	0x50(%eax), %xmm5
-	movdqu	0x60(%eax), %xmm6
-	movdqu	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-
-	sub	$0x80, %ecx
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	movntdq	%xmm2, 0x20(%edx)
-	movntdq	%xmm3, 0x30(%edx)
-	movntdq	%xmm4, 0x40(%edx)
-	movntdq	%xmm5, 0x50(%edx)
-	movntdq	%xmm6, 0x60(%edx)
-	movntdq	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-	jae	L(large_page_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(large_page_less_64bytes)
-
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	movdqu	0x20(%eax), %xmm2
-	movdqu	0x30(%eax), %xmm3
-	lea	0x40(%eax), %eax
-
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	movntdq	%xmm2, 0x20(%edx)
-	movntdq	%xmm3, 0x30(%edx)
-	lea	0x40(%edx), %edx
-	sub	$0x40, %ecx
-L(large_page_less_64bytes):
-	cmp	$32, %ecx
-	jb	L(large_page_less_32bytes)
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	lea	0x20(%eax), %eax
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	lea	0x20(%edx), %edx
-	sub	$0x20, %ecx
-L(large_page_less_32bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	sfence
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-	.p2align 4
-L(bk_write_44bytes):
-	movq	36(%eax), %xmm0
-	movq	%xmm0, 36(%edx)
-L(bk_write_36bytes):
-	movq	28(%eax), %xmm0
-	movq	%xmm0, 28(%edx)
-L(bk_write_28bytes):
-	movq	20(%eax), %xmm0
-	movq	%xmm0, 20(%edx)
-L(bk_write_20bytes):
-	movq	12(%eax), %xmm0
-	movq	%xmm0, 12(%edx)
-L(bk_write_12bytes):
-	movq	4(%eax), %xmm0
-	movq	%xmm0, 4(%edx)
-L(bk_write_4bytes):
-	movl	(%eax), %ecx
-	movl	%ecx, (%edx)
-L(bk_write_0bytes):
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_40bytes):
-	movq	32(%eax), %xmm0
-	movq	%xmm0, 32(%edx)
-L(bk_write_32bytes):
-	movq	24(%eax), %xmm0
-	movq	%xmm0, 24(%edx)
-L(bk_write_24bytes):
-	movq	16(%eax), %xmm0
-	movq	%xmm0, 16(%edx)
-L(bk_write_16bytes):
-	movq	8(%eax), %xmm0
-	movq	%xmm0, 8(%edx)
-L(bk_write_8bytes):
-	movq	(%eax), %xmm0
-	movq	%xmm0, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_45bytes):
-	movq	37(%eax), %xmm0
-	movq	%xmm0, 37(%edx)
-L(bk_write_37bytes):
-	movq	29(%eax), %xmm0
-	movq	%xmm0, 29(%edx)
-L(bk_write_29bytes):
-	movq	21(%eax), %xmm0
-	movq	%xmm0, 21(%edx)
-L(bk_write_21bytes):
-	movq	13(%eax), %xmm0
-	movq	%xmm0, 13(%edx)
-L(bk_write_13bytes):
-	movq	5(%eax), %xmm0
-	movq	%xmm0, 5(%edx)
-L(bk_write_5bytes):
-	movl	1(%eax), %ecx
-	movl	%ecx, 1(%edx)
-L(bk_write_1bytes):
-	movzbl	(%eax), %ecx
-	movb	%cl, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_41bytes):
-	movq	33(%eax), %xmm0
-	movq	%xmm0, 33(%edx)
-L(bk_write_33bytes):
-	movq	25(%eax), %xmm0
-	movq	%xmm0, 25(%edx)
-L(bk_write_25bytes):
-	movq	17(%eax), %xmm0
-	movq	%xmm0, 17(%edx)
-L(bk_write_17bytes):
-	movq	9(%eax), %xmm0
-	movq	%xmm0, 9(%edx)
-L(bk_write_9bytes):
-	movq	1(%eax), %xmm0
-	movq	%xmm0, 1(%edx)
-	movzbl	(%eax), %ecx
-	movb	%cl, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_46bytes):
-	movq	38(%eax), %xmm0
-	movq	%xmm0, 38(%edx)
-L(bk_write_38bytes):
-	movq	30(%eax), %xmm0
-	movq	%xmm0, 30(%edx)
-L(bk_write_30bytes):
-	movq	22(%eax), %xmm0
-	movq	%xmm0, 22(%edx)
-L(bk_write_22bytes):
-	movq	14(%eax), %xmm0
-	movq	%xmm0, 14(%edx)
-L(bk_write_14bytes):
-	movq	6(%eax), %xmm0
-	movq	%xmm0, 6(%edx)
-L(bk_write_6bytes):
-	movl	2(%eax), %ecx
-	movl	%ecx, 2(%edx)
-	movzwl	(%eax), %ecx
-	movw	%cx, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_42bytes):
-	movq	34(%eax), %xmm0
-	movq	%xmm0, 34(%edx)
-L(bk_write_34bytes):
-	movq	26(%eax), %xmm0
-	movq	%xmm0, 26(%edx)
-L(bk_write_26bytes):
-	movq	18(%eax), %xmm0
-	movq	%xmm0, 18(%edx)
-L(bk_write_18bytes):
-	movq	10(%eax), %xmm0
-	movq	%xmm0, 10(%edx)
-L(bk_write_10bytes):
-	movq	2(%eax), %xmm0
-	movq	%xmm0, 2(%edx)
-L(bk_write_2bytes):
-	movzwl	(%eax), %ecx
-	movw	%cx, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_47bytes):
-	movq	39(%eax), %xmm0
-	movq	%xmm0, 39(%edx)
-L(bk_write_39bytes):
-	movq	31(%eax), %xmm0
-	movq	%xmm0, 31(%edx)
-L(bk_write_31bytes):
-	movq	23(%eax), %xmm0
-	movq	%xmm0, 23(%edx)
-L(bk_write_23bytes):
-	movq	15(%eax), %xmm0
-	movq	%xmm0, 15(%edx)
-L(bk_write_15bytes):
-	movq	7(%eax), %xmm0
-	movq	%xmm0, 7(%edx)
-L(bk_write_7bytes):
-	movl	3(%eax), %ecx
-	movl	%ecx, 3(%edx)
-	movzwl	1(%eax), %ecx
-	movw	%cx, 1(%edx)
-	movzbl	(%eax), %eax
-	movb	%al, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_43bytes):
-	movq	35(%eax), %xmm0
-	movq	%xmm0, 35(%edx)
-L(bk_write_35bytes):
-	movq	27(%eax), %xmm0
-	movq	%xmm0, 27(%edx)
-L(bk_write_27bytes):
-	movq	19(%eax), %xmm0
-	movq	%xmm0, 19(%edx)
-L(bk_write_19bytes):
-	movq	11(%eax), %xmm0
-	movq	%xmm0, 11(%edx)
-L(bk_write_11bytes):
-	movq	3(%eax), %xmm0
-	movq	%xmm0, 3(%edx)
-L(bk_write_3bytes):
-	movzwl	1(%eax), %ecx
-	movw	%cx, 1(%edx)
-	movzbl	(%eax), %eax
-	movb	%al, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN_END
-
-
-	.pushsection .rodata.ssse3,"a",@progbits
-	.p2align 2
-L(table_48bytes_fwd):
-	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
-
-	.p2align 2
-L(table_48bytes_fwd_align):
-	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
-
-	.p2align 2
-L(shl_table):
-	.int	JMPTBL (L(shl_0), L(shl_table))
-	.int	JMPTBL (L(shl_1), L(shl_table))
-	.int	JMPTBL (L(shl_2), L(shl_table))
-	.int	JMPTBL (L(shl_3), L(shl_table))
-	.int	JMPTBL (L(shl_4), L(shl_table))
-	.int	JMPTBL (L(shl_5), L(shl_table))
-	.int	JMPTBL (L(shl_6), L(shl_table))
-	.int	JMPTBL (L(shl_7), L(shl_table))
-	.int	JMPTBL (L(shl_8), L(shl_table))
-	.int	JMPTBL (L(shl_9), L(shl_table))
-	.int	JMPTBL (L(shl_10), L(shl_table))
-	.int	JMPTBL (L(shl_11), L(shl_table))
-	.int	JMPTBL (L(shl_12), L(shl_table))
-	.int	JMPTBL (L(shl_13), L(shl_table))
-	.int	JMPTBL (L(shl_14), L(shl_table))
-	.int	JMPTBL (L(shl_15), L(shl_table))
-
-	.p2align 2
-L(table_48_bytes_bwd):
-	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
-
-	.popsection
-
-#ifdef USE_AS_MEMMOVE
-	.p2align 4
-L(copy_backward):
-	PUSH (%edi)
-	movl	%eax, %edi
-	lea	(%ecx,%edx,1),%edx
-	lea	(%ecx,%edi,1),%edi
-	testl	$0x3, %edx
-	jnz	L(bk_align)
-
-L(bk_aligned_4):
-	cmp	$64, %ecx
-	jae	L(bk_write_more64bytes)
-
-L(bk_write_64bytesless):
-	cmp	$32, %ecx
-	jb	L(bk_write_less32bytes)
-
-L(bk_write_more32bytes):
-	/* Copy 32 bytes at a time.  */
-	sub	$32, %ecx
-	movq	-8(%edi), %xmm0
-	movq	%xmm0, -8(%edx)
-	movq	-16(%edi), %xmm0
-	movq	%xmm0, -16(%edx)
-	movq	-24(%edi), %xmm0
-	movq	%xmm0, -24(%edx)
-	movq	-32(%edi), %xmm0
-	movq	%xmm0, -32(%edx)
-	sub	$32, %edx
-	sub	$32, %edi
-
-L(bk_write_less32bytes):
-	movl	%edi, %eax
-	sub	%ecx, %edx
-	sub	%ecx, %eax
-	POP (%edi)
-L(bk_write_less32bytes_2):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(bk_align):
-	cmp	$8, %ecx
-	jbe	L(bk_write_less32bytes)
-	testl	$1, %edx
-	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
-	then	(EDX & 2) must be != 0.  */
-	jz	L(bk_got2)
-	sub	$1, %edi
-	sub	$1, %ecx
-	sub	$1, %edx
-	movzbl	(%edi), %eax
-	movb	%al, (%edx)
-
-	testl	$2, %edx
-	jz	L(bk_aligned_4)
-
-L(bk_got2):
-	sub	$2, %edi
-	sub	$2, %ecx
-	sub	$2, %edx
-	movzwl	(%edi), %eax
-	movw	%ax, (%edx)
-	jmp	L(bk_aligned_4)
-
-	.p2align 4
-L(bk_write_more64bytes):
-	/* Check alignment of last byte.  */
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-/* EDX is aligned 4 bytes, but not 16 bytes.  */
-L(bk_ssse3_align):
-	sub	$4, %edi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%edi), %eax
-	movl	%eax, (%edx)
-
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-	sub	$4, %edi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%edi), %eax
-	movl	%eax, (%edx)
-
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-	sub	$4, %edi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%edi), %eax
-	movl	%eax, (%edx)
-
-L(bk_ssse3_cpy_pre):
-	cmp	$64, %ecx
-	jb	L(bk_write_more32bytes)
-
-	.p2align 4
-L(bk_ssse3_cpy):
-	sub	$64, %edi
-	sub	$64, %ecx
-	sub	$64, %edx
-	movdqu	0x30(%edi), %xmm3
-	movdqa	%xmm3, 0x30(%edx)
-	movdqu	0x20(%edi), %xmm2
-	movdqa	%xmm2, 0x20(%edx)
-	movdqu	0x10(%edi), %xmm1
-	movdqa	%xmm1, 0x10(%edx)
-	movdqu	(%edi), %xmm0
-	movdqa	%xmm0, (%edx)
-	cmp	$64, %ecx
-	jae	L(bk_ssse3_cpy)
-	jmp	L(bk_write_64bytesless)
-
-#endif
-
-END (MEMCPY)
diff --git a/libc/arch-x86/string/ssse3-memmove-atom.S b/libc/arch-x86/string/ssse3-memmove-atom.S
deleted file mode 100644
index 3572eac..0000000
--- a/libc/arch-x86/string/ssse3-memmove-atom.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#define MEMCPY memmove_atom
-#define USE_AS_MEMMOVE
-#include "ssse3-memcpy-atom.S"
diff --git a/libc/arch-x86/string/ssse3-strncpy-atom.S b/libc/arch-x86/string/ssse3-strncpy-atom.S
deleted file mode 100644
index 0c27ffe..0000000
--- a/libc/arch-x86/string/ssse3-strncpy-atom.S
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-Copyright (c) 2011, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define USE_AS_STRNCPY
-#define STRCPY strncpy_atom
-#include "ssse3-strcpy-atom.S"
diff --git a/libc/include/bits/fortify/string.h b/libc/include/bits/fortify/string.h
index 4d32b04..041967b 100644
--- a/libc/include/bits/fortify/string.h
+++ b/libc/include/bits/fortify/string.h
@@ -220,8 +220,13 @@
 }
 
 #if __BIONIC_FORTIFY_RUNTIME_CHECKS_ENABLED
-__BIONIC_FORTIFY_INLINE
-size_t strlen(const char* _Nonnull const s __pass_object_size0) __overloadable {
+/*
+ * Clang, when parsing C, can fold strlen to a constant without LLVM's help.
+ * This doesn't apply to overloads of strlen, so write this differently. We
+ * can't use `__pass_object_size0` here, but that's fine: it doesn't help much
+ * on __always_inline functions.
+ */
+extern __always_inline __inline__ __attribute__((gnu_inline)) size_t strlen(const char* _Nonnull s) {
     return __strlen_chk(s, __bos0(s));
 }
 #endif
diff --git a/libc/include/unistd.h b/libc/include/unistd.h
index e1c268f..9bc01f0 100644
--- a/libc/include/unistd.h
+++ b/libc/include/unistd.h
@@ -257,8 +257,29 @@
 int linkat(int __old_dir_fd, const char* _Nonnull __old_path, int __new_dir_fd, const char* _Nonnull __new_path, int __flags);
 int unlink(const char* _Nonnull __path);
 int unlinkat(int __dirfd, const char* _Nonnull __path, int __flags);
+
+/**
+ * [chdir(2)](https://man7.org/linux/man-pages/man2/chdir.2.html) changes
+ * the current working directory to the given path.
+ *
+ * This function affects all threads in the process, so is generally a bad idea
+ * on Android where most code will be running in a multi-threaded context.
+ *
+ * Returns 0 on success, and returns -1 and sets `errno` on failure.
+ */
 int chdir(const char* _Nonnull __path);
+
+/**
+ * [fchdir(2)](https://man7.org/linux/man-pages/man2/chdir.2.html) changes
+ * the current working directory to the given fd.
+ *
+ * This function affects all threads in the process, so is generally a bad idea
+ * on Android where most code will be running in a multi-threaded context.
+ *
+ * Returns 0 on success, and returns -1 and sets `errno` on failure.
+ */
 int fchdir(int __fd);
+
 int rmdir(const char* _Nonnull __path);
 int pipe(int __fds[_Nonnull 2]);
 #if defined(__USE_GNU)
diff --git a/libc/kernel/tools/update_all.py b/libc/kernel/tools/update_all.py
index ae89a80..9e5ed42 100755
--- a/libc/kernel/tools/update_all.py
+++ b/libc/kernel/tools/update_all.py
@@ -92,7 +92,8 @@
                      'kernel/uapi/asm-arm/asm/unistd.h',
                      'kernel/uapi/asm-arm/asm/unistd-eabi.h',
                      'kernel/uapi/asm-arm/asm/unistd-oabi.h',
-                     'kernel/uapi/asm-riscv/asm/unistd.h',
+                     'kernel/uapi/asm-riscv/asm/unistd_32.h',
+                     'kernel/uapi/asm-riscv/asm/unistd_64.h',
                      'kernel/uapi/asm-x86/asm/unistd_32.h',
                      'kernel/uapi/asm-x86/asm/unistd_64.h',
                      'kernel/uapi/asm-x86/asm/unistd_x32.h']:
diff --git a/linker/linker_phdr.cpp b/linker/linker_phdr.cpp
index 48206be..b7db4cd 100644
--- a/linker/linker_phdr.cpp
+++ b/linker/linker_phdr.cpp
@@ -976,7 +976,6 @@
     ElfW(Addr) seg_start = phdr->p_vaddr + load_bias_;
     ElfW(Addr) seg_end = seg_start + p_memsz;
 
-    ElfW(Addr) seg_page_start = page_start(seg_start);
     ElfW(Addr) seg_page_end = page_end(seg_end);
 
     ElfW(Addr) seg_file_end = seg_start + p_filesz;
diff --git a/tests/Android.bp b/tests/Android.bp
index 6aecb40..0bd4a32 100644
--- a/tests/Android.bp
+++ b/tests/Android.bp
@@ -746,6 +746,40 @@
     },
 }
 
+cc_defaults {
+    name: "bionic_fortify_c_tests_defaults",
+    defaults: [
+        "bionic_clang_fortify_tests_w_flags",
+        "bionic_tests_defaults",
+    ],
+    cflags: [
+        "-U_FORTIFY_SOURCE",
+        // -fbuiltin is required here to counteract -fno-builtin from
+        // `bionic_tests_defaults`. With `-fno-builtin`, Clang won't
+        // const-evaluate calls to `strlen`, which is tested for here.
+        "-fbuiltin",
+    ],
+    srcs: [
+        "clang_fortify_c_only_tests.c",
+    ],
+    tidy: false,
+    shared: {
+        enabled: false,
+    },
+}
+
+cc_test_library {
+    name: "libfortify1-c-tests-clang",
+    defaults: ["bionic_fortify_c_tests_defaults"],
+    cflags: ["-D_FORTIFY_SOURCE=1"],
+}
+
+cc_test_library {
+    name: "libfortify2-c-tests-clang",
+    defaults: ["bionic_fortify_c_tests_defaults"],
+    cflags: ["-D_FORTIFY_SOURCE=2"],
+}
+
 // -----------------------------------------------------------------------------
 // Library of all tests (excluding the dynamic linker tests).
 // -----------------------------------------------------------------------------
@@ -757,8 +791,10 @@
         "libBionicStandardTests",
         "libBionicElfTlsTests",
         "libBionicFramePointerTests",
+        "libfortify1-c-tests-clang",
         "libfortify1-tests-clang",
         "libfortify1-new-tests-clang",
+        "libfortify2-c-tests-clang",
         "libfortify2-tests-clang",
         "libfortify2-new-tests-clang",
     ],
diff --git a/tests/clang_fortify_c_only_tests.c b/tests/clang_fortify_c_only_tests.c
new file mode 100644
index 0000000..3bec848
--- /dev/null
+++ b/tests/clang_fortify_c_only_tests.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2024 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <string.h>
+
+// This is a test specifically of bionic's FORTIFY machinery. Other stdlibs need not apply.
+#ifdef __BIONIC__
+
+// Ensure that strlen can be evaluated at compile-time. Clang doesn't support
+// this in C++, but does in C.
+_Static_assert(strlen("foo") == 3, "");
+
+#endif  // __BIONIC__
diff --git a/tests/clang_fortify_tests.cpp b/tests/clang_fortify_tests.cpp
index f08fd1f..105c261 100644
--- a/tests/clang_fortify_tests.cpp
+++ b/tests/clang_fortify_tests.cpp
@@ -89,6 +89,8 @@
 #include <unistd.h>
 #include <wchar.h>
 
+#include <array>
+
 #ifndef COMPILATION_TESTS
 #include <android-base/silent_death_test.h>
 #include <gtest/gtest.h>
@@ -133,6 +135,25 @@
 
 const static int kBogusFD = -1;
 
+FORTIFY_TEST(strlen) {
+  auto run_strlen_with_contents = [&](std::array<char, 3> contents) {
+    // A lot of cruft is necessary to make this test DTRT. LLVM and Clang love to fold/optimize
+    // strlen calls, and that's the opposite of what we want to happen.
+
+    // Loop to convince LLVM that `contents` can never be known (since `xor volatile_value` can flip
+    // any bit in each elem of `contents`).
+    volatile char always_zero = 0;
+    for (char& c : contents) {
+      c ^= always_zero;
+    }
+    // Store in a volatile, so the strlen itself cannot be optimized out.
+    volatile size_t _strlen_result = strlen(&contents.front());
+  };
+
+  EXPECT_NO_DEATH(run_strlen_with_contents({'f', 'o', '\0'}));
+  EXPECT_FORTIFY_DEATH(run_strlen_with_contents({'f', 'o', 'o'}));
+}
+
 FORTIFY_TEST(string) {
   char small_buffer[8] = {};
 
diff --git a/tests/cpu_target_features_test.cpp b/tests/cpu_target_features_test.cpp
index 3458bca..d773772 100644
--- a/tests/cpu_target_features_test.cpp
+++ b/tests/cpu_target_features_test.cpp
@@ -54,15 +54,3 @@
   GTEST_SKIP() << "Not targeting an aarch64 architecture.";
 #endif
 }
-
-TEST(cpu_target_features, has_expected_arm_compiler_values) {
-#if defined(__arm__)
-  ExecTestHelper eth;
-  char* const argv[] = {nullptr};
-  const auto invocation = [&] { execvp("cpu-target-features", argv); };
-  eth.Run(invocation, 0, "(^|\n)__ARM_FEATURE_AES=1($|\n)");
-  eth.Run(invocation, 0, "(^|\n)__ARM_FEATURE_CRC32=1($|\n)");
-#else
-  GTEST_SKIP() << "Not targeting an arm architecture.";
-#endif
-}