libc: remove most dynamically-selected x86 atom functions

The 'generic' versions of these all have newer impls from Intel, and
seemingly use the same feature baseline (sse2 or ssse3). There's
no obvious reason to carry these old impls around.

While I'm here, also remove the ssse3-memcpy. While that might be
a bit faster, `s/memmove/memcpy` seems slightly risky for 32-bit x86.

Bug: 358360849
Test: mma
Change-Id: I62cbf638bf4d9fdb799af93168b0cc0ea841526f
diff --git a/libc/Android.bp b/libc/Android.bp
index a4f2c69..532ed4d 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1186,7 +1186,6 @@
                 "arch-x86/string/sse2-memchr-atom.S",
                 "arch-x86/string/sse2-memmove-slm.S",
                 "arch-x86/string/sse2-memrchr-atom.S",
-                "arch-x86/string/sse2-memset-atom.S",
                 "arch-x86/string/sse2-memset-slm.S",
                 "arch-x86/string/sse2-stpcpy-slm.S",
                 "arch-x86/string/sse2-stpncpy-slm.S",
@@ -1200,18 +1199,14 @@
                 "arch-x86/string/sse2-wcsrchr-atom.S",
                 "arch-x86/string/sse2-wcslen-atom.S",
                 "arch-x86/string/sse2-wcscmp-atom.S",
-                "arch-x86/string/sse2-strlen-atom.S",
 
                 "arch-x86/string/ssse3-memcmp-atom.S",
-                "arch-x86/string/ssse3-memmove-atom.S",
                 "arch-x86/string/ssse3-strcat-atom.S",
                 "arch-x86/string/ssse3-strcmp-atom.S",
-                "arch-x86/string/ssse3-strcpy-atom.S",
                 "arch-x86/string/ssse3-strlcat-atom.S",
                 "arch-x86/string/ssse3-strlcpy-atom.S",
                 "arch-x86/string/ssse3-strncat-atom.S",
                 "arch-x86/string/ssse3-strncmp-atom.S",
-                "arch-x86/string/ssse3-strncpy-atom.S",
                 "arch-x86/string/ssse3-wcscat-atom.S",
                 "arch-x86/string/ssse3-wcscpy-atom.S",
                 "arch-x86/string/ssse3-wmemcmp-atom.S",
diff --git a/libc/arch-x86/dynamic_function_dispatch.cpp b/libc/arch-x86/dynamic_function_dispatch.cpp
index e6cc5fb..db64237 100644
--- a/libc/arch-x86/dynamic_function_dispatch.cpp
+++ b/libc/arch-x86/dynamic_function_dispatch.cpp
@@ -39,51 +39,6 @@
 }
 MEMCMP_SHIM()
 
-DEFINE_IFUNC_FOR(memset) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(memset_func_t, memset_atom);
-  RETURN_FUNC(memset_func_t, memset_generic);
-}
-MEMSET_SHIM()
-
-DEFINE_IFUNC_FOR(__memset_chk) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(__memset_chk_func_t, __memset_chk_atom);
-  RETURN_FUNC(__memset_chk_func_t, __memset_chk_generic);
-}
-__MEMSET_CHK_SHIM()
-
-DEFINE_IFUNC_FOR(memmove) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(memmove_func_t, memmove_atom);
-  RETURN_FUNC(memmove_func_t, memmove_generic);
-}
-MEMMOVE_SHIM()
-
-DEFINE_IFUNC_FOR(memcpy) { return memmove_resolver(); }
-MEMCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strcpy) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(strcpy_func_t, strcpy_atom);
-  RETURN_FUNC(strcpy_func_t, strcpy_generic);
-}
-STRCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strncpy) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(strncpy_func_t, strncpy_atom);
-  RETURN_FUNC(strncpy_func_t, strncpy_generic);
-}
-STRNCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strlen) {
-  __builtin_cpu_init();
-  if (__builtin_cpu_is("atom")) RETURN_FUNC(strlen_func_t, strlen_atom);
-  RETURN_FUNC(strlen_func_t, strlen_generic);
-}
-STRLEN_SHIM()
-
 typedef int wmemcmp_func_t(const wchar_t*, const wchar_t*, size_t);
 DEFINE_IFUNC_FOR(wmemcmp) {
   __builtin_cpu_init();
diff --git a/libc/arch-x86/string/sse2-memmove-slm.S b/libc/arch-x86/string/sse2-memmove-slm.S
index 7f42374..2ed4e7b 100644
--- a/libc/arch-x86/string/sse2-memmove-slm.S
+++ b/libc/arch-x86/string/sse2-memmove-slm.S
@@ -31,7 +31,7 @@
 #define FOR_SILVERMONT
 
 #ifndef MEMMOVE
-# define MEMMOVE	memmove_generic
+# define MEMMOVE	memmove
 #endif
 
 #ifndef L
@@ -551,3 +551,9 @@
 	jmp	L(mm_recalc_len)
 
 END (MEMMOVE)
+
+// N.B., `private/bionic_asm.h` provides ALIAS_SYMBOL, but that file provides
+// conflicting definitions for some macros in this file. Since ALIAS_SYMBOL is
+// small, inline it here.
+.globl memcpy;
+.equ memcpy, MEMMOVE
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/string/sse2-memset-atom.S
deleted file mode 100644
index e43ead0..0000000
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#include <private/bionic_asm.h>
-
-#define FOR_ATOM
-
-#ifndef L
-# define L(label)	.L##label
-#endif
-
-#ifndef ALIGN
-# define ALIGN(n)	.p2align n
-#endif
-
-#define CFI_PUSH(REG)						\
-  .cfi_adjust_cfa_offset 4;					\
-  .cfi_rel_offset REG, 0
-
-#define CFI_POP(REG)						\
-  .cfi_adjust_cfa_offset -4;					\
-  .cfi_restore REG
-
-#define PUSH(REG)	pushl REG; CFI_PUSH(REG)
-#define POP(REG)	popl REG; CFI_POP(REG)
-
-#define PARMS 8  /* Preserve EBX. */
-#define DST PARMS
-#define CHR (DST+4)
-#define LEN (CHR+4)
-#define CHK_DST_LEN (LEN+4)
-#define SETRTNVAL	movl DST(%esp), %eax
-
-#define ENTRANCE	PUSH(%ebx);
-#define RETURN_END	POP(%ebx); ret
-#define RETURN		RETURN_END; CFI_PUSH(%ebx)
-#define JMPTBL(I, B)	I - B
-
-#define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
-
-/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.   */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
-    /* We first load PC into EBX.  */				\
-    call	__x86.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    add		$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    add		(%ebx,%ecx,4), %ebx;				\
-    add		%ecx, %edx;					\
-    /* We loaded the jump table and adjusted EDX. Go.  */	\
-    jmp		*%ebx
-
-ENTRY(__memset_chk_atom)
-  ENTRANCE
-
-  movl LEN(%esp), %ecx
-  cmpl CHK_DST_LEN(%esp), %ecx
-  jna L(memset_length_loaded)
-
-  POP(%ebx) // Undo ENTRANCE without returning.
-  jmp __memset_chk_fail
-END(__memset_chk_atom)
-
-	.section .text.sse2,"ax",@progbits
-	ALIGN(4)
-ENTRY(memset_atom)
-	ENTRANCE
-
-	movl	LEN(%esp), %ecx
-L(memset_length_loaded):
-	movzbl	CHR(%esp), %eax
-	movb	%al, %ah
-	/* Fill the whole EAX with pattern.  */
-	movl	%eax, %edx
-	shl	$16, %eax
-	or	%edx, %eax
-	movl	DST(%esp), %edx
-	cmp	$32, %ecx
-	jae	L(32bytesormore)
-
-L(write_less32bytes):
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_32bytes))
-
-
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN(2)
-L(table_less_32bytes):
-	.int	JMPTBL(L(write_0bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_1bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_2bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_3bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_4bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_5bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_6bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_7bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_8bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_9bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_10bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_11bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_12bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_13bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_14bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_15bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_16bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_17bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_18bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_19bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_20bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_21bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_22bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_23bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_24bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_25bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_26bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_27bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_28bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_29bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_30bytes), L(table_less_32bytes))
-	.int	JMPTBL(L(write_31bytes), L(table_less_32bytes))
-	.popsection
-
-	ALIGN(4)
-L(write_28bytes):
-	movl	%eax, -28(%edx)
-L(write_24bytes):
-	movl	%eax, -24(%edx)
-L(write_20bytes):
-	movl	%eax, -20(%edx)
-L(write_16bytes):
-	movl	%eax, -16(%edx)
-L(write_12bytes):
-	movl	%eax, -12(%edx)
-L(write_8bytes):
-	movl	%eax, -8(%edx)
-L(write_4bytes):
-	movl	%eax, -4(%edx)
-L(write_0bytes):
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(write_29bytes):
-	movl	%eax, -29(%edx)
-L(write_25bytes):
-	movl	%eax, -25(%edx)
-L(write_21bytes):
-	movl	%eax, -21(%edx)
-L(write_17bytes):
-	movl	%eax, -17(%edx)
-L(write_13bytes):
-	movl	%eax, -13(%edx)
-L(write_9bytes):
-	movl	%eax, -9(%edx)
-L(write_5bytes):
-	movl	%eax, -5(%edx)
-L(write_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(write_30bytes):
-	movl	%eax, -30(%edx)
-L(write_26bytes):
-	movl	%eax, -26(%edx)
-L(write_22bytes):
-	movl	%eax, -22(%edx)
-L(write_18bytes):
-	movl	%eax, -18(%edx)
-L(write_14bytes):
-	movl	%eax, -14(%edx)
-L(write_10bytes):
-	movl	%eax, -10(%edx)
-L(write_6bytes):
-	movl	%eax, -6(%edx)
-L(write_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(write_31bytes):
-	movl	%eax, -31(%edx)
-L(write_27bytes):
-	movl	%eax, -27(%edx)
-L(write_23bytes):
-	movl	%eax, -23(%edx)
-L(write_19bytes):
-	movl	%eax, -19(%edx)
-L(write_15bytes):
-	movl	%eax, -15(%edx)
-L(write_11bytes):
-	movl	%eax, -11(%edx)
-L(write_7bytes):
-	movl	%eax, -7(%edx)
-L(write_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
-L(32bytesormore):
-	/* Fill xmm0 with the pattern.  */
-	movd	%eax, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-	testl	$0xf, %edx
-	jz	L(aligned_16)
-/* ECX > 32 and EDX is not 16 byte aligned.  */
-L(not_aligned_16):
-	movdqu	%xmm0, (%edx)
-	movl	%edx, %eax
-	and	$-16, %edx
-	add	$16, %edx
-	sub	%edx, %eax
-	add	%eax, %ecx
-	movd	%xmm0, %eax
-
-	ALIGN(4)
-L(aligned_16):
-	cmp	$128, %ecx
-	jae	L(128bytesormore)
-
-L(aligned_16_less128bytes):
-	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-	ALIGN(4)
-L(128bytesormore):
-	PUSH(%ebx)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
-	cmp	%ebx, %ecx
-	jae	L(128bytesormore_nt_start)
-
-
-	POP(%ebx)
-# define RESTORE_EBX_STATE CFI_PUSH(%ebx)
-	PUSH(%ebx)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
-	POP(%ebx)
-
-	jae	L(128bytes_L2_normal)
-	subl	$128, %ecx
-L(128bytesormore_normal):
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	lea	128(%edx), %edx
-	jb	L(128bytesless_normal)
-
-
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	lea	128(%edx), %edx
-	jae	L(128bytesormore_normal)
-
-L(128bytesless_normal):
-	add	$128, %ecx
-	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-	ALIGN(4)
-L(128bytes_L2_normal):
-	prefetcht0	0x380(%edx)
-	prefetcht0	0x3c0(%edx)
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movaps	%xmm0, 0x10(%edx)
-	movaps	%xmm0, 0x20(%edx)
-	movaps	%xmm0, 0x30(%edx)
-	movaps	%xmm0, 0x40(%edx)
-	movaps	%xmm0, 0x50(%edx)
-	movaps	%xmm0, 0x60(%edx)
-	movaps	%xmm0, 0x70(%edx)
-	add	$128, %edx
-	cmp	$128, %ecx
-	jae	L(128bytes_L2_normal)
-
-L(128bytesless_L2_normal):
-	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-	RESTORE_EBX_STATE
-L(128bytesormore_nt_start):
-	sub	%ebx, %ecx
-	mov	%ebx, %eax
-	and	$0x7f, %eax
-	add	%eax, %ecx
-	movd	%xmm0, %eax
-	ALIGN(4)
-L(128bytesormore_shared_cache_loop):
-	prefetcht0	0x3c0(%edx)
-	prefetcht0	0x380(%edx)
-	sub	$0x80, %ebx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	add	$0x80, %edx
-	cmp	$0x80, %ebx
-	jae	L(128bytesormore_shared_cache_loop)
-	cmp	$0x80, %ecx
-	jb	L(shared_cache_loop_end)
-	ALIGN(4)
-L(128bytesormore_nt):
-	sub	$0x80, %ecx
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm0, 0x10(%edx)
-	movntdq	%xmm0, 0x20(%edx)
-	movntdq	%xmm0, 0x30(%edx)
-	movntdq	%xmm0, 0x40(%edx)
-	movntdq	%xmm0, 0x50(%edx)
-	movntdq	%xmm0, 0x60(%edx)
-	movntdq	%xmm0, 0x70(%edx)
-	add	$0x80, %edx
-	cmp	$0x80, %ecx
-	jae	L(128bytesormore_nt)
-	sfence
-L(shared_cache_loop_end):
-	POP(%ebx)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN(2)
-L(table_16_128bytes):
-	.int	JMPTBL(L(aligned_16_0bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_1bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_2bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_3bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_4bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_5bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_6bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_7bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_8bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_9bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_10bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_11bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_12bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_13bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_14bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_15bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_16bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_17bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_18bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_19bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_20bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_21bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_22bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_23bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_24bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_25bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_26bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_27bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_28bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_29bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_30bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_31bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_32bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_33bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_34bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_35bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_36bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_37bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_38bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_39bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_40bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_41bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_42bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_43bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_44bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_45bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_46bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_47bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_48bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_49bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_50bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_51bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_52bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_53bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_54bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_55bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_56bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_57bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_58bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_59bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_60bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_61bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_62bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_63bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_64bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_65bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_66bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_67bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_68bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_69bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_70bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_71bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_72bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_73bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_74bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_75bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_76bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_77bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_78bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_79bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_80bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_81bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_82bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_83bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_84bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_85bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_86bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_87bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_88bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_89bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_90bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_91bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_92bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_93bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_94bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_95bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_96bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_97bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_98bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_99bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_100bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_101bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_102bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_103bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_104bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_105bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_106bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_107bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_108bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_109bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_110bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_111bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_112bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_113bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_114bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_115bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_116bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_117bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_118bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_119bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_120bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_121bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_122bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_123bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_124bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_125bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_126bytes), L(table_16_128bytes))
-	.int	JMPTBL(L(aligned_16_127bytes), L(table_16_128bytes))
-	.popsection
-
-	ALIGN(4)
-L(aligned_16_112bytes):
-	movdqa	%xmm0, -112(%edx)
-L(aligned_16_96bytes):
-	movdqa	%xmm0, -96(%edx)
-L(aligned_16_80bytes):
-	movdqa	%xmm0, -80(%edx)
-L(aligned_16_64bytes):
-	movdqa	%xmm0, -64(%edx)
-L(aligned_16_48bytes):
-	movdqa	%xmm0, -48(%edx)
-L(aligned_16_32bytes):
-	movdqa	%xmm0, -32(%edx)
-L(aligned_16_16bytes):
-	movdqa	%xmm0, -16(%edx)
-L(aligned_16_0bytes):
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_113bytes):
-	movdqa	%xmm0, -113(%edx)
-L(aligned_16_97bytes):
-	movdqa	%xmm0, -97(%edx)
-L(aligned_16_81bytes):
-	movdqa	%xmm0, -81(%edx)
-L(aligned_16_65bytes):
-	movdqa	%xmm0, -65(%edx)
-L(aligned_16_49bytes):
-	movdqa	%xmm0, -49(%edx)
-L(aligned_16_33bytes):
-	movdqa	%xmm0, -33(%edx)
-L(aligned_16_17bytes):
-	movdqa	%xmm0, -17(%edx)
-L(aligned_16_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_114bytes):
-	movdqa	%xmm0, -114(%edx)
-L(aligned_16_98bytes):
-	movdqa	%xmm0, -98(%edx)
-L(aligned_16_82bytes):
-	movdqa	%xmm0, -82(%edx)
-L(aligned_16_66bytes):
-	movdqa	%xmm0, -66(%edx)
-L(aligned_16_50bytes):
-	movdqa	%xmm0, -50(%edx)
-L(aligned_16_34bytes):
-	movdqa	%xmm0, -34(%edx)
-L(aligned_16_18bytes):
-	movdqa	%xmm0, -18(%edx)
-L(aligned_16_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_115bytes):
-	movdqa	%xmm0, -115(%edx)
-L(aligned_16_99bytes):
-	movdqa	%xmm0, -99(%edx)
-L(aligned_16_83bytes):
-	movdqa	%xmm0, -83(%edx)
-L(aligned_16_67bytes):
-	movdqa	%xmm0, -67(%edx)
-L(aligned_16_51bytes):
-	movdqa	%xmm0, -51(%edx)
-L(aligned_16_35bytes):
-	movdqa	%xmm0, -35(%edx)
-L(aligned_16_19bytes):
-	movdqa	%xmm0, -19(%edx)
-L(aligned_16_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_116bytes):
-	movdqa	%xmm0, -116(%edx)
-L(aligned_16_100bytes):
-	movdqa	%xmm0, -100(%edx)
-L(aligned_16_84bytes):
-	movdqa	%xmm0, -84(%edx)
-L(aligned_16_68bytes):
-	movdqa	%xmm0, -68(%edx)
-L(aligned_16_52bytes):
-	movdqa	%xmm0, -52(%edx)
-L(aligned_16_36bytes):
-	movdqa	%xmm0, -36(%edx)
-L(aligned_16_20bytes):
-	movdqa	%xmm0, -20(%edx)
-L(aligned_16_4bytes):
-	movl	%eax, -4(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_117bytes):
-	movdqa	%xmm0, -117(%edx)
-L(aligned_16_101bytes):
-	movdqa	%xmm0, -101(%edx)
-L(aligned_16_85bytes):
-	movdqa	%xmm0, -85(%edx)
-L(aligned_16_69bytes):
-	movdqa	%xmm0, -69(%edx)
-L(aligned_16_53bytes):
-	movdqa	%xmm0, -53(%edx)
-L(aligned_16_37bytes):
-	movdqa	%xmm0, -37(%edx)
-L(aligned_16_21bytes):
-	movdqa	%xmm0, -21(%edx)
-L(aligned_16_5bytes):
-	movl	%eax, -5(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_118bytes):
-	movdqa	%xmm0, -118(%edx)
-L(aligned_16_102bytes):
-	movdqa	%xmm0, -102(%edx)
-L(aligned_16_86bytes):
-	movdqa	%xmm0, -86(%edx)
-L(aligned_16_70bytes):
-	movdqa	%xmm0, -70(%edx)
-L(aligned_16_54bytes):
-	movdqa	%xmm0, -54(%edx)
-L(aligned_16_38bytes):
-	movdqa	%xmm0, -38(%edx)
-L(aligned_16_22bytes):
-	movdqa	%xmm0, -22(%edx)
-L(aligned_16_6bytes):
-	movl	%eax, -6(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_119bytes):
-	movdqa	%xmm0, -119(%edx)
-L(aligned_16_103bytes):
-	movdqa	%xmm0, -103(%edx)
-L(aligned_16_87bytes):
-	movdqa	%xmm0, -87(%edx)
-L(aligned_16_71bytes):
-	movdqa	%xmm0, -71(%edx)
-L(aligned_16_55bytes):
-	movdqa	%xmm0, -55(%edx)
-L(aligned_16_39bytes):
-	movdqa	%xmm0, -39(%edx)
-L(aligned_16_23bytes):
-	movdqa	%xmm0, -23(%edx)
-L(aligned_16_7bytes):
-	movl	%eax, -7(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_120bytes):
-	movdqa	%xmm0, -120(%edx)
-L(aligned_16_104bytes):
-	movdqa	%xmm0, -104(%edx)
-L(aligned_16_88bytes):
-	movdqa	%xmm0, -88(%edx)
-L(aligned_16_72bytes):
-	movdqa	%xmm0, -72(%edx)
-L(aligned_16_56bytes):
-	movdqa	%xmm0, -56(%edx)
-L(aligned_16_40bytes):
-	movdqa	%xmm0, -40(%edx)
-L(aligned_16_24bytes):
-	movdqa	%xmm0, -24(%edx)
-L(aligned_16_8bytes):
-	movq	%xmm0, -8(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_121bytes):
-	movdqa	%xmm0, -121(%edx)
-L(aligned_16_105bytes):
-	movdqa	%xmm0, -105(%edx)
-L(aligned_16_89bytes):
-	movdqa	%xmm0, -89(%edx)
-L(aligned_16_73bytes):
-	movdqa	%xmm0, -73(%edx)
-L(aligned_16_57bytes):
-	movdqa	%xmm0, -57(%edx)
-L(aligned_16_41bytes):
-	movdqa	%xmm0, -41(%edx)
-L(aligned_16_25bytes):
-	movdqa	%xmm0, -25(%edx)
-L(aligned_16_9bytes):
-	movq	%xmm0, -9(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_122bytes):
-	movdqa	%xmm0, -122(%edx)
-L(aligned_16_106bytes):
-	movdqa	%xmm0, -106(%edx)
-L(aligned_16_90bytes):
-	movdqa	%xmm0, -90(%edx)
-L(aligned_16_74bytes):
-	movdqa	%xmm0, -74(%edx)
-L(aligned_16_58bytes):
-	movdqa	%xmm0, -58(%edx)
-L(aligned_16_42bytes):
-	movdqa	%xmm0, -42(%edx)
-L(aligned_16_26bytes):
-	movdqa	%xmm0, -26(%edx)
-L(aligned_16_10bytes):
-	movq	%xmm0, -10(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_123bytes):
-	movdqa	%xmm0, -123(%edx)
-L(aligned_16_107bytes):
-	movdqa	%xmm0, -107(%edx)
-L(aligned_16_91bytes):
-	movdqa	%xmm0, -91(%edx)
-L(aligned_16_75bytes):
-	movdqa	%xmm0, -75(%edx)
-L(aligned_16_59bytes):
-	movdqa	%xmm0, -59(%edx)
-L(aligned_16_43bytes):
-	movdqa	%xmm0, -43(%edx)
-L(aligned_16_27bytes):
-	movdqa	%xmm0, -27(%edx)
-L(aligned_16_11bytes):
-	movq	%xmm0, -11(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_124bytes):
-	movdqa	%xmm0, -124(%edx)
-L(aligned_16_108bytes):
-	movdqa	%xmm0, -108(%edx)
-L(aligned_16_92bytes):
-	movdqa	%xmm0, -92(%edx)
-L(aligned_16_76bytes):
-	movdqa	%xmm0, -76(%edx)
-L(aligned_16_60bytes):
-	movdqa	%xmm0, -60(%edx)
-L(aligned_16_44bytes):
-	movdqa	%xmm0, -44(%edx)
-L(aligned_16_28bytes):
-	movdqa	%xmm0, -28(%edx)
-L(aligned_16_12bytes):
-	movq	%xmm0, -12(%edx)
-	movl	%eax, -4(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_125bytes):
-	movdqa	%xmm0, -125(%edx)
-L(aligned_16_109bytes):
-	movdqa	%xmm0, -109(%edx)
-L(aligned_16_93bytes):
-	movdqa	%xmm0, -93(%edx)
-L(aligned_16_77bytes):
-	movdqa	%xmm0, -77(%edx)
-L(aligned_16_61bytes):
-	movdqa	%xmm0, -61(%edx)
-L(aligned_16_45bytes):
-	movdqa	%xmm0, -45(%edx)
-L(aligned_16_29bytes):
-	movdqa	%xmm0, -29(%edx)
-L(aligned_16_13bytes):
-	movq	%xmm0, -13(%edx)
-	movl	%eax, -5(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_126bytes):
-	movdqa	%xmm0, -126(%edx)
-L(aligned_16_110bytes):
-	movdqa	%xmm0, -110(%edx)
-L(aligned_16_94bytes):
-	movdqa	%xmm0, -94(%edx)
-L(aligned_16_78bytes):
-	movdqa	%xmm0, -78(%edx)
-L(aligned_16_62bytes):
-	movdqa	%xmm0, -62(%edx)
-L(aligned_16_46bytes):
-	movdqa	%xmm0, -46(%edx)
-L(aligned_16_30bytes):
-	movdqa	%xmm0, -30(%edx)
-L(aligned_16_14bytes):
-	movq	%xmm0, -14(%edx)
-	movl	%eax, -6(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN(4)
-L(aligned_16_127bytes):
-	movdqa	%xmm0, -127(%edx)
-L(aligned_16_111bytes):
-	movdqa	%xmm0, -111(%edx)
-L(aligned_16_95bytes):
-	movdqa	%xmm0, -95(%edx)
-L(aligned_16_79bytes):
-	movdqa	%xmm0, -79(%edx)
-L(aligned_16_63bytes):
-	movdqa	%xmm0, -63(%edx)
-L(aligned_16_47bytes):
-	movdqa	%xmm0, -47(%edx)
-L(aligned_16_31bytes):
-	movdqa	%xmm0, -31(%edx)
-L(aligned_16_15bytes):
-	movq	%xmm0, -15(%edx)
-	movl	%eax, -7(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN_END
-
-END(memset_atom)
diff --git a/libc/arch-x86/string/sse2-memset-slm.S b/libc/arch-x86/string/sse2-memset-slm.S
index e4c8fa1..ec2ee52 100644
--- a/libc/arch-x86/string/sse2-memset-slm.S
+++ b/libc/arch-x86/string/sse2-memset-slm.S
@@ -79,7 +79,7 @@
     /* We loaded the jump table and adjusted EDX. Go.  */	\
     jmp		*%ebx
 
-ENTRY(__memset_chk_generic)
+ENTRY(__memset_chk)
   ENTRANCE
 
   movl LEN(%esp), %ecx
@@ -88,11 +88,11 @@
 
   POP(%ebx) // Undo ENTRANCE without returning.
   jmp __memset_chk_fail
-END(__memset_chk_generic)
+END(__memset_chk)
 
 	.section .text.sse2,"ax",@progbits
 	ALIGN(4)
-ENTRY(memset_generic)
+ENTRY(memset)
 	ENTRANCE
 
 	movl	LEN(%esp), %ecx
@@ -755,4 +755,4 @@
 	SETRTNVAL
 	RETURN_END
 
-END(memset_generic)
+END(memset)
diff --git a/libc/arch-x86/string/sse2-strcpy-slm.S b/libc/arch-x86/string/sse2-strcpy-slm.S
index 22ceeab..b5d84b5 100644
--- a/libc/arch-x86/string/sse2-strcpy-slm.S
+++ b/libc/arch-x86/string/sse2-strcpy-slm.S
@@ -79,7 +79,7 @@
 #define POP(REG) popl REG; CFI_POP (REG)
 
 #ifndef STRCPY
-# define STRCPY  strcpy_generic
+# define STRCPY  strcpy
 #endif
 
 #ifdef USE_AS_STPNCPY
diff --git a/libc/arch-x86/string/sse2-strlen-slm.S b/libc/arch-x86/string/sse2-strlen-slm.S
index b805ad6..27cc025 100644
--- a/libc/arch-x86/string/sse2-strlen-slm.S
+++ b/libc/arch-x86/string/sse2-strlen-slm.S
@@ -29,7 +29,7 @@
 */
 
 #ifndef STRLEN
-# define STRLEN strlen_generic
+# define STRLEN strlen
 #endif
 
 #ifndef L
diff --git a/libc/arch-x86/string/sse2-strncpy-slm.S b/libc/arch-x86/string/sse2-strncpy-slm.S
index aff7fb9..591419f 100644
--- a/libc/arch-x86/string/sse2-strncpy-slm.S
+++ b/libc/arch-x86/string/sse2-strncpy-slm.S
@@ -29,5 +29,5 @@
 */

 

 #define USE_AS_STRNCPY

-#define STRCPY strncpy_generic

+#define STRCPY strncpy

 #include "sse2-strcpy-slm.S"

diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/string/ssse3-memcpy-atom.S
deleted file mode 100644
index 83e1985..0000000
--- a/libc/arch-x86/string/ssse3-memcpy-atom.S
+++ /dev/null
@@ -1,3124 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define FOR_ATOM
-
-#ifndef MEMCPY
-# define MEMCPY	memcpy_atom
-#endif
-
-#ifndef L
-# define L(label)	.L##label
-#endif
-
-#ifndef cfi_startproc
-# define cfi_startproc	.cfi_startproc
-#endif
-
-#ifndef cfi_endproc
-# define cfi_endproc	.cfi_endproc
-#endif
-
-#ifndef cfi_rel_offset
-# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
-#endif
-
-#ifndef cfi_restore
-# define cfi_restore(reg)	.cfi_restore reg
-#endif
-
-#ifndef cfi_adjust_cfa_offset
-# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
-#endif
-
-#ifndef ENTRY
-# define ENTRY(name)		\
-	.type name,  @function;		\
-	.globl name;		\
-	.p2align 4;		\
-name:		\
-	cfi_startproc
-#endif
-
-#ifndef END
-# define END(name)		\
-	cfi_endproc;		\
-	.size name, .-name
-#endif
-
-#define DEST		PARMS
-#define SRC		DEST+4
-#define LEN		SRC+4
-
-#define CFI_PUSH(REG)		\
-  cfi_adjust_cfa_offset (4);		\
-  cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG)		\
-  cfi_adjust_cfa_offset (-4);		\
-  cfi_restore (REG)
-
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
-
-#if (defined SHARED || defined __PIC__)
-# define PARMS		8		/* Preserve EBX.  */
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN		RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B)	I - B
-
-# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
-
-/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-	jump table with relative offsets.  INDEX is a register contains the
-	index into the jump table.   SCALE is the scale of INDEX. */
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */		\
-	SETUP_PIC_REG(bx);		\
-    /* Get the address of the jump table.  */		\
-	addl	$(TABLE - .), %ebx;		\
-    /* Get the entry and convert the relative offset to the		\
-	absolute	address.  */		\
-	addl	(%ebx, INDEX, SCALE), %ebx;		\
-    /* We loaded the jump table.  Go.  */		\
-	jmp	*%ebx
-#else
-
-# define PARMS		4
-# define ENTRANCE
-# define RETURN_END	ret
-# define RETURN		RETURN_END
-# define JMPTBL(I, B)	I
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-	absolute offsets.  INDEX is a register contains the index into the
-	jump table.  SCALE is the scale of INDEX. */
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-	jmp	*TABLE(, INDEX, SCALE)
-#endif
-
-	.section .text.ssse3,"ax",@progbits
-ENTRY (MEMCPY)
-	ENTRANCE
-	movl	LEN(%esp), %ecx
-	movl	SRC(%esp), %eax
-	movl	DEST(%esp), %edx
-
-#ifdef USE_AS_MEMMOVE
-	cmp	%eax, %edx
-	jb	L(copy_forward)
-	je	L(fwd_write_0bytes)
-	cmp	$32, %ecx
-	jae	L(memmove_bwd)
-	jmp	L(bk_write_less32bytes_2)
-
-	.p2align 4
-L(memmove_bwd):
-	add	%ecx, %eax
-	cmp	%eax, %edx
-	movl	SRC(%esp), %eax
-	jb	L(copy_backward)
-
-L(copy_forward):
-#endif
-	cmp	$48, %ecx
-	jae	L(48bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
-	cmp	%dl, %al
-	jb	L(bk_write)
-#endif
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
-	.p2align 4
-L(bk_write):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
-
-	.p2align 4
-L(48bytesormore):
-#ifndef USE_AS_MEMMOVE
-	movlpd	(%eax), %xmm0
-	movlpd	8(%eax), %xmm1
-	movlpd	%xmm0, (%edx)
-	movlpd	%xmm1, 8(%edx)
-#else
-	movdqu	(%eax), %xmm0
-#endif
-	PUSH (%edi)
-	movl	%edx, %edi
-	and	$-16, %edx
-	add	$16, %edx
-	sub	%edx, %edi
-	add	%edi, %ecx
-	sub	%edi, %eax
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_shared_cache_size_half, %ecx
-# endif
-#endif
-
-	mov	%eax, %edi
-	jae	L(large_page)
-	and	$0xf, %edi
-	jz	L(shl_0)
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
-
-	.p2align 4
-L(shl_0):
-#ifdef USE_AS_MEMMOVE
-	movl	DEST+4(%esp), %edi
-	movdqu	%xmm0, (%edi)
-#endif
-	xor	%edi, %edi
-	cmp	$127, %ecx
-	ja	L(shl_0_gobble)
-	lea	-32(%ecx), %ecx
-
-	.p2align 4
-L(shl_0_loop):
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jb	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jb	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jb	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-
-L(shl_0_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	add	%edi, %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	POP	(%edi)
-	lea	-128(%ecx), %ecx
-	jae	L(shl_0_gobble_mem_loop)
-
-	.p2align 4
-L(shl_0_gobble_cache_loop):
-	movdqa	(%eax), %xmm0
-	movdqa	0x10(%eax), %xmm1
-	movdqa	0x20(%eax), %xmm2
-	movdqa	0x30(%eax), %xmm3
-	movdqa	0x40(%eax), %xmm4
-	movdqa	0x50(%eax), %xmm5
-	movdqa	0x60(%eax), %xmm6
-	movdqa	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	movdqa	%xmm2, 0x20(%edx)
-	movdqa	%xmm3, 0x30(%edx)
-	movdqa	%xmm4, 0x40(%edx)
-	movdqa	%xmm5, 0x50(%edx)
-	movdqa	%xmm6, 0x60(%edx)
-	movdqa	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-
-	jae	L(shl_0_gobble_cache_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_cache_less_64bytes)
-
-	movdqa	(%eax), %xmm0
-	sub	$0x40, %ecx
-	movdqa	0x10(%eax), %xmm1
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	movdqa	0x20(%eax), %xmm0
-	movdqa	0x30(%eax), %xmm1
-	add	$0x40, %eax
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm1, 0x30(%edx)
-	add	$0x40, %edx
-
-L(shl_0_cache_less_64bytes):
-	cmp	$0x20, %ecx
-	jb	L(shl_0_cache_less_32bytes)
-	movdqa	(%eax), %xmm0
-	sub	$0x20, %ecx
-	movdqa	0x10(%eax), %xmm1
-	add	$0x20, %eax
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	add	$0x20, %edx
-
-L(shl_0_cache_less_32bytes):
-	cmp	$0x10, %ecx
-	jb	L(shl_0_cache_less_16bytes)
-	sub	$0x10, %ecx
-	movdqa	(%eax), %xmm0
-	add	$0x10, %eax
-	movdqa	%xmm0, (%edx)
-	add	$0x10, %edx
-
-L(shl_0_cache_less_16bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-	.p2align 4
-L(shl_0_gobble_mem_loop):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x280(%eax)
-	prefetcht0 0x1c0(%edx)
-
-	movdqa	(%eax), %xmm0
-	movdqa	0x10(%eax), %xmm1
-	movdqa	0x20(%eax), %xmm2
-	movdqa	0x30(%eax), %xmm3
-	movdqa	0x40(%eax), %xmm4
-	movdqa	0x50(%eax), %xmm5
-	movdqa	0x60(%eax), %xmm6
-	movdqa	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	sub	$0x80, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	movdqa	%xmm2, 0x20(%edx)
-	movdqa	%xmm3, 0x30(%edx)
-	movdqa	%xmm4, 0x40(%edx)
-	movdqa	%xmm5, 0x50(%edx)
-	movdqa	%xmm6, 0x60(%edx)
-	movdqa	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-
-	jae	L(shl_0_gobble_mem_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_mem_less_64bytes)
-
-	movdqa	(%eax), %xmm0
-	sub	$0x40, %ecx
-	movdqa	0x10(%eax), %xmm1
-
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-
-	movdqa	0x20(%eax), %xmm0
-	movdqa	0x30(%eax), %xmm1
-	add	$0x40, %eax
-
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm1, 0x30(%edx)
-	add	$0x40, %edx
-
-L(shl_0_mem_less_64bytes):
-	cmp	$0x20, %ecx
-	jb	L(shl_0_mem_less_32bytes)
-	movdqa	(%eax), %xmm0
-	sub	$0x20, %ecx
-	movdqa	0x10(%eax), %xmm1
-	add	$0x20, %eax
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	add	$0x20, %edx
-
-L(shl_0_mem_less_32bytes):
-	cmp	$0x10, %ecx
-	jb	L(shl_0_mem_less_16bytes)
-	sub	$0x10, %ecx
-	movdqa	(%eax), %xmm0
-	add	$0x10, %eax
-	movdqa	%xmm0, (%edx)
-	add	$0x10, %edx
-
-L(shl_0_mem_less_16bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
-
-	.p2align 4
-L(shl_1):
-#ifndef USE_AS_MEMMOVE
-	movaps	-1(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-1(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_1_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl1LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	15(%eax), %xmm2
-	movaps	31(%eax), %xmm3
-	movaps	47(%eax), %xmm4
-	movaps	63(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$1, %xmm4, %xmm5
-	palignr	$1, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$1, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl1LoopStart)
-
-L(Shl1LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	15(%eax), %xmm2
-	movaps	31(%eax), %xmm3
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_1_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-1(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_1_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_1_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_1_no_prefetch_loop)
-
-L(sh_1_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	1(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_2):
-#ifndef USE_AS_MEMMOVE
-	movaps	-2(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-2(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_2_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl2LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	14(%eax), %xmm2
-	movaps	30(%eax), %xmm3
-	movaps	46(%eax), %xmm4
-	movaps	62(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$2, %xmm4, %xmm5
-	palignr	$2, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$2, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl2LoopStart)
-
-L(Shl2LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	14(%eax), %xmm2
-	movaps	30(%eax), %xmm3
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_2_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-2(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_2_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_2_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_2_no_prefetch_loop)
-
-L(sh_2_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	2(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_3):
-#ifndef USE_AS_MEMMOVE
-	movaps	-3(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-3(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_3_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl3LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	13(%eax), %xmm2
-	movaps	29(%eax), %xmm3
-	movaps	45(%eax), %xmm4
-	movaps	61(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$3, %xmm4, %xmm5
-	palignr	$3, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$3, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl3LoopStart)
-
-L(Shl3LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	13(%eax), %xmm2
-	movaps	29(%eax), %xmm3
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_3_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-3(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_3_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(sh_3_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(sh_3_no_prefetch_loop)
-
-L(sh_3_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	3(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_4):
-#ifndef USE_AS_MEMMOVE
-	movaps	-4(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-4(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_4_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl4LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	12(%eax), %xmm2
-	movaps	28(%eax), %xmm3
-	movaps	44(%eax), %xmm4
-	movaps	60(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$4, %xmm4, %xmm5
-	palignr	$4, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$4, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl4LoopStart)
-
-L(Shl4LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	12(%eax), %xmm2
-	movaps	28(%eax), %xmm3
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_4_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-4(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_4_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(sh_4_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(sh_4_no_prefetch_loop)
-
-L(sh_4_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	4(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_5):
-#ifndef USE_AS_MEMMOVE
-	movaps	-5(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-5(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_5_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl5LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	11(%eax), %xmm2
-	movaps	27(%eax), %xmm3
-	movaps	43(%eax), %xmm4
-	movaps	59(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$5, %xmm4, %xmm5
-	palignr	$5, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$5, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl5LoopStart)
-
-L(Shl5LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	11(%eax), %xmm2
-	movaps	27(%eax), %xmm3
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_5_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-5(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_5_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(sh_5_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(sh_5_no_prefetch_loop)
-
-L(sh_5_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	5(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_6):
-#ifndef USE_AS_MEMMOVE
-	movaps	-6(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-6(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_6_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl6LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	10(%eax), %xmm2
-	movaps	26(%eax), %xmm3
-	movaps	42(%eax), %xmm4
-	movaps	58(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$6, %xmm4, %xmm5
-	palignr	$6, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$6, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl6LoopStart)
-
-L(Shl6LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	10(%eax), %xmm2
-	movaps	26(%eax), %xmm3
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_6_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-6(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_6_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(sh_6_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(sh_6_no_prefetch_loop)
-
-L(sh_6_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	6(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_7):
-#ifndef USE_AS_MEMMOVE
-	movaps	-7(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-7(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_7_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl7LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	9(%eax), %xmm2
-	movaps	25(%eax), %xmm3
-	movaps	41(%eax), %xmm4
-	movaps	57(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$7, %xmm4, %xmm5
-	palignr	$7, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$7, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl7LoopStart)
-
-L(Shl7LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	9(%eax), %xmm2
-	movaps	25(%eax), %xmm3
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_7_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-7(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_7_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_7_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_7_no_prefetch_loop)
-
-L(sh_7_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	7(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_8):
-#ifndef USE_AS_MEMMOVE
-	movaps	-8(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-8(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_8_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl8LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	8(%eax), %xmm2
-	movaps	24(%eax), %xmm3
-	movaps	40(%eax), %xmm4
-	movaps	56(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$8, %xmm4, %xmm5
-	palignr	$8, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$8, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl8LoopStart)
-
-L(LoopLeave8):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	8(%eax), %xmm2
-	movaps	24(%eax), %xmm3
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_8_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-8(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_8_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_8_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_8_no_prefetch_loop)
-
-L(sh_8_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	8(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_9):
-#ifndef USE_AS_MEMMOVE
-	movaps	-9(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-9(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_9_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl9LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	7(%eax), %xmm2
-	movaps	23(%eax), %xmm3
-	movaps	39(%eax), %xmm4
-	movaps	55(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$9, %xmm4, %xmm5
-	palignr	$9, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$9, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl9LoopStart)
-
-L(Shl9LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	7(%eax), %xmm2
-	movaps	23(%eax), %xmm3
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_9_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-9(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_9_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_9_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_9_no_prefetch_loop)
-
-L(sh_9_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	9(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_10):
-#ifndef USE_AS_MEMMOVE
-	movaps	-10(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-10(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_10_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl10LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	6(%eax), %xmm2
-	movaps	22(%eax), %xmm3
-	movaps	38(%eax), %xmm4
-	movaps	54(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$10, %xmm4, %xmm5
-	palignr	$10, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$10, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl10LoopStart)
-
-L(Shl10LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	6(%eax), %xmm2
-	movaps	22(%eax), %xmm3
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_10_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-10(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_10_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_10_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_10_no_prefetch_loop)
-
-L(sh_10_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	10(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_11):
-#ifndef USE_AS_MEMMOVE
-	movaps	-11(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-11(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_11_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl11LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	5(%eax), %xmm2
-	movaps	21(%eax), %xmm3
-	movaps	37(%eax), %xmm4
-	movaps	53(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$11, %xmm4, %xmm5
-	palignr	$11, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$11, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl11LoopStart)
-
-L(Shl11LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	5(%eax), %xmm2
-	movaps	21(%eax), %xmm3
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_11_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-11(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_11_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_11_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_11_no_prefetch_loop)
-
-L(sh_11_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	11(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_12):
-#ifndef USE_AS_MEMMOVE
-	movaps	-12(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-12(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_12_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl12LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	4(%eax), %xmm2
-	movaps	20(%eax), %xmm3
-	movaps	36(%eax), %xmm4
-	movaps	52(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$12, %xmm4, %xmm5
-	palignr	$12, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$12, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl12LoopStart)
-
-L(Shl12LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	4(%eax), %xmm2
-	movaps	20(%eax), %xmm3
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_12_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-12(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_12_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_12_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_12_no_prefetch_loop)
-
-L(sh_12_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	12(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_13):
-#ifndef USE_AS_MEMMOVE
-	movaps	-13(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-13(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_13_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl13LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	3(%eax), %xmm2
-	movaps	19(%eax), %xmm3
-	movaps	35(%eax), %xmm4
-	movaps	51(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$13, %xmm4, %xmm5
-	palignr	$13, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$13, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl13LoopStart)
-
-L(Shl13LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	3(%eax), %xmm2
-	movaps	19(%eax), %xmm3
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_13_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-13(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_13_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_13_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_13_no_prefetch_loop)
-
-L(sh_13_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	13(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_14):
-#ifndef USE_AS_MEMMOVE
-	movaps	-14(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-14(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_14_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl14LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	2(%eax), %xmm2
-	movaps	18(%eax), %xmm3
-	movaps	34(%eax), %xmm4
-	movaps	50(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$14, %xmm4, %xmm5
-	palignr	$14, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$14, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl14LoopStart)
-
-L(Shl14LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	2(%eax), %xmm2
-	movaps	18(%eax), %xmm3
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_14_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-14(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_14_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_14_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_14_no_prefetch_loop)
-
-L(sh_14_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	14(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_15):
-#ifndef USE_AS_MEMMOVE
-	movaps	-15(%eax), %xmm1
-#else
-	movl	DEST+4(%esp), %edi
-	movaps	-15(%eax), %xmm1
-	movdqu	%xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-	jb L(sh_15_no_prefetch)
-
-	lea	-64(%ecx), %ecx
-
-	.p2align 4
-L(Shl15LoopStart):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x1c0(%edx)
-	movaps	1(%eax), %xmm2
-	movaps	17(%eax), %xmm3
-	movaps	33(%eax), %xmm4
-	movaps	49(%eax), %xmm5
-	movaps	%xmm5, %xmm7
-	palignr	$15, %xmm4, %xmm5
-	palignr	$15, %xmm3, %xmm4
-	movaps	%xmm5, 48(%edx)
-	palignr	$15, %xmm2, %xmm3
-	lea	64(%eax), %eax
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm4, 32(%edx)
-	movaps	%xmm3, 16(%edx)
-	movaps	%xmm7, %xmm1
-	movaps	%xmm2, (%edx)
-	lea	64(%edx), %edx
-	sub	$64, %ecx
-	ja	L(Shl15LoopStart)
-
-L(Shl15LoopLeave):
-	add	$32, %ecx
-	jle	L(shl_end_0)
-
-	movaps	1(%eax), %xmm2
-	movaps	17(%eax), %xmm3
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm1, %xmm2
-
-	movaps	%xmm2, (%edx)
-	movaps	%xmm3, 16(%edx)
-	lea	32(%edx, %ecx), %edx
-	lea	32(%eax, %ecx), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(sh_15_no_prefetch):
-	lea	-32(%ecx), %ecx
-	lea	-15(%eax), %eax
-	xor	%edi, %edi
-
-	.p2align 4
-L(sh_15_no_prefetch_loop):
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jb	L(sh_15_end_no_prefetch_loop)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-	jae	L(sh_15_no_prefetch_loop)
-
-L(sh_15_end_no_prefetch_loop):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	15(%edi, %eax), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(shl_end_0):
-	lea	32(%ecx), %ecx
-	lea	(%edx, %ecx), %edx
-	lea	(%eax, %ecx), %eax
-	POP	(%edi)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
-	.p2align 4
-L(fwd_write_44bytes):
-	movq	-44(%eax), %xmm0
-	movq	%xmm0, -44(%edx)
-L(fwd_write_36bytes):
-	movq	-36(%eax), %xmm0
-	movq	%xmm0, -36(%edx)
-L(fwd_write_28bytes):
-	movq	-28(%eax), %xmm0
-	movq	%xmm0, -28(%edx)
-L(fwd_write_20bytes):
-	movq	-20(%eax), %xmm0
-	movq	%xmm0, -20(%edx)
-L(fwd_write_12bytes):
-	movq	-12(%eax), %xmm0
-	movq	%xmm0, -12(%edx)
-L(fwd_write_4bytes):
-	movl	-4(%eax), %ecx
-	movl	%ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_40bytes):
-	movq	-40(%eax), %xmm0
-	movq	%xmm0, -40(%edx)
-L(fwd_write_32bytes):
-	movq	-32(%eax), %xmm0
-	movq	%xmm0, -32(%edx)
-L(fwd_write_24bytes):
-	movq	-24(%eax), %xmm0
-	movq	%xmm0, -24(%edx)
-L(fwd_write_16bytes):
-	movq	-16(%eax), %xmm0
-	movq	%xmm0, -16(%edx)
-L(fwd_write_8bytes):
-	movq	-8(%eax), %xmm0
-	movq	%xmm0, -8(%edx)
-L(fwd_write_0bytes):
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_5bytes):
-	movl	-5(%eax), %ecx
-	movl	-4(%eax), %eax
-	movl	%ecx, -5(%edx)
-	movl	%eax, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_45bytes):
-	movq	-45(%eax), %xmm0
-	movq	%xmm0, -45(%edx)
-L(fwd_write_37bytes):
-	movq	-37(%eax), %xmm0
-	movq	%xmm0, -37(%edx)
-L(fwd_write_29bytes):
-	movq	-29(%eax), %xmm0
-	movq	%xmm0, -29(%edx)
-L(fwd_write_21bytes):
-	movq	-21(%eax), %xmm0
-	movq	%xmm0, -21(%edx)
-L(fwd_write_13bytes):
-	movq	-13(%eax), %xmm0
-	movq	%xmm0, -13(%edx)
-	movl	-5(%eax), %ecx
-	movl	%ecx, -5(%edx)
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_41bytes):
-	movq	-41(%eax), %xmm0
-	movq	%xmm0, -41(%edx)
-L(fwd_write_33bytes):
-	movq	-33(%eax), %xmm0
-	movq	%xmm0, -33(%edx)
-L(fwd_write_25bytes):
-	movq	-25(%eax), %xmm0
-	movq	%xmm0, -25(%edx)
-L(fwd_write_17bytes):
-	movq	-17(%eax), %xmm0
-	movq	%xmm0, -17(%edx)
-L(fwd_write_9bytes):
-	movq	-9(%eax), %xmm0
-	movq	%xmm0, -9(%edx)
-L(fwd_write_1bytes):
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_46bytes):
-	movq	-46(%eax), %xmm0
-	movq	%xmm0, -46(%edx)
-L(fwd_write_38bytes):
-	movq	-38(%eax), %xmm0
-	movq	%xmm0, -38(%edx)
-L(fwd_write_30bytes):
-	movq	-30(%eax), %xmm0
-	movq	%xmm0, -30(%edx)
-L(fwd_write_22bytes):
-	movq	-22(%eax), %xmm0
-	movq	%xmm0, -22(%edx)
-L(fwd_write_14bytes):
-	movq	-14(%eax), %xmm0
-	movq	%xmm0, -14(%edx)
-L(fwd_write_6bytes):
-	movl	-6(%eax), %ecx
-	movl	%ecx, -6(%edx)
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_42bytes):
-	movq	-42(%eax), %xmm0
-	movq	%xmm0, -42(%edx)
-L(fwd_write_34bytes):
-	movq	-34(%eax), %xmm0
-	movq	%xmm0, -34(%edx)
-L(fwd_write_26bytes):
-	movq	-26(%eax), %xmm0
-	movq	%xmm0, -26(%edx)
-L(fwd_write_18bytes):
-	movq	-18(%eax), %xmm0
-	movq	%xmm0, -18(%edx)
-L(fwd_write_10bytes):
-	movq	-10(%eax), %xmm0
-	movq	%xmm0, -10(%edx)
-L(fwd_write_2bytes):
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_47bytes):
-	movq	-47(%eax), %xmm0
-	movq	%xmm0, -47(%edx)
-L(fwd_write_39bytes):
-	movq	-39(%eax), %xmm0
-	movq	%xmm0, -39(%edx)
-L(fwd_write_31bytes):
-	movq	-31(%eax), %xmm0
-	movq	%xmm0, -31(%edx)
-L(fwd_write_23bytes):
-	movq	-23(%eax), %xmm0
-	movq	%xmm0, -23(%edx)
-L(fwd_write_15bytes):
-	movq	-15(%eax), %xmm0
-	movq	%xmm0, -15(%edx)
-L(fwd_write_7bytes):
-	movl	-7(%eax), %ecx
-	movl	%ecx, -7(%edx)
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_43bytes):
-	movq	-43(%eax), %xmm0
-	movq	%xmm0, -43(%edx)
-L(fwd_write_35bytes):
-	movq	-35(%eax), %xmm0
-	movq	%xmm0, -35(%edx)
-L(fwd_write_27bytes):
-	movq	-27(%eax), %xmm0
-	movq	%xmm0, -27(%edx)
-L(fwd_write_19bytes):
-	movq	-19(%eax), %xmm0
-	movq	%xmm0, -19(%edx)
-L(fwd_write_11bytes):
-	movq	-11(%eax), %xmm0
-	movq	%xmm0, -11(%edx)
-L(fwd_write_3bytes):
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_40bytes_align):
-	movdqa	-40(%eax), %xmm0
-	movdqa	%xmm0, -40(%edx)
-L(fwd_write_24bytes_align):
-	movdqa	-24(%eax), %xmm0
-	movdqa	%xmm0, -24(%edx)
-L(fwd_write_8bytes_align):
-	movq	-8(%eax), %xmm0
-	movq	%xmm0, -8(%edx)
-L(fwd_write_0bytes_align):
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_32bytes_align):
-	movdqa	-32(%eax), %xmm0
-	movdqa	%xmm0, -32(%edx)
-L(fwd_write_16bytes_align):
-	movdqa	-16(%eax), %xmm0
-	movdqa	%xmm0, -16(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_5bytes_align):
-	movl	-5(%eax), %ecx
-	movl	-4(%eax), %eax
-	movl	%ecx, -5(%edx)
-	movl	%eax, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_45bytes_align):
-	movdqa	-45(%eax), %xmm0
-	movdqa	%xmm0, -45(%edx)
-L(fwd_write_29bytes_align):
-	movdqa	-29(%eax), %xmm0
-	movdqa	%xmm0, -29(%edx)
-L(fwd_write_13bytes_align):
-	movq	-13(%eax), %xmm0
-	movq	%xmm0, -13(%edx)
-	movl	-5(%eax), %ecx
-	movl	%ecx, -5(%edx)
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_37bytes_align):
-	movdqa	-37(%eax), %xmm0
-	movdqa	%xmm0, -37(%edx)
-L(fwd_write_21bytes_align):
-	movdqa	-21(%eax), %xmm0
-	movdqa	%xmm0, -21(%edx)
-	movl	-5(%eax), %ecx
-	movl	%ecx, -5(%edx)
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_41bytes_align):
-	movdqa	-41(%eax), %xmm0
-	movdqa	%xmm0, -41(%edx)
-L(fwd_write_25bytes_align):
-	movdqa	-25(%eax), %xmm0
-	movdqa	%xmm0, -25(%edx)
-L(fwd_write_9bytes_align):
-	movq	-9(%eax), %xmm0
-	movq	%xmm0, -9(%edx)
-L(fwd_write_1bytes_align):
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_33bytes_align):
-	movdqa	-33(%eax), %xmm0
-	movdqa	%xmm0, -33(%edx)
-L(fwd_write_17bytes_align):
-	movdqa	-17(%eax), %xmm0
-	movdqa	%xmm0, -17(%edx)
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_46bytes_align):
-	movdqa	-46(%eax), %xmm0
-	movdqa	%xmm0, -46(%edx)
-L(fwd_write_30bytes_align):
-	movdqa	-30(%eax), %xmm0
-	movdqa	%xmm0, -30(%edx)
-L(fwd_write_14bytes_align):
-	movq	-14(%eax), %xmm0
-	movq	%xmm0, -14(%edx)
-L(fwd_write_6bytes_align):
-	movl	-6(%eax), %ecx
-	movl	%ecx, -6(%edx)
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_38bytes_align):
-	movdqa	-38(%eax), %xmm0
-	movdqa	%xmm0, -38(%edx)
-L(fwd_write_22bytes_align):
-	movdqa	-22(%eax), %xmm0
-	movdqa	%xmm0, -22(%edx)
-	movl	-6(%eax), %ecx
-	movl	%ecx, -6(%edx)
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_42bytes_align):
-	movdqa	-42(%eax), %xmm0
-	movdqa	%xmm0, -42(%edx)
-L(fwd_write_26bytes_align):
-	movdqa	-26(%eax), %xmm0
-	movdqa	%xmm0, -26(%edx)
-L(fwd_write_10bytes_align):
-	movq	-10(%eax), %xmm0
-	movq	%xmm0, -10(%edx)
-L(fwd_write_2bytes_align):
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_34bytes_align):
-	movdqa	-34(%eax), %xmm0
-	movdqa	%xmm0, -34(%edx)
-L(fwd_write_18bytes_align):
-	movdqa	-18(%eax), %xmm0
-	movdqa	%xmm0, -18(%edx)
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_47bytes_align):
-	movdqa	-47(%eax), %xmm0
-	movdqa	%xmm0, -47(%edx)
-L(fwd_write_31bytes_align):
-	movdqa	-31(%eax), %xmm0
-	movdqa	%xmm0, -31(%edx)
-L(fwd_write_15bytes_align):
-	movq	-15(%eax), %xmm0
-	movq	%xmm0, -15(%edx)
-L(fwd_write_7bytes_align):
-	movl	-7(%eax), %ecx
-	movl	%ecx, -7(%edx)
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_39bytes_align):
-	movdqa	-39(%eax), %xmm0
-	movdqa	%xmm0, -39(%edx)
-L(fwd_write_23bytes_align):
-	movdqa	-23(%eax), %xmm0
-	movdqa	%xmm0, -23(%edx)
-	movl	-7(%eax), %ecx
-	movl	%ecx, -7(%edx)
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_43bytes_align):
-	movdqa	-43(%eax), %xmm0
-	movdqa	%xmm0, -43(%edx)
-L(fwd_write_27bytes_align):
-	movdqa	-27(%eax), %xmm0
-	movdqa	%xmm0, -27(%edx)
-L(fwd_write_11bytes_align):
-	movq	-11(%eax), %xmm0
-	movq	%xmm0, -11(%edx)
-L(fwd_write_3bytes_align):
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_35bytes_align):
-	movdqa	-35(%eax), %xmm0
-	movdqa	%xmm0, -35(%edx)
-L(fwd_write_19bytes_align):
-	movdqa	-19(%eax), %xmm0
-	movdqa	%xmm0, -19(%edx)
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_44bytes_align):
-	movdqa	-44(%eax), %xmm0
-	movdqa	%xmm0, -44(%edx)
-L(fwd_write_28bytes_align):
-	movdqa	-28(%eax), %xmm0
-	movdqa	%xmm0, -28(%edx)
-L(fwd_write_12bytes_align):
-	movq	-12(%eax), %xmm0
-	movq	%xmm0, -12(%edx)
-L(fwd_write_4bytes_align):
-	movl	-4(%eax), %ecx
-	movl	%ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(fwd_write_36bytes_align):
-	movdqa	-36(%eax), %xmm0
-	movdqa	%xmm0, -36(%edx)
-L(fwd_write_20bytes_align):
-	movdqa	-20(%eax), %xmm0
-	movdqa	%xmm0, -20(%edx)
-	movl	-4(%eax), %ecx
-	movl	%ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-#else
-	movl	DEST(%esp), %eax
-#endif
-	RETURN_END
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(large_page):
-	movdqu	(%eax), %xmm1
-#ifdef USE_AS_MEMMOVE
-	movl	DEST+4(%esp), %edi
-	movdqu	%xmm0, (%edi)
-#endif
-	lea	16(%eax), %eax
-	movntdq	%xmm1, (%edx)
-	lea	16(%edx), %edx
-	lea	-0x90(%ecx), %ecx
-	POP (%edi)
-
-	.p2align 4
-L(large_page_loop):
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	movdqu	0x20(%eax), %xmm2
-	movdqu	0x30(%eax), %xmm3
-	movdqu	0x40(%eax), %xmm4
-	movdqu	0x50(%eax), %xmm5
-	movdqu	0x60(%eax), %xmm6
-	movdqu	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-
-	sub	$0x80, %ecx
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	movntdq	%xmm2, 0x20(%edx)
-	movntdq	%xmm3, 0x30(%edx)
-	movntdq	%xmm4, 0x40(%edx)
-	movntdq	%xmm5, 0x50(%edx)
-	movntdq	%xmm6, 0x60(%edx)
-	movntdq	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-	jae	L(large_page_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(large_page_less_64bytes)
-
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	movdqu	0x20(%eax), %xmm2
-	movdqu	0x30(%eax), %xmm3
-	lea	0x40(%eax), %eax
-
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	movntdq	%xmm2, 0x20(%edx)
-	movntdq	%xmm3, 0x30(%edx)
-	lea	0x40(%edx), %edx
-	sub	$0x40, %ecx
-L(large_page_less_64bytes):
-	cmp	$32, %ecx
-	jb	L(large_page_less_32bytes)
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	lea	0x20(%eax), %eax
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	lea	0x20(%edx), %edx
-	sub	$0x20, %ecx
-L(large_page_less_32bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	sfence
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-	.p2align 4
-L(bk_write_44bytes):
-	movq	36(%eax), %xmm0
-	movq	%xmm0, 36(%edx)
-L(bk_write_36bytes):
-	movq	28(%eax), %xmm0
-	movq	%xmm0, 28(%edx)
-L(bk_write_28bytes):
-	movq	20(%eax), %xmm0
-	movq	%xmm0, 20(%edx)
-L(bk_write_20bytes):
-	movq	12(%eax), %xmm0
-	movq	%xmm0, 12(%edx)
-L(bk_write_12bytes):
-	movq	4(%eax), %xmm0
-	movq	%xmm0, 4(%edx)
-L(bk_write_4bytes):
-	movl	(%eax), %ecx
-	movl	%ecx, (%edx)
-L(bk_write_0bytes):
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_40bytes):
-	movq	32(%eax), %xmm0
-	movq	%xmm0, 32(%edx)
-L(bk_write_32bytes):
-	movq	24(%eax), %xmm0
-	movq	%xmm0, 24(%edx)
-L(bk_write_24bytes):
-	movq	16(%eax), %xmm0
-	movq	%xmm0, 16(%edx)
-L(bk_write_16bytes):
-	movq	8(%eax), %xmm0
-	movq	%xmm0, 8(%edx)
-L(bk_write_8bytes):
-	movq	(%eax), %xmm0
-	movq	%xmm0, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_45bytes):
-	movq	37(%eax), %xmm0
-	movq	%xmm0, 37(%edx)
-L(bk_write_37bytes):
-	movq	29(%eax), %xmm0
-	movq	%xmm0, 29(%edx)
-L(bk_write_29bytes):
-	movq	21(%eax), %xmm0
-	movq	%xmm0, 21(%edx)
-L(bk_write_21bytes):
-	movq	13(%eax), %xmm0
-	movq	%xmm0, 13(%edx)
-L(bk_write_13bytes):
-	movq	5(%eax), %xmm0
-	movq	%xmm0, 5(%edx)
-L(bk_write_5bytes):
-	movl	1(%eax), %ecx
-	movl	%ecx, 1(%edx)
-L(bk_write_1bytes):
-	movzbl	(%eax), %ecx
-	movb	%cl, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_41bytes):
-	movq	33(%eax), %xmm0
-	movq	%xmm0, 33(%edx)
-L(bk_write_33bytes):
-	movq	25(%eax), %xmm0
-	movq	%xmm0, 25(%edx)
-L(bk_write_25bytes):
-	movq	17(%eax), %xmm0
-	movq	%xmm0, 17(%edx)
-L(bk_write_17bytes):
-	movq	9(%eax), %xmm0
-	movq	%xmm0, 9(%edx)
-L(bk_write_9bytes):
-	movq	1(%eax), %xmm0
-	movq	%xmm0, 1(%edx)
-	movzbl	(%eax), %ecx
-	movb	%cl, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_46bytes):
-	movq	38(%eax), %xmm0
-	movq	%xmm0, 38(%edx)
-L(bk_write_38bytes):
-	movq	30(%eax), %xmm0
-	movq	%xmm0, 30(%edx)
-L(bk_write_30bytes):
-	movq	22(%eax), %xmm0
-	movq	%xmm0, 22(%edx)
-L(bk_write_22bytes):
-	movq	14(%eax), %xmm0
-	movq	%xmm0, 14(%edx)
-L(bk_write_14bytes):
-	movq	6(%eax), %xmm0
-	movq	%xmm0, 6(%edx)
-L(bk_write_6bytes):
-	movl	2(%eax), %ecx
-	movl	%ecx, 2(%edx)
-	movzwl	(%eax), %ecx
-	movw	%cx, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_42bytes):
-	movq	34(%eax), %xmm0
-	movq	%xmm0, 34(%edx)
-L(bk_write_34bytes):
-	movq	26(%eax), %xmm0
-	movq	%xmm0, 26(%edx)
-L(bk_write_26bytes):
-	movq	18(%eax), %xmm0
-	movq	%xmm0, 18(%edx)
-L(bk_write_18bytes):
-	movq	10(%eax), %xmm0
-	movq	%xmm0, 10(%edx)
-L(bk_write_10bytes):
-	movq	2(%eax), %xmm0
-	movq	%xmm0, 2(%edx)
-L(bk_write_2bytes):
-	movzwl	(%eax), %ecx
-	movw	%cx, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_47bytes):
-	movq	39(%eax), %xmm0
-	movq	%xmm0, 39(%edx)
-L(bk_write_39bytes):
-	movq	31(%eax), %xmm0
-	movq	%xmm0, 31(%edx)
-L(bk_write_31bytes):
-	movq	23(%eax), %xmm0
-	movq	%xmm0, 23(%edx)
-L(bk_write_23bytes):
-	movq	15(%eax), %xmm0
-	movq	%xmm0, 15(%edx)
-L(bk_write_15bytes):
-	movq	7(%eax), %xmm0
-	movq	%xmm0, 7(%edx)
-L(bk_write_7bytes):
-	movl	3(%eax), %ecx
-	movl	%ecx, 3(%edx)
-	movzwl	1(%eax), %ecx
-	movw	%cx, 1(%edx)
-	movzbl	(%eax), %eax
-	movb	%al, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN
-
-	.p2align 4
-L(bk_write_43bytes):
-	movq	35(%eax), %xmm0
-	movq	%xmm0, 35(%edx)
-L(bk_write_35bytes):
-	movq	27(%eax), %xmm0
-	movq	%xmm0, 27(%edx)
-L(bk_write_27bytes):
-	movq	19(%eax), %xmm0
-	movq	%xmm0, 19(%edx)
-L(bk_write_19bytes):
-	movq	11(%eax), %xmm0
-	movq	%xmm0, 11(%edx)
-L(bk_write_11bytes):
-	movq	3(%eax), %xmm0
-	movq	%xmm0, 3(%edx)
-L(bk_write_3bytes):
-	movzwl	1(%eax), %ecx
-	movw	%cx, 1(%edx)
-	movzbl	(%eax), %eax
-	movb	%al, (%edx)
-	movl	DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-#endif
-	RETURN_END
-
-
-	.pushsection .rodata.ssse3,"a",@progbits
-	.p2align 2
-L(table_48bytes_fwd):
-	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
-
-	.p2align 2
-L(table_48bytes_fwd_align):
-	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
-	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
-
-	.p2align 2
-L(shl_table):
-	.int	JMPTBL (L(shl_0), L(shl_table))
-	.int	JMPTBL (L(shl_1), L(shl_table))
-	.int	JMPTBL (L(shl_2), L(shl_table))
-	.int	JMPTBL (L(shl_3), L(shl_table))
-	.int	JMPTBL (L(shl_4), L(shl_table))
-	.int	JMPTBL (L(shl_5), L(shl_table))
-	.int	JMPTBL (L(shl_6), L(shl_table))
-	.int	JMPTBL (L(shl_7), L(shl_table))
-	.int	JMPTBL (L(shl_8), L(shl_table))
-	.int	JMPTBL (L(shl_9), L(shl_table))
-	.int	JMPTBL (L(shl_10), L(shl_table))
-	.int	JMPTBL (L(shl_11), L(shl_table))
-	.int	JMPTBL (L(shl_12), L(shl_table))
-	.int	JMPTBL (L(shl_13), L(shl_table))
-	.int	JMPTBL (L(shl_14), L(shl_table))
-	.int	JMPTBL (L(shl_15), L(shl_table))
-
-	.p2align 2
-L(table_48_bytes_bwd):
-	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
-
-	.popsection
-
-#ifdef USE_AS_MEMMOVE
-	.p2align 4
-L(copy_backward):
-	PUSH (%edi)
-	movl	%eax, %edi
-	lea	(%ecx,%edx,1),%edx
-	lea	(%ecx,%edi,1),%edi
-	testl	$0x3, %edx
-	jnz	L(bk_align)
-
-L(bk_aligned_4):
-	cmp	$64, %ecx
-	jae	L(bk_write_more64bytes)
-
-L(bk_write_64bytesless):
-	cmp	$32, %ecx
-	jb	L(bk_write_less32bytes)
-
-L(bk_write_more32bytes):
-	/* Copy 32 bytes at a time.  */
-	sub	$32, %ecx
-	movq	-8(%edi), %xmm0
-	movq	%xmm0, -8(%edx)
-	movq	-16(%edi), %xmm0
-	movq	%xmm0, -16(%edx)
-	movq	-24(%edi), %xmm0
-	movq	%xmm0, -24(%edx)
-	movq	-32(%edi), %xmm0
-	movq	%xmm0, -32(%edx)
-	sub	$32, %edx
-	sub	$32, %edi
-
-L(bk_write_less32bytes):
-	movl	%edi, %eax
-	sub	%ecx, %edx
-	sub	%ecx, %eax
-	POP (%edi)
-L(bk_write_less32bytes_2):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-
-	CFI_PUSH (%edi)
-
-	.p2align 4
-L(bk_align):
-	cmp	$8, %ecx
-	jbe	L(bk_write_less32bytes)
-	testl	$1, %edx
-	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
-	then	(EDX & 2) must be != 0.  */
-	jz	L(bk_got2)
-	sub	$1, %edi
-	sub	$1, %ecx
-	sub	$1, %edx
-	movzbl	(%edi), %eax
-	movb	%al, (%edx)
-
-	testl	$2, %edx
-	jz	L(bk_aligned_4)
-
-L(bk_got2):
-	sub	$2, %edi
-	sub	$2, %ecx
-	sub	$2, %edx
-	movzwl	(%edi), %eax
-	movw	%ax, (%edx)
-	jmp	L(bk_aligned_4)
-
-	.p2align 4
-L(bk_write_more64bytes):
-	/* Check alignment of last byte.  */
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-/* EDX is aligned 4 bytes, but not 16 bytes.  */
-L(bk_ssse3_align):
-	sub	$4, %edi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%edi), %eax
-	movl	%eax, (%edx)
-
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-	sub	$4, %edi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%edi), %eax
-	movl	%eax, (%edx)
-
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-	sub	$4, %edi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%edi), %eax
-	movl	%eax, (%edx)
-
-L(bk_ssse3_cpy_pre):
-	cmp	$64, %ecx
-	jb	L(bk_write_more32bytes)
-
-	.p2align 4
-L(bk_ssse3_cpy):
-	sub	$64, %edi
-	sub	$64, %ecx
-	sub	$64, %edx
-	movdqu	0x30(%edi), %xmm3
-	movdqa	%xmm3, 0x30(%edx)
-	movdqu	0x20(%edi), %xmm2
-	movdqa	%xmm2, 0x20(%edx)
-	movdqu	0x10(%edi), %xmm1
-	movdqa	%xmm1, 0x10(%edx)
-	movdqu	(%edi), %xmm0
-	movdqa	%xmm0, (%edx)
-	cmp	$64, %ecx
-	jae	L(bk_ssse3_cpy)
-	jmp	L(bk_write_64bytesless)
-
-#endif
-
-END (MEMCPY)
diff --git a/libc/arch-x86/string/ssse3-memmove-atom.S b/libc/arch-x86/string/ssse3-memmove-atom.S
deleted file mode 100644
index 3572eac..0000000
--- a/libc/arch-x86/string/ssse3-memmove-atom.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#define MEMCPY memmove_atom
-#define USE_AS_MEMMOVE
-#include "ssse3-memcpy-atom.S"
diff --git a/libc/arch-x86/string/ssse3-strncpy-atom.S b/libc/arch-x86/string/ssse3-strncpy-atom.S
deleted file mode 100644
index 0c27ffe..0000000
--- a/libc/arch-x86/string/ssse3-strncpy-atom.S
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-Copyright (c) 2011, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define USE_AS_STRNCPY
-#define STRCPY strncpy_atom
-#include "ssse3-strcpy-atom.S"