libc: remove most dynamically-selected x86 atom functions
The 'generic' versions of these all have newer impls from Intel, and
seemingly use the same feature baseline (sse2 or ssse3). There's
no obvious reason to carry these old impls around.
While I'm here, also remove the ssse3-memcpy. While that might be
a bit faster, `s/memmove/memcpy/` seems slightly risky for 32-bit x86.
Bug: 358360849
Test: mma
Change-Id: I62cbf638bf4d9fdb799af93168b0cc0ea841526f
diff --git a/libc/Android.bp b/libc/Android.bp
index a4f2c69..532ed4d 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1186,7 +1186,6 @@
"arch-x86/string/sse2-memchr-atom.S",
"arch-x86/string/sse2-memmove-slm.S",
"arch-x86/string/sse2-memrchr-atom.S",
- "arch-x86/string/sse2-memset-atom.S",
"arch-x86/string/sse2-memset-slm.S",
"arch-x86/string/sse2-stpcpy-slm.S",
"arch-x86/string/sse2-stpncpy-slm.S",
@@ -1200,18 +1199,14 @@
"arch-x86/string/sse2-wcsrchr-atom.S",
"arch-x86/string/sse2-wcslen-atom.S",
"arch-x86/string/sse2-wcscmp-atom.S",
- "arch-x86/string/sse2-strlen-atom.S",
"arch-x86/string/ssse3-memcmp-atom.S",
- "arch-x86/string/ssse3-memmove-atom.S",
"arch-x86/string/ssse3-strcat-atom.S",
"arch-x86/string/ssse3-strcmp-atom.S",
- "arch-x86/string/ssse3-strcpy-atom.S",
"arch-x86/string/ssse3-strlcat-atom.S",
"arch-x86/string/ssse3-strlcpy-atom.S",
"arch-x86/string/ssse3-strncat-atom.S",
"arch-x86/string/ssse3-strncmp-atom.S",
- "arch-x86/string/ssse3-strncpy-atom.S",
"arch-x86/string/ssse3-wcscat-atom.S",
"arch-x86/string/ssse3-wcscpy-atom.S",
"arch-x86/string/ssse3-wmemcmp-atom.S",
diff --git a/libc/arch-x86/dynamic_function_dispatch.cpp b/libc/arch-x86/dynamic_function_dispatch.cpp
index e6cc5fb..db64237 100644
--- a/libc/arch-x86/dynamic_function_dispatch.cpp
+++ b/libc/arch-x86/dynamic_function_dispatch.cpp
@@ -39,51 +39,6 @@
}
MEMCMP_SHIM()
-DEFINE_IFUNC_FOR(memset) {
- __builtin_cpu_init();
- if (__builtin_cpu_is("atom")) RETURN_FUNC(memset_func_t, memset_atom);
- RETURN_FUNC(memset_func_t, memset_generic);
-}
-MEMSET_SHIM()
-
-DEFINE_IFUNC_FOR(__memset_chk) {
- __builtin_cpu_init();
- if (__builtin_cpu_is("atom")) RETURN_FUNC(__memset_chk_func_t, __memset_chk_atom);
- RETURN_FUNC(__memset_chk_func_t, __memset_chk_generic);
-}
-__MEMSET_CHK_SHIM()
-
-DEFINE_IFUNC_FOR(memmove) {
- __builtin_cpu_init();
- if (__builtin_cpu_is("atom")) RETURN_FUNC(memmove_func_t, memmove_atom);
- RETURN_FUNC(memmove_func_t, memmove_generic);
-}
-MEMMOVE_SHIM()
-
-DEFINE_IFUNC_FOR(memcpy) { return memmove_resolver(); }
-MEMCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strcpy) {
- __builtin_cpu_init();
- if (__builtin_cpu_is("atom")) RETURN_FUNC(strcpy_func_t, strcpy_atom);
- RETURN_FUNC(strcpy_func_t, strcpy_generic);
-}
-STRCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strncpy) {
- __builtin_cpu_init();
- if (__builtin_cpu_is("atom")) RETURN_FUNC(strncpy_func_t, strncpy_atom);
- RETURN_FUNC(strncpy_func_t, strncpy_generic);
-}
-STRNCPY_SHIM()
-
-DEFINE_IFUNC_FOR(strlen) {
- __builtin_cpu_init();
- if (__builtin_cpu_is("atom")) RETURN_FUNC(strlen_func_t, strlen_atom);
- RETURN_FUNC(strlen_func_t, strlen_generic);
-}
-STRLEN_SHIM()
-
typedef int wmemcmp_func_t(const wchar_t*, const wchar_t*, size_t);
DEFINE_IFUNC_FOR(wmemcmp) {
__builtin_cpu_init();
diff --git a/libc/arch-x86/string/sse2-memmove-slm.S b/libc/arch-x86/string/sse2-memmove-slm.S
index 7f42374..2ed4e7b 100644
--- a/libc/arch-x86/string/sse2-memmove-slm.S
+++ b/libc/arch-x86/string/sse2-memmove-slm.S
@@ -31,7 +31,7 @@
#define FOR_SILVERMONT
#ifndef MEMMOVE
-# define MEMMOVE memmove_generic
+# define MEMMOVE memmove
#endif
#ifndef L
@@ -551,3 +551,9 @@
jmp L(mm_recalc_len)
END (MEMMOVE)
+
+// N.B., `private/bionic_asm.h` provides ALIAS_SYMBOL, but that file provides
+// conflicting definitions for some macros in this file. Since ALIAS_SYMBOL is
+// small, inline it here.
+.globl memcpy;
+.equ memcpy, MEMMOVE
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/string/sse2-memset-atom.S
deleted file mode 100644
index e43ead0..0000000
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#include <private/bionic_asm.h>
-
-#define FOR_ATOM
-
-#ifndef L
-# define L(label) .L##label
-#endif
-
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
-#define CFI_PUSH(REG) \
- .cfi_adjust_cfa_offset 4; \
- .cfi_rel_offset REG, 0
-
-#define CFI_POP(REG) \
- .cfi_adjust_cfa_offset -4; \
- .cfi_restore REG
-
-#define PUSH(REG) pushl REG; CFI_PUSH(REG)
-#define POP(REG) popl REG; CFI_POP(REG)
-
-#define PARMS 8 /* Preserve EBX. */
-#define DST PARMS
-#define CHR (DST+4)
-#define LEN (CHR+4)
-#define CHK_DST_LEN (LEN+4)
-#define SETRTNVAL movl DST(%esp), %eax
-
-#define ENTRANCE PUSH(%ebx);
-#define RETURN_END POP(%ebx); ret
-#define RETURN RETURN_END; CFI_PUSH(%ebx)
-#define JMPTBL(I, B) I - B
-
-#define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x
-
-/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
- /* We first load PC into EBX. */ \
- call __x86.get_pc_thunk.bx; \
- /* Get the address of the jump table. */ \
- add $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- add (%ebx,%ecx,4), %ebx; \
- add %ecx, %edx; \
- /* We loaded the jump table and adjusted EDX. Go. */ \
- jmp *%ebx
-
-ENTRY(__memset_chk_atom)
- ENTRANCE
-
- movl LEN(%esp), %ecx
- cmpl CHK_DST_LEN(%esp), %ecx
- jna L(memset_length_loaded)
-
- POP(%ebx) // Undo ENTRANCE without returning.
- jmp __memset_chk_fail
-END(__memset_chk_atom)
-
- .section .text.sse2,"ax",@progbits
- ALIGN(4)
-ENTRY(memset_atom)
- ENTRANCE
-
- movl LEN(%esp), %ecx
-L(memset_length_loaded):
- movzbl CHR(%esp), %eax
- movb %al, %ah
- /* Fill the whole EAX with pattern. */
- movl %eax, %edx
- shl $16, %eax
- or %edx, %eax
- movl DST(%esp), %edx
- cmp $32, %ecx
- jae L(32bytesormore)
-
-L(write_less32bytes):
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_32bytes))
-
-
- .pushsection .rodata.sse2,"a",@progbits
- ALIGN(2)
-L(table_less_32bytes):
- .int JMPTBL(L(write_0bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_1bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_2bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_3bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_4bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_5bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_6bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_7bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_8bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_9bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_10bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_11bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_12bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_13bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_14bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_15bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_16bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_17bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_18bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_19bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_20bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_21bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_22bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_23bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_24bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_25bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_26bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_27bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_28bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_29bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_30bytes), L(table_less_32bytes))
- .int JMPTBL(L(write_31bytes), L(table_less_32bytes))
- .popsection
-
- ALIGN(4)
-L(write_28bytes):
- movl %eax, -28(%edx)
-L(write_24bytes):
- movl %eax, -24(%edx)
-L(write_20bytes):
- movl %eax, -20(%edx)
-L(write_16bytes):
- movl %eax, -16(%edx)
-L(write_12bytes):
- movl %eax, -12(%edx)
-L(write_8bytes):
- movl %eax, -8(%edx)
-L(write_4bytes):
- movl %eax, -4(%edx)
-L(write_0bytes):
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(write_29bytes):
- movl %eax, -29(%edx)
-L(write_25bytes):
- movl %eax, -25(%edx)
-L(write_21bytes):
- movl %eax, -21(%edx)
-L(write_17bytes):
- movl %eax, -17(%edx)
-L(write_13bytes):
- movl %eax, -13(%edx)
-L(write_9bytes):
- movl %eax, -9(%edx)
-L(write_5bytes):
- movl %eax, -5(%edx)
-L(write_1bytes):
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(write_30bytes):
- movl %eax, -30(%edx)
-L(write_26bytes):
- movl %eax, -26(%edx)
-L(write_22bytes):
- movl %eax, -22(%edx)
-L(write_18bytes):
- movl %eax, -18(%edx)
-L(write_14bytes):
- movl %eax, -14(%edx)
-L(write_10bytes):
- movl %eax, -10(%edx)
-L(write_6bytes):
- movl %eax, -6(%edx)
-L(write_2bytes):
- movw %ax, -2(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(write_31bytes):
- movl %eax, -31(%edx)
-L(write_27bytes):
- movl %eax, -27(%edx)
-L(write_23bytes):
- movl %eax, -23(%edx)
-L(write_19bytes):
- movl %eax, -19(%edx)
-L(write_15bytes):
- movl %eax, -15(%edx)
-L(write_11bytes):
- movl %eax, -11(%edx)
-L(write_7bytes):
- movl %eax, -7(%edx)
-L(write_3bytes):
- movw %ax, -3(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-/* ECX > 32 and EDX is 4 byte aligned. */
-L(32bytesormore):
- /* Fill xmm0 with the pattern. */
- movd %eax, %xmm0
- pshufd $0, %xmm0, %xmm0
- testl $0xf, %edx
- jz L(aligned_16)
-/* ECX > 32 and EDX is not 16 byte aligned. */
-L(not_aligned_16):
- movdqu %xmm0, (%edx)
- movl %edx, %eax
- and $-16, %edx
- add $16, %edx
- sub %edx, %eax
- add %eax, %ecx
- movd %xmm0, %eax
-
- ALIGN(4)
-L(aligned_16):
- cmp $128, %ecx
- jae L(128bytesormore)
-
-L(aligned_16_less128bytes):
- BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
- ALIGN(4)
-L(128bytesormore):
- PUSH(%ebx)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
- cmp %ebx, %ecx
- jae L(128bytesormore_nt_start)
-
-
- POP(%ebx)
-# define RESTORE_EBX_STATE CFI_PUSH(%ebx)
- PUSH(%ebx)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
- POP(%ebx)
-
- jae L(128bytes_L2_normal)
- subl $128, %ecx
-L(128bytesormore_normal):
- sub $128, %ecx
- movdqa %xmm0, (%edx)
- movdqa %xmm0, 0x10(%edx)
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm0, 0x30(%edx)
- movdqa %xmm0, 0x40(%edx)
- movdqa %xmm0, 0x50(%edx)
- movdqa %xmm0, 0x60(%edx)
- movdqa %xmm0, 0x70(%edx)
- lea 128(%edx), %edx
- jb L(128bytesless_normal)
-
-
- sub $128, %ecx
- movdqa %xmm0, (%edx)
- movdqa %xmm0, 0x10(%edx)
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm0, 0x30(%edx)
- movdqa %xmm0, 0x40(%edx)
- movdqa %xmm0, 0x50(%edx)
- movdqa %xmm0, 0x60(%edx)
- movdqa %xmm0, 0x70(%edx)
- lea 128(%edx), %edx
- jae L(128bytesormore_normal)
-
-L(128bytesless_normal):
- add $128, %ecx
- BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
- ALIGN(4)
-L(128bytes_L2_normal):
- prefetcht0 0x380(%edx)
- prefetcht0 0x3c0(%edx)
- sub $128, %ecx
- movdqa %xmm0, (%edx)
- movaps %xmm0, 0x10(%edx)
- movaps %xmm0, 0x20(%edx)
- movaps %xmm0, 0x30(%edx)
- movaps %xmm0, 0x40(%edx)
- movaps %xmm0, 0x50(%edx)
- movaps %xmm0, 0x60(%edx)
- movaps %xmm0, 0x70(%edx)
- add $128, %edx
- cmp $128, %ecx
- jae L(128bytes_L2_normal)
-
-L(128bytesless_L2_normal):
- BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
- RESTORE_EBX_STATE
-L(128bytesormore_nt_start):
- sub %ebx, %ecx
- mov %ebx, %eax
- and $0x7f, %eax
- add %eax, %ecx
- movd %xmm0, %eax
- ALIGN(4)
-L(128bytesormore_shared_cache_loop):
- prefetcht0 0x3c0(%edx)
- prefetcht0 0x380(%edx)
- sub $0x80, %ebx
- movdqa %xmm0, (%edx)
- movdqa %xmm0, 0x10(%edx)
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm0, 0x30(%edx)
- movdqa %xmm0, 0x40(%edx)
- movdqa %xmm0, 0x50(%edx)
- movdqa %xmm0, 0x60(%edx)
- movdqa %xmm0, 0x70(%edx)
- add $0x80, %edx
- cmp $0x80, %ebx
- jae L(128bytesormore_shared_cache_loop)
- cmp $0x80, %ecx
- jb L(shared_cache_loop_end)
- ALIGN(4)
-L(128bytesormore_nt):
- sub $0x80, %ecx
- movntdq %xmm0, (%edx)
- movntdq %xmm0, 0x10(%edx)
- movntdq %xmm0, 0x20(%edx)
- movntdq %xmm0, 0x30(%edx)
- movntdq %xmm0, 0x40(%edx)
- movntdq %xmm0, 0x50(%edx)
- movntdq %xmm0, 0x60(%edx)
- movntdq %xmm0, 0x70(%edx)
- add $0x80, %edx
- cmp $0x80, %ecx
- jae L(128bytesormore_nt)
- sfence
-L(shared_cache_loop_end):
- POP(%ebx)
- BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
-
-
- .pushsection .rodata.sse2,"a",@progbits
- ALIGN(2)
-L(table_16_128bytes):
- .int JMPTBL(L(aligned_16_0bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_1bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_2bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_3bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_4bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_5bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_6bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_7bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_8bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_9bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_10bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_11bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_12bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_13bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_14bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_15bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_16bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_17bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_18bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_19bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_20bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_21bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_22bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_23bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_24bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_25bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_26bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_27bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_28bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_29bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_30bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_31bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_32bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_33bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_34bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_35bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_36bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_37bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_38bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_39bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_40bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_41bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_42bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_43bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_44bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_45bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_46bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_47bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_48bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_49bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_50bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_51bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_52bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_53bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_54bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_55bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_56bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_57bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_58bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_59bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_60bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_61bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_62bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_63bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_64bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_65bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_66bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_67bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_68bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_69bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_70bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_71bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_72bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_73bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_74bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_75bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_76bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_77bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_78bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_79bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_80bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_81bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_82bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_83bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_84bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_85bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_86bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_87bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_88bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_89bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_90bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_91bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_92bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_93bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_94bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_95bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_96bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_97bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_98bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_99bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_100bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_101bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_102bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_103bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_104bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_105bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_106bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_107bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_108bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_109bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_110bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_111bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_112bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_113bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_114bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_115bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_116bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_117bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_118bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_119bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_120bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_121bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_122bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_123bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_124bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_125bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_126bytes), L(table_16_128bytes))
- .int JMPTBL(L(aligned_16_127bytes), L(table_16_128bytes))
- .popsection
-
- ALIGN(4)
-L(aligned_16_112bytes):
- movdqa %xmm0, -112(%edx)
-L(aligned_16_96bytes):
- movdqa %xmm0, -96(%edx)
-L(aligned_16_80bytes):
- movdqa %xmm0, -80(%edx)
-L(aligned_16_64bytes):
- movdqa %xmm0, -64(%edx)
-L(aligned_16_48bytes):
- movdqa %xmm0, -48(%edx)
-L(aligned_16_32bytes):
- movdqa %xmm0, -32(%edx)
-L(aligned_16_16bytes):
- movdqa %xmm0, -16(%edx)
-L(aligned_16_0bytes):
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_113bytes):
- movdqa %xmm0, -113(%edx)
-L(aligned_16_97bytes):
- movdqa %xmm0, -97(%edx)
-L(aligned_16_81bytes):
- movdqa %xmm0, -81(%edx)
-L(aligned_16_65bytes):
- movdqa %xmm0, -65(%edx)
-L(aligned_16_49bytes):
- movdqa %xmm0, -49(%edx)
-L(aligned_16_33bytes):
- movdqa %xmm0, -33(%edx)
-L(aligned_16_17bytes):
- movdqa %xmm0, -17(%edx)
-L(aligned_16_1bytes):
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_114bytes):
- movdqa %xmm0, -114(%edx)
-L(aligned_16_98bytes):
- movdqa %xmm0, -98(%edx)
-L(aligned_16_82bytes):
- movdqa %xmm0, -82(%edx)
-L(aligned_16_66bytes):
- movdqa %xmm0, -66(%edx)
-L(aligned_16_50bytes):
- movdqa %xmm0, -50(%edx)
-L(aligned_16_34bytes):
- movdqa %xmm0, -34(%edx)
-L(aligned_16_18bytes):
- movdqa %xmm0, -18(%edx)
-L(aligned_16_2bytes):
- movw %ax, -2(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_115bytes):
- movdqa %xmm0, -115(%edx)
-L(aligned_16_99bytes):
- movdqa %xmm0, -99(%edx)
-L(aligned_16_83bytes):
- movdqa %xmm0, -83(%edx)
-L(aligned_16_67bytes):
- movdqa %xmm0, -67(%edx)
-L(aligned_16_51bytes):
- movdqa %xmm0, -51(%edx)
-L(aligned_16_35bytes):
- movdqa %xmm0, -35(%edx)
-L(aligned_16_19bytes):
- movdqa %xmm0, -19(%edx)
-L(aligned_16_3bytes):
- movw %ax, -3(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_116bytes):
- movdqa %xmm0, -116(%edx)
-L(aligned_16_100bytes):
- movdqa %xmm0, -100(%edx)
-L(aligned_16_84bytes):
- movdqa %xmm0, -84(%edx)
-L(aligned_16_68bytes):
- movdqa %xmm0, -68(%edx)
-L(aligned_16_52bytes):
- movdqa %xmm0, -52(%edx)
-L(aligned_16_36bytes):
- movdqa %xmm0, -36(%edx)
-L(aligned_16_20bytes):
- movdqa %xmm0, -20(%edx)
-L(aligned_16_4bytes):
- movl %eax, -4(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_117bytes):
- movdqa %xmm0, -117(%edx)
-L(aligned_16_101bytes):
- movdqa %xmm0, -101(%edx)
-L(aligned_16_85bytes):
- movdqa %xmm0, -85(%edx)
-L(aligned_16_69bytes):
- movdqa %xmm0, -69(%edx)
-L(aligned_16_53bytes):
- movdqa %xmm0, -53(%edx)
-L(aligned_16_37bytes):
- movdqa %xmm0, -37(%edx)
-L(aligned_16_21bytes):
- movdqa %xmm0, -21(%edx)
-L(aligned_16_5bytes):
- movl %eax, -5(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_118bytes):
- movdqa %xmm0, -118(%edx)
-L(aligned_16_102bytes):
- movdqa %xmm0, -102(%edx)
-L(aligned_16_86bytes):
- movdqa %xmm0, -86(%edx)
-L(aligned_16_70bytes):
- movdqa %xmm0, -70(%edx)
-L(aligned_16_54bytes):
- movdqa %xmm0, -54(%edx)
-L(aligned_16_38bytes):
- movdqa %xmm0, -38(%edx)
-L(aligned_16_22bytes):
- movdqa %xmm0, -22(%edx)
-L(aligned_16_6bytes):
- movl %eax, -6(%edx)
- movw %ax, -2(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_119bytes):
- movdqa %xmm0, -119(%edx)
-L(aligned_16_103bytes):
- movdqa %xmm0, -103(%edx)
-L(aligned_16_87bytes):
- movdqa %xmm0, -87(%edx)
-L(aligned_16_71bytes):
- movdqa %xmm0, -71(%edx)
-L(aligned_16_55bytes):
- movdqa %xmm0, -55(%edx)
-L(aligned_16_39bytes):
- movdqa %xmm0, -39(%edx)
-L(aligned_16_23bytes):
- movdqa %xmm0, -23(%edx)
-L(aligned_16_7bytes):
- movl %eax, -7(%edx)
- movw %ax, -3(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_120bytes):
- movdqa %xmm0, -120(%edx)
-L(aligned_16_104bytes):
- movdqa %xmm0, -104(%edx)
-L(aligned_16_88bytes):
- movdqa %xmm0, -88(%edx)
-L(aligned_16_72bytes):
- movdqa %xmm0, -72(%edx)
-L(aligned_16_56bytes):
- movdqa %xmm0, -56(%edx)
-L(aligned_16_40bytes):
- movdqa %xmm0, -40(%edx)
-L(aligned_16_24bytes):
- movdqa %xmm0, -24(%edx)
-L(aligned_16_8bytes):
- movq %xmm0, -8(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_121bytes):
- movdqa %xmm0, -121(%edx)
-L(aligned_16_105bytes):
- movdqa %xmm0, -105(%edx)
-L(aligned_16_89bytes):
- movdqa %xmm0, -89(%edx)
-L(aligned_16_73bytes):
- movdqa %xmm0, -73(%edx)
-L(aligned_16_57bytes):
- movdqa %xmm0, -57(%edx)
-L(aligned_16_41bytes):
- movdqa %xmm0, -41(%edx)
-L(aligned_16_25bytes):
- movdqa %xmm0, -25(%edx)
-L(aligned_16_9bytes):
- movq %xmm0, -9(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_122bytes):
- movdqa %xmm0, -122(%edx)
-L(aligned_16_106bytes):
- movdqa %xmm0, -106(%edx)
-L(aligned_16_90bytes):
- movdqa %xmm0, -90(%edx)
-L(aligned_16_74bytes):
- movdqa %xmm0, -74(%edx)
-L(aligned_16_58bytes):
- movdqa %xmm0, -58(%edx)
-L(aligned_16_42bytes):
- movdqa %xmm0, -42(%edx)
-L(aligned_16_26bytes):
- movdqa %xmm0, -26(%edx)
-L(aligned_16_10bytes):
- movq %xmm0, -10(%edx)
- movw %ax, -2(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_123bytes):
- movdqa %xmm0, -123(%edx)
-L(aligned_16_107bytes):
- movdqa %xmm0, -107(%edx)
-L(aligned_16_91bytes):
- movdqa %xmm0, -91(%edx)
-L(aligned_16_75bytes):
- movdqa %xmm0, -75(%edx)
-L(aligned_16_59bytes):
- movdqa %xmm0, -59(%edx)
-L(aligned_16_43bytes):
- movdqa %xmm0, -43(%edx)
-L(aligned_16_27bytes):
- movdqa %xmm0, -27(%edx)
-L(aligned_16_11bytes):
- movq %xmm0, -11(%edx)
- movw %ax, -3(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_124bytes):
- movdqa %xmm0, -124(%edx)
-L(aligned_16_108bytes):
- movdqa %xmm0, -108(%edx)
-L(aligned_16_92bytes):
- movdqa %xmm0, -92(%edx)
-L(aligned_16_76bytes):
- movdqa %xmm0, -76(%edx)
-L(aligned_16_60bytes):
- movdqa %xmm0, -60(%edx)
-L(aligned_16_44bytes):
- movdqa %xmm0, -44(%edx)
-L(aligned_16_28bytes):
- movdqa %xmm0, -28(%edx)
-L(aligned_16_12bytes):
- movq %xmm0, -12(%edx)
- movl %eax, -4(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_125bytes):
- movdqa %xmm0, -125(%edx)
-L(aligned_16_109bytes):
- movdqa %xmm0, -109(%edx)
-L(aligned_16_93bytes):
- movdqa %xmm0, -93(%edx)
-L(aligned_16_77bytes):
- movdqa %xmm0, -77(%edx)
-L(aligned_16_61bytes):
- movdqa %xmm0, -61(%edx)
-L(aligned_16_45bytes):
- movdqa %xmm0, -45(%edx)
-L(aligned_16_29bytes):
- movdqa %xmm0, -29(%edx)
-L(aligned_16_13bytes):
- movq %xmm0, -13(%edx)
- movl %eax, -5(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_126bytes):
- movdqa %xmm0, -126(%edx)
-L(aligned_16_110bytes):
- movdqa %xmm0, -110(%edx)
-L(aligned_16_94bytes):
- movdqa %xmm0, -94(%edx)
-L(aligned_16_78bytes):
- movdqa %xmm0, -78(%edx)
-L(aligned_16_62bytes):
- movdqa %xmm0, -62(%edx)
-L(aligned_16_46bytes):
- movdqa %xmm0, -46(%edx)
-L(aligned_16_30bytes):
- movdqa %xmm0, -30(%edx)
-L(aligned_16_14bytes):
- movq %xmm0, -14(%edx)
- movl %eax, -6(%edx)
- movw %ax, -2(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN(4)
-L(aligned_16_127bytes):
- movdqa %xmm0, -127(%edx)
-L(aligned_16_111bytes):
- movdqa %xmm0, -111(%edx)
-L(aligned_16_95bytes):
- movdqa %xmm0, -95(%edx)
-L(aligned_16_79bytes):
- movdqa %xmm0, -79(%edx)
-L(aligned_16_63bytes):
- movdqa %xmm0, -63(%edx)
-L(aligned_16_47bytes):
- movdqa %xmm0, -47(%edx)
-L(aligned_16_31bytes):
- movdqa %xmm0, -31(%edx)
-L(aligned_16_15bytes):
- movq %xmm0, -15(%edx)
- movl %eax, -7(%edx)
- movw %ax, -3(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN_END
-
-END(memset_atom)
diff --git a/libc/arch-x86/string/sse2-memset-slm.S b/libc/arch-x86/string/sse2-memset-slm.S
index e4c8fa1..ec2ee52 100644
--- a/libc/arch-x86/string/sse2-memset-slm.S
+++ b/libc/arch-x86/string/sse2-memset-slm.S
@@ -79,7 +79,7 @@
/* We loaded the jump table and adjusted EDX. Go. */ \
jmp *%ebx
-ENTRY(__memset_chk_generic)
+ENTRY(__memset_chk)
ENTRANCE
movl LEN(%esp), %ecx
@@ -88,11 +88,11 @@
POP(%ebx) // Undo ENTRANCE without returning.
jmp __memset_chk_fail
-END(__memset_chk_generic)
+END(__memset_chk)
.section .text.sse2,"ax",@progbits
ALIGN(4)
-ENTRY(memset_generic)
+ENTRY(memset)
ENTRANCE
movl LEN(%esp), %ecx
@@ -755,4 +755,4 @@
SETRTNVAL
RETURN_END
-END(memset_generic)
+END(memset)
diff --git a/libc/arch-x86/string/sse2-strcpy-slm.S b/libc/arch-x86/string/sse2-strcpy-slm.S
index 22ceeab..b5d84b5 100644
--- a/libc/arch-x86/string/sse2-strcpy-slm.S
+++ b/libc/arch-x86/string/sse2-strcpy-slm.S
@@ -79,7 +79,7 @@
#define POP(REG) popl REG; CFI_POP (REG)
#ifndef STRCPY
-# define STRCPY strcpy_generic
+# define STRCPY strcpy
#endif
#ifdef USE_AS_STPNCPY
diff --git a/libc/arch-x86/string/sse2-strlen-slm.S b/libc/arch-x86/string/sse2-strlen-slm.S
index b805ad6..27cc025 100644
--- a/libc/arch-x86/string/sse2-strlen-slm.S
+++ b/libc/arch-x86/string/sse2-strlen-slm.S
@@ -29,7 +29,7 @@
*/
#ifndef STRLEN
-# define STRLEN strlen_generic
+# define STRLEN strlen
#endif
#ifndef L
diff --git a/libc/arch-x86/string/sse2-strncpy-slm.S b/libc/arch-x86/string/sse2-strncpy-slm.S
index aff7fb9..591419f 100644
--- a/libc/arch-x86/string/sse2-strncpy-slm.S
+++ b/libc/arch-x86/string/sse2-strncpy-slm.S
@@ -29,5 +29,5 @@
*/
#define USE_AS_STRNCPY
-#define STRCPY strncpy_generic
+#define STRCPY strncpy
#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/string/ssse3-memcpy-atom.S
deleted file mode 100644
index 83e1985..0000000
--- a/libc/arch-x86/string/ssse3-memcpy-atom.S
+++ /dev/null
@@ -1,3124 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define FOR_ATOM
-
-#ifndef MEMCPY
-# define MEMCPY memcpy_atom
-#endif
-
-#ifndef L
-# define L(label) .L##label
-#endif
-
-#ifndef cfi_startproc
-# define cfi_startproc .cfi_startproc
-#endif
-
-#ifndef cfi_endproc
-# define cfi_endproc .cfi_endproc
-#endif
-
-#ifndef cfi_rel_offset
-# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
-#endif
-
-#ifndef cfi_restore
-# define cfi_restore(reg) .cfi_restore reg
-#endif
-
-#ifndef cfi_adjust_cfa_offset
-# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
-#endif
-
-#ifndef ENTRY
-# define ENTRY(name) \
- .type name, @function; \
- .globl name; \
- .p2align 4; \
-name: \
- cfi_startproc
-#endif
-
-#ifndef END
-# define END(name) \
- cfi_endproc; \
- .size name, .-name
-#endif
-
-#define DEST PARMS
-#define SRC DEST+4
-#define LEN SRC+4
-
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
-
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
-
-#if (defined SHARED || defined __PIC__)
-# define PARMS 8 /* Preserve EBX. */
-# define ENTRANCE PUSH (%ebx);
-# define RETURN_END POP (%ebx); ret
-# define RETURN RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B) I - B
-
-# define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x
-
-/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- /* We first load PC into EBX. */ \
- SETUP_PIC_REG(bx); \
- /* Get the address of the jump table. */ \
- addl $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- addl (%ebx, INDEX, SCALE), %ebx; \
- /* We loaded the jump table. Go. */ \
- jmp *%ebx
-#else
-
-# define PARMS 4
-# define ENTRANCE
-# define RETURN_END ret
-# define RETURN RETURN_END
-# define JMPTBL(I, B) I
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- absolute offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- jmp *TABLE(, INDEX, SCALE)
-#endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (MEMCPY)
- ENTRANCE
- movl LEN(%esp), %ecx
- movl SRC(%esp), %eax
- movl DEST(%esp), %edx
-
-#ifdef USE_AS_MEMMOVE
- cmp %eax, %edx
- jb L(copy_forward)
- je L(fwd_write_0bytes)
- cmp $32, %ecx
- jae L(memmove_bwd)
- jmp L(bk_write_less32bytes_2)
-
- .p2align 4
-L(memmove_bwd):
- add %ecx, %eax
- cmp %eax, %edx
- movl SRC(%esp), %eax
- jb L(copy_backward)
-
-L(copy_forward):
-#endif
- cmp $48, %ecx
- jae L(48bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dl, %al
- jb L(bk_write)
-#endif
- add %ecx, %edx
- add %ecx, %eax
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
- .p2align 4
-L(bk_write):
- BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
-
- .p2align 4
-L(48bytesormore):
-#ifndef USE_AS_MEMMOVE
- movlpd (%eax), %xmm0
- movlpd 8(%eax), %xmm1
- movlpd %xmm0, (%edx)
- movlpd %xmm1, 8(%edx)
-#else
- movdqu (%eax), %xmm0
-#endif
- PUSH (%edi)
- movl %edx, %edi
- and $-16, %edx
- add $16, %edx
- sub %edx, %edi
- add %edi, %ecx
- sub %edi, %eax
-
-#ifdef SHARED_CACHE_SIZE_HALF
- cmp $SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_shared_cache_size_half, %ecx
-# endif
-#endif
-
- mov %eax, %edi
- jae L(large_page)
- and $0xf, %edi
- jz L(shl_0)
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
-
- .p2align 4
-L(shl_0):
-#ifdef USE_AS_MEMMOVE
- movl DEST+4(%esp), %edi
- movdqu %xmm0, (%edi)
-#endif
- xor %edi, %edi
- cmp $127, %ecx
- ja L(shl_0_gobble)
- lea -32(%ecx), %ecx
-
- .p2align 4
-L(shl_0_loop):
- movdqa (%eax, %edi), %xmm0
- movdqa 16(%eax, %edi), %xmm1
- sub $32, %ecx
- movdqa %xmm0, (%edx, %edi)
- movdqa %xmm1, 16(%edx, %edi)
- lea 32(%edi), %edi
- jb L(shl_0_end)
-
- movdqa (%eax, %edi), %xmm0
- movdqa 16(%eax, %edi), %xmm1
- sub $32, %ecx
- movdqa %xmm0, (%edx, %edi)
- movdqa %xmm1, 16(%edx, %edi)
- lea 32(%edi), %edi
- jb L(shl_0_end)
-
- movdqa (%eax, %edi), %xmm0
- movdqa 16(%eax, %edi), %xmm1
- sub $32, %ecx
- movdqa %xmm0, (%edx, %edi)
- movdqa %xmm1, 16(%edx, %edi)
- lea 32(%edi), %edi
- jb L(shl_0_end)
-
- movdqa (%eax, %edi), %xmm0
- movdqa 16(%eax, %edi), %xmm1
- sub $32, %ecx
- movdqa %xmm0, (%edx, %edi)
- movdqa %xmm1, 16(%edx, %edi)
- lea 32(%edi), %edi
-
-L(shl_0_end):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- add %edi, %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- POP (%edi)
- lea -128(%ecx), %ecx
- jae L(shl_0_gobble_mem_loop)
-
- .p2align 4
-L(shl_0_gobble_cache_loop):
- movdqa (%eax), %xmm0
- movdqa 0x10(%eax), %xmm1
- movdqa 0x20(%eax), %xmm2
- movdqa 0x30(%eax), %xmm3
- movdqa 0x40(%eax), %xmm4
- movdqa 0x50(%eax), %xmm5
- movdqa 0x60(%eax), %xmm6
- movdqa 0x70(%eax), %xmm7
- lea 0x80(%eax), %eax
- sub $128, %ecx
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- movdqa %xmm2, 0x20(%edx)
- movdqa %xmm3, 0x30(%edx)
- movdqa %xmm4, 0x40(%edx)
- movdqa %xmm5, 0x50(%edx)
- movdqa %xmm6, 0x60(%edx)
- movdqa %xmm7, 0x70(%edx)
- lea 0x80(%edx), %edx
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %ecx
- lea 0x80(%ecx), %ecx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%eax), %xmm0
- sub $0x40, %ecx
- movdqa 0x10(%eax), %xmm1
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- movdqa 0x20(%eax), %xmm0
- movdqa 0x30(%eax), %xmm1
- add $0x40, %eax
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm1, 0x30(%edx)
- add $0x40, %edx
-
-L(shl_0_cache_less_64bytes):
- cmp $0x20, %ecx
- jb L(shl_0_cache_less_32bytes)
- movdqa (%eax), %xmm0
- sub $0x20, %ecx
- movdqa 0x10(%eax), %xmm1
- add $0x20, %eax
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- add $0x20, %edx
-
-L(shl_0_cache_less_32bytes):
- cmp $0x10, %ecx
- jb L(shl_0_cache_less_16bytes)
- sub $0x10, %ecx
- movdqa (%eax), %xmm0
- add $0x10, %eax
- movdqa %xmm0, (%edx)
- add $0x10, %edx
-
-L(shl_0_cache_less_16bytes):
- add %ecx, %edx
- add %ecx, %eax
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x280(%eax)
- prefetcht0 0x1c0(%edx)
-
- movdqa (%eax), %xmm0
- movdqa 0x10(%eax), %xmm1
- movdqa 0x20(%eax), %xmm2
- movdqa 0x30(%eax), %xmm3
- movdqa 0x40(%eax), %xmm4
- movdqa 0x50(%eax), %xmm5
- movdqa 0x60(%eax), %xmm6
- movdqa 0x70(%eax), %xmm7
- lea 0x80(%eax), %eax
- sub $0x80, %ecx
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- movdqa %xmm2, 0x20(%edx)
- movdqa %xmm3, 0x30(%edx)
- movdqa %xmm4, 0x40(%edx)
- movdqa %xmm5, 0x50(%edx)
- movdqa %xmm6, 0x60(%edx)
- movdqa %xmm7, 0x70(%edx)
- lea 0x80(%edx), %edx
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %ecx
- lea 0x80(%ecx), %ecx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%eax), %xmm0
- sub $0x40, %ecx
- movdqa 0x10(%eax), %xmm1
-
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
-
- movdqa 0x20(%eax), %xmm0
- movdqa 0x30(%eax), %xmm1
- add $0x40, %eax
-
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm1, 0x30(%edx)
- add $0x40, %edx
-
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %ecx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%eax), %xmm0
- sub $0x20, %ecx
- movdqa 0x10(%eax), %xmm1
- add $0x20, %eax
- movdqa %xmm0, (%edx)
- movdqa %xmm1, 0x10(%edx)
- add $0x20, %edx
-
-L(shl_0_mem_less_32bytes):
- cmp $0x10, %ecx
- jb L(shl_0_mem_less_16bytes)
- sub $0x10, %ecx
- movdqa (%eax), %xmm0
- add $0x10, %eax
- movdqa %xmm0, (%edx)
- add $0x10, %edx
-
-L(shl_0_mem_less_16bytes):
- add %ecx, %edx
- add %ecx, %eax
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
-
- .p2align 4
-L(shl_1):
-#ifndef USE_AS_MEMMOVE
- movaps -1(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -1(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_1_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl1LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 15(%eax), %xmm2
- movaps 31(%eax), %xmm3
- movaps 47(%eax), %xmm4
- movaps 63(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- palignr $1, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $1, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $1, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl1LoopStart)
-
-L(Shl1LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 15(%eax), %xmm2
- movaps 31(%eax), %xmm3
- palignr $1, %xmm2, %xmm3
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_1_no_prefetch):
- lea -32(%ecx), %ecx
- lea -1(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_1_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- palignr $1, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_1_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $1, %xmm2, %xmm3
- palignr $1, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_1_no_prefetch_loop)
-
-L(sh_1_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 1(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_2):
-#ifndef USE_AS_MEMMOVE
- movaps -2(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -2(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_2_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl2LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 14(%eax), %xmm2
- movaps 30(%eax), %xmm3
- movaps 46(%eax), %xmm4
- movaps 62(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- palignr $2, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $2, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $2, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl2LoopStart)
-
-L(Shl2LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 14(%eax), %xmm2
- movaps 30(%eax), %xmm3
- palignr $2, %xmm2, %xmm3
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_2_no_prefetch):
- lea -32(%ecx), %ecx
- lea -2(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_2_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- palignr $2, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_2_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $2, %xmm2, %xmm3
- palignr $2, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_2_no_prefetch_loop)
-
-L(sh_2_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 2(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_3):
-#ifndef USE_AS_MEMMOVE
- movaps -3(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -3(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_3_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl3LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 13(%eax), %xmm2
- movaps 29(%eax), %xmm3
- movaps 45(%eax), %xmm4
- movaps 61(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- palignr $3, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $3, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $3, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl3LoopStart)
-
-L(Shl3LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 13(%eax), %xmm2
- movaps 29(%eax), %xmm3
- palignr $3, %xmm2, %xmm3
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_3_no_prefetch):
- lea -32(%ecx), %ecx
- lea -3(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_3_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- palignr $3, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jb L(sh_3_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $3, %xmm2, %xmm3
- palignr $3, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jae L(sh_3_no_prefetch_loop)
-
-L(sh_3_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 3(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_4):
-#ifndef USE_AS_MEMMOVE
- movaps -4(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -4(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_4_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl4LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 12(%eax), %xmm2
- movaps 28(%eax), %xmm3
- movaps 44(%eax), %xmm4
- movaps 60(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- palignr $4, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $4, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $4, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl4LoopStart)
-
-L(Shl4LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 12(%eax), %xmm2
- movaps 28(%eax), %xmm3
- palignr $4, %xmm2, %xmm3
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_4_no_prefetch):
- lea -32(%ecx), %ecx
- lea -4(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_4_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- palignr $4, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jb L(sh_4_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $4, %xmm2, %xmm3
- palignr $4, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jae L(sh_4_no_prefetch_loop)
-
-L(sh_4_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 4(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_5):
-#ifndef USE_AS_MEMMOVE
- movaps -5(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -5(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_5_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl5LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 11(%eax), %xmm2
- movaps 27(%eax), %xmm3
- movaps 43(%eax), %xmm4
- movaps 59(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $5, %xmm4, %xmm5
- palignr $5, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $5, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $5, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl5LoopStart)
-
-L(Shl5LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 11(%eax), %xmm2
- movaps 27(%eax), %xmm3
- palignr $5, %xmm2, %xmm3
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_5_no_prefetch):
- lea -32(%ecx), %ecx
- lea -5(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_5_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- palignr $5, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jb L(sh_5_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $5, %xmm2, %xmm3
- palignr $5, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jae L(sh_5_no_prefetch_loop)
-
-L(sh_5_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 5(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_6):
-#ifndef USE_AS_MEMMOVE
- movaps -6(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -6(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_6_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl6LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 10(%eax), %xmm2
- movaps 26(%eax), %xmm3
- movaps 42(%eax), %xmm4
- movaps 58(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $6, %xmm4, %xmm5
- palignr $6, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $6, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $6, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl6LoopStart)
-
-L(Shl6LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 10(%eax), %xmm2
- movaps 26(%eax), %xmm3
- palignr $6, %xmm2, %xmm3
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_6_no_prefetch):
- lea -32(%ecx), %ecx
- lea -6(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_6_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $6, %xmm2, %xmm3
- palignr $6, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jb L(sh_6_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $6, %xmm2, %xmm3
- palignr $6, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
-
- jae L(sh_6_no_prefetch_loop)
-
-L(sh_6_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 6(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_7):
-#ifndef USE_AS_MEMMOVE
- movaps -7(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -7(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_7_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl7LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 9(%eax), %xmm2
- movaps 25(%eax), %xmm3
- movaps 41(%eax), %xmm4
- movaps 57(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $7, %xmm4, %xmm5
- palignr $7, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $7, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $7, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl7LoopStart)
-
-L(Shl7LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 9(%eax), %xmm2
- movaps 25(%eax), %xmm3
- palignr $7, %xmm2, %xmm3
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_7_no_prefetch):
- lea -32(%ecx), %ecx
- lea -7(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_7_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $7, %xmm2, %xmm3
- palignr $7, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_7_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $7, %xmm2, %xmm3
- palignr $7, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_7_no_prefetch_loop)
-
-L(sh_7_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 7(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_8):
-#ifndef USE_AS_MEMMOVE
- movaps -8(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -8(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_8_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl8LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 8(%eax), %xmm2
- movaps 24(%eax), %xmm3
- movaps 40(%eax), %xmm4
- movaps 56(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- palignr $8, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $8, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $8, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl8LoopStart)
-
-L(LoopLeave8):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 8(%eax), %xmm2
- movaps 24(%eax), %xmm3
- palignr $8, %xmm2, %xmm3
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_8_no_prefetch):
- lea -32(%ecx), %ecx
- lea -8(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_8_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- palignr $8, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_8_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $8, %xmm2, %xmm3
- palignr $8, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_8_no_prefetch_loop)
-
-L(sh_8_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 8(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_9):
-#ifndef USE_AS_MEMMOVE
- movaps -9(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -9(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_9_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl9LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 7(%eax), %xmm2
- movaps 23(%eax), %xmm3
- movaps 39(%eax), %xmm4
- movaps 55(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $9, %xmm4, %xmm5
- palignr $9, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $9, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $9, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl9LoopStart)
-
-L(Shl9LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 7(%eax), %xmm2
- movaps 23(%eax), %xmm3
- palignr $9, %xmm2, %xmm3
- palignr $9, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_9_no_prefetch):
- lea -32(%ecx), %ecx
- lea -9(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_9_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $9, %xmm2, %xmm3
- palignr $9, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_9_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $9, %xmm2, %xmm3
- palignr $9, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_9_no_prefetch_loop)
-
-L(sh_9_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 9(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_10):
-#ifndef USE_AS_MEMMOVE
- movaps -10(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -10(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_10_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl10LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 6(%eax), %xmm2
- movaps 22(%eax), %xmm3
- movaps 38(%eax), %xmm4
- movaps 54(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $10, %xmm4, %xmm5
- palignr $10, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $10, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $10, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl10LoopStart)
-
-L(Shl10LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 6(%eax), %xmm2
- movaps 22(%eax), %xmm3
- palignr $10, %xmm2, %xmm3
- palignr $10, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_10_no_prefetch):
- lea -32(%ecx), %ecx
- lea -10(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_10_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $10, %xmm2, %xmm3
- palignr $10, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_10_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $10, %xmm2, %xmm3
- palignr $10, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_10_no_prefetch_loop)
-
-L(sh_10_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 10(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_11):
-#ifndef USE_AS_MEMMOVE
- movaps -11(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -11(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_11_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl11LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 5(%eax), %xmm2
- movaps 21(%eax), %xmm3
- movaps 37(%eax), %xmm4
- movaps 53(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- palignr $11, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $11, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $11, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl11LoopStart)
-
-L(Shl11LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 5(%eax), %xmm2
- movaps 21(%eax), %xmm3
- palignr $11, %xmm2, %xmm3
- palignr $11, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_11_no_prefetch):
- lea -32(%ecx), %ecx
- lea -11(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_11_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $11, %xmm2, %xmm3
- palignr $11, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_11_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $11, %xmm2, %xmm3
- palignr $11, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_11_no_prefetch_loop)
-
-L(sh_11_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 11(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_12):
-#ifndef USE_AS_MEMMOVE
- movaps -12(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -12(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_12_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl12LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 4(%eax), %xmm2
- movaps 20(%eax), %xmm3
- movaps 36(%eax), %xmm4
- movaps 52(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- palignr $12, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $12, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $12, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl12LoopStart)
-
-L(Shl12LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 4(%eax), %xmm2
- movaps 20(%eax), %xmm3
- palignr $12, %xmm2, %xmm3
- palignr $12, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_12_no_prefetch):
- lea -32(%ecx), %ecx
- lea -12(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_12_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- palignr $12, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_12_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $12, %xmm2, %xmm3
- palignr $12, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_12_no_prefetch_loop)
-
-L(sh_12_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 12(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_13):
-#ifndef USE_AS_MEMMOVE
- movaps -13(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -13(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_13_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl13LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 3(%eax), %xmm2
- movaps 19(%eax), %xmm3
- movaps 35(%eax), %xmm4
- movaps 51(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $13, %xmm4, %xmm5
- palignr $13, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $13, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $13, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl13LoopStart)
-
-L(Shl13LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 3(%eax), %xmm2
- movaps 19(%eax), %xmm3
- palignr $13, %xmm2, %xmm3
- palignr $13, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_13_no_prefetch):
- lea -32(%ecx), %ecx
- lea -13(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_13_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $13, %xmm2, %xmm3
- palignr $13, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_13_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $13, %xmm2, %xmm3
- palignr $13, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_13_no_prefetch_loop)
-
-L(sh_13_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 13(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_14):
-#ifndef USE_AS_MEMMOVE
- movaps -14(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -14(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_14_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl14LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 2(%eax), %xmm2
- movaps 18(%eax), %xmm3
- movaps 34(%eax), %xmm4
- movaps 50(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $14, %xmm4, %xmm5
- palignr $14, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $14, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $14, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl14LoopStart)
-
-L(Shl14LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 2(%eax), %xmm2
- movaps 18(%eax), %xmm3
- palignr $14, %xmm2, %xmm3
- palignr $14, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_14_no_prefetch):
- lea -32(%ecx), %ecx
- lea -14(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_14_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $14, %xmm2, %xmm3
- palignr $14, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_14_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $14, %xmm2, %xmm3
- palignr $14, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_14_no_prefetch_loop)
-
-L(sh_14_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 14(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_15):
-#ifndef USE_AS_MEMMOVE
- movaps -15(%eax), %xmm1
-#else
- movl DEST+4(%esp), %edi
- movaps -15(%eax), %xmm1
- movdqu %xmm0, (%edi)
-#endif
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# if (defined SHARED || defined __PIC__)
- SETUP_PIC_REG(bx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
- cmp __x86_data_cache_size_half, %ecx
-# endif
-#endif
- jb L(sh_15_no_prefetch)
-
- lea -64(%ecx), %ecx
-
- .p2align 4
-L(Shl15LoopStart):
- prefetcht0 0x1c0(%eax)
- prefetcht0 0x1c0(%edx)
- movaps 1(%eax), %xmm2
- movaps 17(%eax), %xmm3
- movaps 33(%eax), %xmm4
- movaps 49(%eax), %xmm5
- movaps %xmm5, %xmm7
- palignr $15, %xmm4, %xmm5
- palignr $15, %xmm3, %xmm4
- movaps %xmm5, 48(%edx)
- palignr $15, %xmm2, %xmm3
- lea 64(%eax), %eax
- palignr $15, %xmm1, %xmm2
- movaps %xmm4, 32(%edx)
- movaps %xmm3, 16(%edx)
- movaps %xmm7, %xmm1
- movaps %xmm2, (%edx)
- lea 64(%edx), %edx
- sub $64, %ecx
- ja L(Shl15LoopStart)
-
-L(Shl15LoopLeave):
- add $32, %ecx
- jle L(shl_end_0)
-
- movaps 1(%eax), %xmm2
- movaps 17(%eax), %xmm3
- palignr $15, %xmm2, %xmm3
- palignr $15, %xmm1, %xmm2
-
- movaps %xmm2, (%edx)
- movaps %xmm3, 16(%edx)
- lea 32(%edx, %ecx), %edx
- lea 32(%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(sh_15_no_prefetch):
- lea -32(%ecx), %ecx
- lea -15(%eax), %eax
- xor %edi, %edi
-
- .p2align 4
-L(sh_15_no_prefetch_loop):
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm4
- palignr $15, %xmm2, %xmm3
- palignr $15, %xmm1, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jb L(sh_15_end_no_prefetch_loop)
-
- movdqa 16(%eax, %edi), %xmm2
- sub $32, %ecx
- movdqa 32(%eax, %edi), %xmm3
- movdqa %xmm3, %xmm1
- palignr $15, %xmm2, %xmm3
- palignr $15, %xmm4, %xmm2
- lea 32(%edi), %edi
- movdqa %xmm2, -32(%edx, %edi)
- movdqa %xmm3, -16(%edx, %edi)
- jae L(sh_15_no_prefetch_loop)
-
-L(sh_15_end_no_prefetch_loop):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 15(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(shl_end_0):
- lea 32(%ecx), %ecx
- lea (%edx, %ecx), %edx
- lea (%eax, %ecx), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
-
- .p2align 4
-L(fwd_write_44bytes):
- movq -44(%eax), %xmm0
- movq %xmm0, -44(%edx)
-L(fwd_write_36bytes):
- movq -36(%eax), %xmm0
- movq %xmm0, -36(%edx)
-L(fwd_write_28bytes):
- movq -28(%eax), %xmm0
- movq %xmm0, -28(%edx)
-L(fwd_write_20bytes):
- movq -20(%eax), %xmm0
- movq %xmm0, -20(%edx)
-L(fwd_write_12bytes):
- movq -12(%eax), %xmm0
- movq %xmm0, -12(%edx)
-L(fwd_write_4bytes):
- movl -4(%eax), %ecx
- movl %ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_40bytes):
- movq -40(%eax), %xmm0
- movq %xmm0, -40(%edx)
-L(fwd_write_32bytes):
- movq -32(%eax), %xmm0
- movq %xmm0, -32(%edx)
-L(fwd_write_24bytes):
- movq -24(%eax), %xmm0
- movq %xmm0, -24(%edx)
-L(fwd_write_16bytes):
- movq -16(%eax), %xmm0
- movq %xmm0, -16(%edx)
-L(fwd_write_8bytes):
- movq -8(%eax), %xmm0
- movq %xmm0, -8(%edx)
-L(fwd_write_0bytes):
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_5bytes):
- movl -5(%eax), %ecx
- movl -4(%eax), %eax
- movl %ecx, -5(%edx)
- movl %eax, -4(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_45bytes):
- movq -45(%eax), %xmm0
- movq %xmm0, -45(%edx)
-L(fwd_write_37bytes):
- movq -37(%eax), %xmm0
- movq %xmm0, -37(%edx)
-L(fwd_write_29bytes):
- movq -29(%eax), %xmm0
- movq %xmm0, -29(%edx)
-L(fwd_write_21bytes):
- movq -21(%eax), %xmm0
- movq %xmm0, -21(%edx)
-L(fwd_write_13bytes):
- movq -13(%eax), %xmm0
- movq %xmm0, -13(%edx)
- movl -5(%eax), %ecx
- movl %ecx, -5(%edx)
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_41bytes):
- movq -41(%eax), %xmm0
- movq %xmm0, -41(%edx)
-L(fwd_write_33bytes):
- movq -33(%eax), %xmm0
- movq %xmm0, -33(%edx)
-L(fwd_write_25bytes):
- movq -25(%eax), %xmm0
- movq %xmm0, -25(%edx)
-L(fwd_write_17bytes):
- movq -17(%eax), %xmm0
- movq %xmm0, -17(%edx)
-L(fwd_write_9bytes):
- movq -9(%eax), %xmm0
- movq %xmm0, -9(%edx)
-L(fwd_write_1bytes):
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_46bytes):
- movq -46(%eax), %xmm0
- movq %xmm0, -46(%edx)
-L(fwd_write_38bytes):
- movq -38(%eax), %xmm0
- movq %xmm0, -38(%edx)
-L(fwd_write_30bytes):
- movq -30(%eax), %xmm0
- movq %xmm0, -30(%edx)
-L(fwd_write_22bytes):
- movq -22(%eax), %xmm0
- movq %xmm0, -22(%edx)
-L(fwd_write_14bytes):
- movq -14(%eax), %xmm0
- movq %xmm0, -14(%edx)
-L(fwd_write_6bytes):
- movl -6(%eax), %ecx
- movl %ecx, -6(%edx)
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_42bytes):
- movq -42(%eax), %xmm0
- movq %xmm0, -42(%edx)
-L(fwd_write_34bytes):
- movq -34(%eax), %xmm0
- movq %xmm0, -34(%edx)
-L(fwd_write_26bytes):
- movq -26(%eax), %xmm0
- movq %xmm0, -26(%edx)
-L(fwd_write_18bytes):
- movq -18(%eax), %xmm0
- movq %xmm0, -18(%edx)
-L(fwd_write_10bytes):
- movq -10(%eax), %xmm0
- movq %xmm0, -10(%edx)
-L(fwd_write_2bytes):
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_47bytes):
- movq -47(%eax), %xmm0
- movq %xmm0, -47(%edx)
-L(fwd_write_39bytes):
- movq -39(%eax), %xmm0
- movq %xmm0, -39(%edx)
-L(fwd_write_31bytes):
- movq -31(%eax), %xmm0
- movq %xmm0, -31(%edx)
-L(fwd_write_23bytes):
- movq -23(%eax), %xmm0
- movq %xmm0, -23(%edx)
-L(fwd_write_15bytes):
- movq -15(%eax), %xmm0
- movq %xmm0, -15(%edx)
-L(fwd_write_7bytes):
- movl -7(%eax), %ecx
- movl %ecx, -7(%edx)
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_43bytes):
- movq -43(%eax), %xmm0
- movq %xmm0, -43(%edx)
-L(fwd_write_35bytes):
- movq -35(%eax), %xmm0
- movq %xmm0, -35(%edx)
-L(fwd_write_27bytes):
- movq -27(%eax), %xmm0
- movq %xmm0, -27(%edx)
-L(fwd_write_19bytes):
- movq -19(%eax), %xmm0
- movq %xmm0, -19(%edx)
-L(fwd_write_11bytes):
- movq -11(%eax), %xmm0
- movq %xmm0, -11(%edx)
-L(fwd_write_3bytes):
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_40bytes_align):
- movdqa -40(%eax), %xmm0
- movdqa %xmm0, -40(%edx)
-L(fwd_write_24bytes_align):
- movdqa -24(%eax), %xmm0
- movdqa %xmm0, -24(%edx)
-L(fwd_write_8bytes_align):
- movq -8(%eax), %xmm0
- movq %xmm0, -8(%edx)
-L(fwd_write_0bytes_align):
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_32bytes_align):
- movdqa -32(%eax), %xmm0
- movdqa %xmm0, -32(%edx)
-L(fwd_write_16bytes_align):
- movdqa -16(%eax), %xmm0
- movdqa %xmm0, -16(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_5bytes_align):
- movl -5(%eax), %ecx
- movl -4(%eax), %eax
- movl %ecx, -5(%edx)
- movl %eax, -4(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_45bytes_align):
- movdqa -45(%eax), %xmm0
- movdqa %xmm0, -45(%edx)
-L(fwd_write_29bytes_align):
- movdqa -29(%eax), %xmm0
- movdqa %xmm0, -29(%edx)
-L(fwd_write_13bytes_align):
- movq -13(%eax), %xmm0
- movq %xmm0, -13(%edx)
- movl -5(%eax), %ecx
- movl %ecx, -5(%edx)
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_37bytes_align):
- movdqa -37(%eax), %xmm0
- movdqa %xmm0, -37(%edx)
-L(fwd_write_21bytes_align):
- movdqa -21(%eax), %xmm0
- movdqa %xmm0, -21(%edx)
- movl -5(%eax), %ecx
- movl %ecx, -5(%edx)
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_41bytes_align):
- movdqa -41(%eax), %xmm0
- movdqa %xmm0, -41(%edx)
-L(fwd_write_25bytes_align):
- movdqa -25(%eax), %xmm0
- movdqa %xmm0, -25(%edx)
-L(fwd_write_9bytes_align):
- movq -9(%eax), %xmm0
- movq %xmm0, -9(%edx)
-L(fwd_write_1bytes_align):
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_33bytes_align):
- movdqa -33(%eax), %xmm0
- movdqa %xmm0, -33(%edx)
-L(fwd_write_17bytes_align):
- movdqa -17(%eax), %xmm0
- movdqa %xmm0, -17(%edx)
- movzbl -1(%eax), %ecx
- movb %cl, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_46bytes_align):
- movdqa -46(%eax), %xmm0
- movdqa %xmm0, -46(%edx)
-L(fwd_write_30bytes_align):
- movdqa -30(%eax), %xmm0
- movdqa %xmm0, -30(%edx)
-L(fwd_write_14bytes_align):
- movq -14(%eax), %xmm0
- movq %xmm0, -14(%edx)
-L(fwd_write_6bytes_align):
- movl -6(%eax), %ecx
- movl %ecx, -6(%edx)
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_38bytes_align):
- movdqa -38(%eax), %xmm0
- movdqa %xmm0, -38(%edx)
-L(fwd_write_22bytes_align):
- movdqa -22(%eax), %xmm0
- movdqa %xmm0, -22(%edx)
- movl -6(%eax), %ecx
- movl %ecx, -6(%edx)
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_42bytes_align):
- movdqa -42(%eax), %xmm0
- movdqa %xmm0, -42(%edx)
-L(fwd_write_26bytes_align):
- movdqa -26(%eax), %xmm0
- movdqa %xmm0, -26(%edx)
-L(fwd_write_10bytes_align):
- movq -10(%eax), %xmm0
- movq %xmm0, -10(%edx)
-L(fwd_write_2bytes_align):
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_34bytes_align):
- movdqa -34(%eax), %xmm0
- movdqa %xmm0, -34(%edx)
-L(fwd_write_18bytes_align):
- movdqa -18(%eax), %xmm0
- movdqa %xmm0, -18(%edx)
- movzwl -2(%eax), %ecx
- movw %cx, -2(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_47bytes_align):
- movdqa -47(%eax), %xmm0
- movdqa %xmm0, -47(%edx)
-L(fwd_write_31bytes_align):
- movdqa -31(%eax), %xmm0
- movdqa %xmm0, -31(%edx)
-L(fwd_write_15bytes_align):
- movq -15(%eax), %xmm0
- movq %xmm0, -15(%edx)
-L(fwd_write_7bytes_align):
- movl -7(%eax), %ecx
- movl %ecx, -7(%edx)
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_39bytes_align):
- movdqa -39(%eax), %xmm0
- movdqa %xmm0, -39(%edx)
-L(fwd_write_23bytes_align):
- movdqa -23(%eax), %xmm0
- movdqa %xmm0, -23(%edx)
- movl -7(%eax), %ecx
- movl %ecx, -7(%edx)
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_43bytes_align):
- movdqa -43(%eax), %xmm0
- movdqa %xmm0, -43(%edx)
-L(fwd_write_27bytes_align):
- movdqa -27(%eax), %xmm0
- movdqa %xmm0, -27(%edx)
-L(fwd_write_11bytes_align):
- movq -11(%eax), %xmm0
- movq %xmm0, -11(%edx)
-L(fwd_write_3bytes_align):
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_35bytes_align):
- movdqa -35(%eax), %xmm0
- movdqa %xmm0, -35(%edx)
-L(fwd_write_19bytes_align):
- movdqa -19(%eax), %xmm0
- movdqa %xmm0, -19(%edx)
- movzwl -3(%eax), %ecx
- movzbl -1(%eax), %eax
- movw %cx, -3(%edx)
- movb %al, -1(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_44bytes_align):
- movdqa -44(%eax), %xmm0
- movdqa %xmm0, -44(%edx)
-L(fwd_write_28bytes_align):
- movdqa -28(%eax), %xmm0
- movdqa %xmm0, -28(%edx)
-L(fwd_write_12bytes_align):
- movq -12(%eax), %xmm0
- movq %xmm0, -12(%edx)
-L(fwd_write_4bytes_align):
- movl -4(%eax), %ecx
- movl %ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN
-
- .p2align 4
-L(fwd_write_36bytes_align):
- movdqa -36(%eax), %xmm0
- movdqa %xmm0, -36(%edx)
-L(fwd_write_20bytes_align):
- movdqa -20(%eax), %xmm0
- movdqa %xmm0, -20(%edx)
- movl -4(%eax), %ecx
- movl %ecx, -4(%edx)
-#ifdef USE_AS_MEMPCPY
- movl %edx, %eax
-#else
- movl DEST(%esp), %eax
-#endif
- RETURN_END
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(large_page):
- movdqu (%eax), %xmm1
-#ifdef USE_AS_MEMMOVE
- movl DEST+4(%esp), %edi
- movdqu %xmm0, (%edi)
-#endif
- lea 16(%eax), %eax
- movntdq %xmm1, (%edx)
- lea 16(%edx), %edx
- lea -0x90(%ecx), %ecx
- POP (%edi)
-
- .p2align 4
-L(large_page_loop):
- movdqu (%eax), %xmm0
- movdqu 0x10(%eax), %xmm1
- movdqu 0x20(%eax), %xmm2
- movdqu 0x30(%eax), %xmm3
- movdqu 0x40(%eax), %xmm4
- movdqu 0x50(%eax), %xmm5
- movdqu 0x60(%eax), %xmm6
- movdqu 0x70(%eax), %xmm7
- lea 0x80(%eax), %eax
-
- sub $0x80, %ecx
- movntdq %xmm0, (%edx)
- movntdq %xmm1, 0x10(%edx)
- movntdq %xmm2, 0x20(%edx)
- movntdq %xmm3, 0x30(%edx)
- movntdq %xmm4, 0x40(%edx)
- movntdq %xmm5, 0x50(%edx)
- movntdq %xmm6, 0x60(%edx)
- movntdq %xmm7, 0x70(%edx)
- lea 0x80(%edx), %edx
- jae L(large_page_loop)
- cmp $-0x40, %ecx
- lea 0x80(%ecx), %ecx
- jl L(large_page_less_64bytes)
-
- movdqu (%eax), %xmm0
- movdqu 0x10(%eax), %xmm1
- movdqu 0x20(%eax), %xmm2
- movdqu 0x30(%eax), %xmm3
- lea 0x40(%eax), %eax
-
- movntdq %xmm0, (%edx)
- movntdq %xmm1, 0x10(%edx)
- movntdq %xmm2, 0x20(%edx)
- movntdq %xmm3, 0x30(%edx)
- lea 0x40(%edx), %edx
- sub $0x40, %ecx
-L(large_page_less_64bytes):
- cmp $32, %ecx
- jb L(large_page_less_32bytes)
- movdqu (%eax), %xmm0
- movdqu 0x10(%eax), %xmm1
- lea 0x20(%eax), %eax
- movntdq %xmm0, (%edx)
- movntdq %xmm1, 0x10(%edx)
- lea 0x20(%edx), %edx
- sub $0x20, %ecx
-L(large_page_less_32bytes):
- add %ecx, %edx
- add %ecx, %eax
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
- .p2align 4
-L(bk_write_44bytes):
- movq 36(%eax), %xmm0
- movq %xmm0, 36(%edx)
-L(bk_write_36bytes):
- movq 28(%eax), %xmm0
- movq %xmm0, 28(%edx)
-L(bk_write_28bytes):
- movq 20(%eax), %xmm0
- movq %xmm0, 20(%edx)
-L(bk_write_20bytes):
- movq 12(%eax), %xmm0
- movq %xmm0, 12(%edx)
-L(bk_write_12bytes):
- movq 4(%eax), %xmm0
- movq %xmm0, 4(%edx)
-L(bk_write_4bytes):
- movl (%eax), %ecx
- movl %ecx, (%edx)
-L(bk_write_0bytes):
- movl DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx
- add %ecx, %eax
-#endif
- RETURN
-
- .p2align 4
-L(bk_write_40bytes):
- movq 32(%eax), %xmm0
- movq %xmm0, 32(%edx)
-L(bk_write_32bytes):
- movq 24(%eax), %xmm0
- movq %xmm0, 24(%edx)
-L(bk_write_24bytes):
- movq 16(%eax), %xmm0
- movq %xmm0, 16(%edx)
-L(bk_write_16bytes):
- movq 8(%eax), %xmm0
- movq %xmm0, 8(%edx)
-L(bk_write_8bytes):
- movq (%eax), %xmm0
- movq %xmm0, (%edx)
- movl DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx
- add %ecx, %eax
-#endif
- RETURN
-
- .p2align 4
-L(bk_write_45bytes):
- movq 37(%eax), %xmm0
- movq %xmm0, 37(%edx)
-L(bk_write_37bytes):
- movq 29(%eax), %xmm0
- movq %xmm0, 29(%edx)
-L(bk_write_29bytes):
- movq 21(%eax), %xmm0
- movq %xmm0, 21(%edx)
-L(bk_write_21bytes):
- movq 13(%eax), %xmm0
- movq %xmm0, 13(%edx)
-L(bk_write_13bytes):
- movq 5(%eax), %xmm0
- movq %xmm0, 5(%edx)
-L(bk_write_5bytes):
- movl 1(%eax), %ecx
- movl %ecx, 1(%edx)
-L(bk_write_1bytes):
- movzbl (%eax), %ecx
- movb %cl, (%edx)
- movl DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx
- add %ecx, %eax
-#endif
- RETURN
-
- .p2align 4
-L(bk_write_41bytes):
- movq 33(%eax), %xmm0
- movq %xmm0, 33(%edx)
-L(bk_write_33bytes):
- movq 25(%eax), %xmm0
- movq %xmm0, 25(%edx)
-L(bk_write_25bytes):
- movq 17(%eax), %xmm0
- movq %xmm0, 17(%edx)
-L(bk_write_17bytes):
- movq 9(%eax), %xmm0
- movq %xmm0, 9(%edx)
-L(bk_write_9bytes):
- movq 1(%eax), %xmm0
- movq %xmm0, 1(%edx)
- movzbl (%eax), %ecx
- movb %cl, (%edx)
- movl DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx
- add %ecx, %eax
-#endif
- RETURN
-
- .p2align 4
-L(bk_write_46bytes):
- movq 38(%eax), %xmm0
- movq %xmm0, 38(%edx)
-L(bk_write_38bytes):
- movq 30(%eax), %xmm0
- movq %xmm0, 30(%edx)
-L(bk_write_30bytes):
- movq 22(%eax), %xmm0
- movq %xmm0, 22(%edx)
-L(bk_write_22bytes):
- movq 14(%eax), %xmm0
- movq %xmm0, 14(%edx)
-L(bk_write_14bytes):
- movq 6(%eax), %xmm0
- movq %xmm0, 6(%edx)
-L(bk_write_6bytes):
- movl 2(%eax), %ecx
- movl %ecx, 2(%edx)
- movzwl (%eax), %ecx
- movw %cx, (%edx)
- movl DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx
- add %ecx, %eax
-#endif
- RETURN
-
- .p2align 4
-L(bk_write_42bytes):
- movq 34(%eax), %xmm0
- movq %xmm0, 34(%edx)
-L(bk_write_34bytes):
- movq 26(%eax), %xmm0
- movq %xmm0, 26(%edx)
-L(bk_write_26bytes):
- movq 18(%eax), %xmm0
- movq %xmm0, 18(%edx)
-L(bk_write_18bytes):
- movq 10(%eax), %xmm0
- movq %xmm0, 10(%edx)
-L(bk_write_10bytes):
- movq 2(%eax), %xmm0
- movq %xmm0, 2(%edx)
-L(bk_write_2bytes):
- movzwl (%eax), %ecx
- movw %cx, (%edx)
- movl DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx
- add %ecx, %eax
-#endif
- RETURN
-
- .p2align 4
-L(bk_write_47bytes):
- movq 39(%eax), %xmm0
- movq %xmm0, 39(%edx)
-L(bk_write_39bytes):
- movq 31(%eax), %xmm0
- movq %xmm0, 31(%edx)
-L(bk_write_31bytes):
- movq 23(%eax), %xmm0
- movq %xmm0, 23(%edx)
-L(bk_write_23bytes):
- movq 15(%eax), %xmm0
- movq %xmm0, 15(%edx)
-L(bk_write_15bytes):
- movq 7(%eax), %xmm0
- movq %xmm0, 7(%edx)
-L(bk_write_7bytes):
- movl 3(%eax), %ecx
- movl %ecx, 3(%edx)
- movzwl 1(%eax), %ecx
- movw %cx, 1(%edx)
- movzbl (%eax), %eax
- movb %al, (%edx)
- movl DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx
- add %ecx, %eax
-#endif
- RETURN
-
- .p2align 4
-L(bk_write_43bytes):
- movq 35(%eax), %xmm0
- movq %xmm0, 35(%edx)
-L(bk_write_35bytes):
- movq 27(%eax), %xmm0
- movq %xmm0, 27(%edx)
-L(bk_write_27bytes):
- movq 19(%eax), %xmm0
- movq %xmm0, 19(%edx)
-L(bk_write_19bytes):
- movq 11(%eax), %xmm0
- movq %xmm0, 11(%edx)
-L(bk_write_11bytes):
- movq 3(%eax), %xmm0
- movq %xmm0, 3(%edx)
-L(bk_write_3bytes):
- movzwl 1(%eax), %ecx
- movw %cx, 1(%edx)
- movzbl (%eax), %eax
- movb %al, (%edx)
- movl DEST(%esp), %eax
-#ifdef USE_AS_MEMPCPY
- movl LEN(%esp), %ecx
- add %ecx, %eax
-#endif
- RETURN_END
-
-
- .pushsection .rodata.ssse3,"a",@progbits
- .p2align 2
-L(table_48bytes_fwd):
- .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
-
- .p2align 2
-L(table_48bytes_fwd_align):
- .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
- .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
-
- .p2align 2
-L(shl_table):
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 2
-L(table_48_bytes_bwd):
- .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
- .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
-
- .popsection
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(copy_backward):
- PUSH (%edi)
- movl %eax, %edi
- lea (%ecx,%edx,1),%edx
- lea (%ecx,%edi,1),%edi
- testl $0x3, %edx
- jnz L(bk_align)
-
-L(bk_aligned_4):
- cmp $64, %ecx
- jae L(bk_write_more64bytes)
-
-L(bk_write_64bytesless):
- cmp $32, %ecx
- jb L(bk_write_less32bytes)
-
-L(bk_write_more32bytes):
- /* Copy 32 bytes at a time. */
- sub $32, %ecx
- movq -8(%edi), %xmm0
- movq %xmm0, -8(%edx)
- movq -16(%edi), %xmm0
- movq %xmm0, -16(%edx)
- movq -24(%edi), %xmm0
- movq %xmm0, -24(%edx)
- movq -32(%edi), %xmm0
- movq %xmm0, -32(%edx)
- sub $32, %edx
- sub $32, %edi
-
-L(bk_write_less32bytes):
- movl %edi, %eax
- sub %ecx, %edx
- sub %ecx, %eax
- POP (%edi)
-L(bk_write_less32bytes_2):
- BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-
- CFI_PUSH (%edi)
-
- .p2align 4
-L(bk_align):
- cmp $8, %ecx
- jbe L(bk_write_less32bytes)
- testl $1, %edx
- /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
- then (EDX & 2) must be != 0. */
- jz L(bk_got2)
- sub $1, %edi
- sub $1, %ecx
- sub $1, %edx
- movzbl (%edi), %eax
- movb %al, (%edx)
-
- testl $2, %edx
- jz L(bk_aligned_4)
-
-L(bk_got2):
- sub $2, %edi
- sub $2, %ecx
- sub $2, %edx
- movzwl (%edi), %eax
- movw %ax, (%edx)
- jmp L(bk_aligned_4)
-
- .p2align 4
-L(bk_write_more64bytes):
- /* Check alignment of last byte. */
- testl $15, %edx
- jz L(bk_ssse3_cpy_pre)
-
-/* EDX is aligned 4 bytes, but not 16 bytes. */
-L(bk_ssse3_align):
- sub $4, %edi
- sub $4, %ecx
- sub $4, %edx
- movl (%edi), %eax
- movl %eax, (%edx)
-
- testl $15, %edx
- jz L(bk_ssse3_cpy_pre)
-
- sub $4, %edi
- sub $4, %ecx
- sub $4, %edx
- movl (%edi), %eax
- movl %eax, (%edx)
-
- testl $15, %edx
- jz L(bk_ssse3_cpy_pre)
-
- sub $4, %edi
- sub $4, %ecx
- sub $4, %edx
- movl (%edi), %eax
- movl %eax, (%edx)
-
-L(bk_ssse3_cpy_pre):
- cmp $64, %ecx
- jb L(bk_write_more32bytes)
-
- .p2align 4
-L(bk_ssse3_cpy):
- sub $64, %edi
- sub $64, %ecx
- sub $64, %edx
- movdqu 0x30(%edi), %xmm3
- movdqa %xmm3, 0x30(%edx)
- movdqu 0x20(%edi), %xmm2
- movdqa %xmm2, 0x20(%edx)
- movdqu 0x10(%edi), %xmm1
- movdqa %xmm1, 0x10(%edx)
- movdqu (%edi), %xmm0
- movdqa %xmm0, (%edx)
- cmp $64, %ecx
- jae L(bk_ssse3_cpy)
- jmp L(bk_write_64bytesless)
-
-#endif
-
-END (MEMCPY)
diff --git a/libc/arch-x86/string/ssse3-memmove-atom.S b/libc/arch-x86/string/ssse3-memmove-atom.S
deleted file mode 100644
index 3572eac..0000000
--- a/libc/arch-x86/string/ssse3-memmove-atom.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#define MEMCPY memmove_atom
-#define USE_AS_MEMMOVE
-#include "ssse3-memcpy-atom.S"
diff --git a/libc/arch-x86/string/ssse3-strncpy-atom.S b/libc/arch-x86/string/ssse3-strncpy-atom.S
deleted file mode 100644
index 0c27ffe..0000000
--- a/libc/arch-x86/string/ssse3-strncpy-atom.S
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-Copyright (c) 2011, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define USE_AS_STRNCPY
-#define STRCPY strncpy_atom
-#include "ssse3-strcpy-atom.S"