| /* |
| Copyright (c) 2010, Intel Corporation |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| |
| * Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| |
| * Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| |
| * Neither the name of Intel Corporation nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
| ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #define FOR_ATOM |
| #include "cache.h" |
| |
| /* Symbol name given to the routine below; the guard lets an earlier definition of MEMCPY take precedence. */ |
| #if !defined (MEMCPY) |
| # define MEMCPY memcpy_atom |
| #endif /* !MEMCPY */ |
| |
| /* L(label) pastes a .L prefix onto label, producing an assembler-local symbol name. */ |
| #if !defined (L) |
| # define L(label) .L##label |
| #endif /* !L */ |
| |
| /* Short spellings for the assembler's CFI directives; every guard lets an earlier definition take precedence. */ |
| #if !defined (cfi_startproc) |
| # define cfi_startproc .cfi_startproc |
| #endif |
| |
| #if !defined (cfi_endproc) |
| # define cfi_endproc .cfi_endproc |
| #endif |
| |
| #if !defined (cfi_rel_offset) |
| # define cfi_rel_offset(r, offset) .cfi_rel_offset r, offset |
| #endif |
| |
| #if !defined (cfi_restore) |
| # define cfi_restore(r) .cfi_restore r |
| #endif |
| |
| #if !defined (cfi_adjust_cfa_offset) |
| # define cfi_adjust_cfa_offset(delta) .cfi_adjust_cfa_offset delta |
| #endif |
| |
| /* ENTRY(name): open a function -- export the symbol, mark it as a function, align it to 16 bytes, place the label and start its CFI region. */ |
| #if !defined (ENTRY) |
| # define ENTRY(name) \ |
| .globl name; \ |
| .type name, @function; \ |
| .p2align 4; \ |
| name: \ |
| cfi_startproc |
| #endif /* !ENTRY */ |
| |
| /* END(name): close the CFI region opened by ENTRY and record the symbol size as the distance from name to here. */ |
| #if !defined (END) |
| # define END(name) \ |
| cfi_endproc; \ |
| .size name, . - name |
| #endif /* !END */ |
| |
| #define DEST PARMS /* %esp-relative offset of the first stack argument; loaded into %edx, the store pointer */ |
| #define SRC DEST+4 /* second argument, 4 bytes above DEST; loaded into %eax, the load pointer */ |
| #define LEN SRC+4 /* third argument, 4 bytes above SRC; loaded into %ecx, the byte count */ |
| |
| /* Unwind bookkeeping for a 4-byte push of reg: the CFA offset grows by 4 and reg is recorded as saved at offset 0. */ |
| #define CFI_PUSH(reg) cfi_adjust_cfa_offset (4); cfi_rel_offset (reg, 0) |
| |
| /* Unwind bookkeeping for the matching pop: the CFA offset shrinks by 4 and reg is marked as restored. */ |
| #define CFI_POP(reg) cfi_adjust_cfa_offset (-4); cfi_restore (reg) |
| |
| /* The real push/pop instruction followed by its CFI annotation. */ |
| #define PUSH(reg) pushl reg; CFI_PUSH (reg) |
| #define POP(reg) popl reg; CFI_POP (reg) |
| |
| #if (defined SHARED || defined __PIC__) /* position-independent build: jump tables hold offsets and EBX serves as the base register */ |
| # define PARMS 8 /* Preserve EBX: the return address and the saved EBX both sit below the arguments. */ |
| # define ENTRANCE PUSH (%ebx); /* save EBX on entry */ |
| # define RETURN_END POP (%ebx); ret /* restore EBX and return */ |
| # define RETURN RETURN_END; CFI_PUSH (%ebx) /* return, then restate EBX as pushed in the CFI for whatever code follows the ret */ |
| # define JMPTBL(I, B) I - B /* table entry = offset of label I from table base B */ |
| |
| # define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x /* x is substituted into the thunk name, e.g. bx gives __x86.get_pc_thunk.bx; the thunk is defined elsewhere */ |
| |
| /* Load an entry in a jump table into EBX and branch to it. TABLE is a |
| jump table with relative offsets. INDEX is a register that contains the |
| index into the jump table. SCALE is the scale of INDEX. */ |
| |
| # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ |
| /* We first load PC into EBX. */ \ |
| SETUP_PIC_REG(bx); \ |
| /* Get the address of the jump table. */ \ |
| addl $(TABLE - .), %ebx; \ |
| /* Get the entry and convert the relative offset to the \ |
| absolute address. */ \ |
| addl (%ebx, INDEX, SCALE), %ebx; \ |
| /* EBX now holds the target address. Go. */ \ |
| jmp *%ebx |
| #else |
| |
| # define PARMS 4 /* only the return address sits below the arguments */ |
| # define ENTRANCE /* nothing is saved on entry */ |
| # define RETURN_END ret |
| # define RETURN RETURN_END |
| # define JMPTBL(I, B) I /* table entry = address of label I; B is unused */ |
| |
| /* Branch to an entry in a jump table. TABLE is a jump table with |
| absolute offsets. INDEX is a register that contains the index into the |
| jump table. SCALE is the scale of INDEX. */ |
| |
| # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ |
| jmp *TABLE(, INDEX, SCALE) |
| #endif |
| |
| .section .text.ssse3,"ax",@progbits |
| ENTRY (MEMCPY) /* Entry point: read the three stack arguments, then pick a path from the byte count (and, for memmove, from the pointer order). */ |
| ENTRANCE /* pushes EBX in PIC builds, expands to nothing otherwise */ |
| movl LEN(%esp), %ecx /* ecx = byte count */ |
| movl SRC(%esp), %eax /* eax = source pointer */ |
| movl DEST(%esp), %edx /* edx = destination pointer */ |
| |
| #ifdef USE_AS_MEMMOVE |
| cmp %eax, %edx /* flags from dest - src */ |
| jb L(copy_forward) /* dest below src (unsigned): copy forward */ |
| je L(fwd_write_0bytes) /* dest == src; the target label is outside this view */ |
| cmp $32, %ecx |
| jae L(memmove_bwd) /* 32 bytes or more: check for overlap first */ |
| jmp L(bk_write_less32bytes_2) /* under 32 bytes with dest above src; the target label is outside this view */ |
| |
| .p2align 4 |
| L(memmove_bwd): |
| add %ecx, %eax /* eax = src + len */ |
| cmp %eax, %edx /* flags from dest - (src + len) */ |
| movl SRC(%esp), %eax /* reload src; mov leaves the flags untouched */ |
| jb L(copy_backward) /* dest lies inside [src, src + len): take the backward path (outside this view) */ |
| |
| L(copy_forward): |
| #endif |
| cmp $48, %ecx |
| jae L(48bytesormore) /* 48 bytes or more take the SSE path */ |
| |
| L(fwd_write_less32bytes): |
| #ifndef USE_AS_MEMMOVE |
| cmp %dl, %al /* compares only the low bytes of src and dest */ |
| jb L(bk_write) /* low byte of src below low byte of dest (unsigned) */ |
| #endif |
| add %ecx, %edx /* edx = one past the last destination byte */ |
| add %ecx, %eax /* eax = one past the last source byte */ |
| BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* dispatch on the byte count; the table is outside this view */ |
| #ifndef USE_AS_MEMMOVE |
| .p2align 4 |
| L(bk_write): |
| BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) /* same dispatch, but eax/edx still point at the start of the buffers */ |
| #endif |
| |
| .p2align 4 |
| L(48bytesormore): /* Handle the first 16 bytes unaligned, round dest up to a 16-byte boundary, then dispatch on size and on the low 4 bits of src. */ |
| #ifndef USE_AS_MEMMOVE |
| movlpd (%eax), %xmm0 /* copy the first 16 bytes as two 8-byte halves */ |
| movlpd 8(%eax), %xmm1 |
| movlpd %xmm0, (%edx) |
| movlpd %xmm1, 8(%edx) |
| #else |
| movdqu (%eax), %xmm0 /* only load the first 16 bytes here; the shl_N handlers store xmm0 later */ |
| #endif |
| PUSH (%edi) /* edi is used as scratch from here on */ |
| movl %edx, %edi /* edi = original dest */ |
| and $-16, %edx |
| add $16, %edx /* edx = next 16-byte boundary strictly above dest */ |
| sub %edx, %edi /* edi = dest - aligned dest, a value in -16..-1 */ |
| add %edi, %ecx /* remove the skipped head bytes from the count */ |
| sub %edi, %eax /* advance src by the same number of bytes */ |
| |
| #ifdef SHARED_CACHE_SIZE_HALF |
| cmp $SHARED_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = base used by the @GOTOFF reference below */ |
| cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_shared_cache_size_half, %ecx |
| # endif |
| #endif |
| |
| mov %eax, %edi /* mov does not disturb the flags set by the cmp above */ |
| jae L(large_page) /* count at or above the shared-cache threshold; the target label is outside this view */ |
| and $0xf, %edi /* edi = src & 15 */ |
| jz L(shl_0) /* src is 16-byte aligned as well */ |
| BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) /* index the table (outside this view) by the source misalignment */ |
| |
| .p2align 4 |
| L(shl_0): /* Reached when src & 15 == 0 after dest was aligned: copy with aligned 16-byte moves. */ |
| #ifdef USE_AS_MEMMOVE |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movdqu %xmm0, (%edi) /* store the first 16 bytes loaded at L(48bytesormore) */ |
| #endif |
| xor %edi, %edi /* edi = running byte offset */ |
| cmp $127, %ecx |
| ja L(shl_0_gobble) /* 128 bytes or more use the 128-byte loops */ |
| lea -32(%ecx), %ecx /* bias the count so the sub below borrows once fewer than 32 bytes remain */ |
| |
| .p2align 4 |
| L(shl_0_loop): /* unrolled: up to four 32-byte chunks */ |
| movdqa (%eax, %edi), %xmm0 |
| movdqa 16(%eax, %edi), %xmm1 |
| sub $32, %ecx /* sets the flags for the jb below; the moves and lea in between do not write flags */ |
| movdqa %xmm0, (%edx, %edi) |
| movdqa %xmm1, 16(%edx, %edi) |
| lea 32(%edi), %edi |
| jb L(shl_0_end) |
| |
| movdqa (%eax, %edi), %xmm0 |
| movdqa 16(%eax, %edi), %xmm1 |
| sub $32, %ecx |
| movdqa %xmm0, (%edx, %edi) |
| movdqa %xmm1, 16(%edx, %edi) |
| lea 32(%edi), %edi |
| jb L(shl_0_end) |
| |
| movdqa (%eax, %edi), %xmm0 |
| movdqa 16(%eax, %edi), %xmm1 |
| sub $32, %ecx |
| movdqa %xmm0, (%edx, %edi) |
| movdqa %xmm1, 16(%edx, %edi) |
| lea 32(%edi), %edi |
| jb L(shl_0_end) |
| |
| movdqa (%eax, %edi), %xmm0 |
| movdqa 16(%eax, %edi), %xmm1 |
| sub $32, %ecx |
| movdqa %xmm0, (%edx, %edi) |
| movdqa %xmm1, 16(%edx, %edi) |
| lea 32(%edi), %edi /* the fourth chunk has no exit test and falls through */ |
| |
| L(shl_0_end): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi /* edi = bytes copied so far + bytes still to copy */ |
| add %edi, %edx /* edx = one past the last destination byte */ |
| add %edi, %eax /* eax = one past the last source byte */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: L(shl_0_gobble) below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_0_gobble): /* Aligned copy of 128 bytes or more: choose between the plain loop and the prefetching loop by comparing the count with the data-cache threshold. */ |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = base used by the @GOTOFF reference below */ |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| POP (%edi) /* edi is not needed below; pop and lea leave the cmp flags intact */ |
| lea -128(%ecx), %ecx /* bias the count by one 128-byte iteration */ |
| jae L(shl_0_gobble_mem_loop) /* count at or above the threshold: use the prefetching loop */ |
| |
| .p2align 4 |
| L(shl_0_gobble_cache_loop): /* 128 bytes per iteration through xmm0-xmm7 */ |
| movdqa (%eax), %xmm0 |
| movdqa 0x10(%eax), %xmm1 |
| movdqa 0x20(%eax), %xmm2 |
| movdqa 0x30(%eax), %xmm3 |
| movdqa 0x40(%eax), %xmm4 |
| movdqa 0x50(%eax), %xmm5 |
| movdqa 0x60(%eax), %xmm6 |
| movdqa 0x70(%eax), %xmm7 |
| lea 0x80(%eax), %eax |
| sub $128, %ecx /* sets the flags for the jae below; the stores and lea do not write flags */ |
| movdqa %xmm0, (%edx) |
| movdqa %xmm1, 0x10(%edx) |
| movdqa %xmm2, 0x20(%edx) |
| movdqa %xmm3, 0x30(%edx) |
| movdqa %xmm4, 0x40(%edx) |
| movdqa %xmm5, 0x50(%edx) |
| movdqa %xmm6, 0x60(%edx) |
| movdqa %xmm7, 0x70(%edx) |
| lea 0x80(%edx), %edx |
| |
| jae L(shl_0_gobble_cache_loop) /* no borrow: at least another 128 bytes remain */ |
| cmp $-0x40, %ecx /* biased count against -64, i.e. remaining bytes against 64 */ |
| lea 0x80(%ecx), %ecx /* undo the bias: ecx = bytes still to copy; lea keeps the cmp flags */ |
| jl L(shl_0_cache_less_64bytes) |
| |
| movdqa (%eax), %xmm0 /* one 64-byte step */ |
| sub $0x40, %ecx |
| movdqa 0x10(%eax), %xmm1 |
| movdqa %xmm0, (%edx) |
| movdqa %xmm1, 0x10(%edx) |
| movdqa 0x20(%eax), %xmm0 |
| movdqa 0x30(%eax), %xmm1 |
| add $0x40, %eax |
| movdqa %xmm0, 0x20(%edx) |
| movdqa %xmm1, 0x30(%edx) |
| add $0x40, %edx |
| |
| L(shl_0_cache_less_64bytes): |
| cmp $0x20, %ecx |
| jb L(shl_0_cache_less_32bytes) |
| movdqa (%eax), %xmm0 /* one 32-byte step */ |
| sub $0x20, %ecx |
| movdqa 0x10(%eax), %xmm1 |
| add $0x20, %eax |
| movdqa %xmm0, (%edx) |
| movdqa %xmm1, 0x10(%edx) |
| add $0x20, %edx |
| |
| L(shl_0_cache_less_32bytes): |
| cmp $0x10, %ecx |
| jb L(shl_0_cache_less_16bytes) |
| sub $0x10, %ecx /* one 16-byte step */ |
| movdqa (%eax), %xmm0 |
| add $0x10, %eax |
| movdqa %xmm0, (%edx) |
| add $0x10, %edx |
| |
| L(shl_0_cache_less_16bytes): |
| add %ecx, %edx /* edx = one past the last destination byte */ |
| add %ecx, %eax /* eax = one past the last source byte */ |
| BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| .p2align 4 |
| L(shl_0_gobble_mem_loop): /* Aligned copy, 128 bytes per iteration with software prefetch; entered from L(shl_0_gobble) after edi has been popped, with ecx biased by -128. */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x280(%eax) |
| prefetcht0 0x1c0(%edx) |
| |
| movdqa (%eax), %xmm0 |
| movdqa 0x10(%eax), %xmm1 |
| movdqa 0x20(%eax), %xmm2 |
| movdqa 0x30(%eax), %xmm3 |
| movdqa 0x40(%eax), %xmm4 |
| movdqa 0x50(%eax), %xmm5 |
| movdqa 0x60(%eax), %xmm6 |
| movdqa 0x70(%eax), %xmm7 |
| lea 0x80(%eax), %eax |
| sub $0x80, %ecx /* sets the flags for the jae below; the stores and lea do not write flags */ |
| movdqa %xmm0, (%edx) |
| movdqa %xmm1, 0x10(%edx) |
| movdqa %xmm2, 0x20(%edx) |
| movdqa %xmm3, 0x30(%edx) |
| movdqa %xmm4, 0x40(%edx) |
| movdqa %xmm5, 0x50(%edx) |
| movdqa %xmm6, 0x60(%edx) |
| movdqa %xmm7, 0x70(%edx) |
| lea 0x80(%edx), %edx |
| |
| jae L(shl_0_gobble_mem_loop) /* no borrow: at least another 128 bytes remain */ |
| cmp $-0x40, %ecx /* biased count against -64, i.e. remaining bytes against 64 */ |
| lea 0x80(%ecx), %ecx /* undo the bias: ecx = bytes still to copy; lea keeps the cmp flags */ |
| jl L(shl_0_mem_less_64bytes) |
| |
| movdqa (%eax), %xmm0 /* one 64-byte step */ |
| sub $0x40, %ecx |
| movdqa 0x10(%eax), %xmm1 |
| |
| movdqa %xmm0, (%edx) |
| movdqa %xmm1, 0x10(%edx) |
| |
| movdqa 0x20(%eax), %xmm0 |
| movdqa 0x30(%eax), %xmm1 |
| add $0x40, %eax |
| |
| movdqa %xmm0, 0x20(%edx) |
| movdqa %xmm1, 0x30(%edx) |
| add $0x40, %edx |
| |
| L(shl_0_mem_less_64bytes): |
| cmp $0x20, %ecx |
| jb L(shl_0_mem_less_32bytes) |
| movdqa (%eax), %xmm0 /* one 32-byte step */ |
| sub $0x20, %ecx |
| movdqa 0x10(%eax), %xmm1 |
| add $0x20, %eax |
| movdqa %xmm0, (%edx) |
| movdqa %xmm1, 0x10(%edx) |
| add $0x20, %edx |
| |
| L(shl_0_mem_less_32bytes): |
| cmp $0x10, %ecx |
| jb L(shl_0_mem_less_16bytes) |
| sub $0x10, %ecx /* one 16-byte step */ |
| movdqa (%eax), %xmm0 |
| add $0x10, %eax |
| movdqa %xmm0, (%edx) |
| add $0x10, %edx |
| |
| L(shl_0_mem_less_16bytes): |
| add %ecx, %edx /* edx = one past the last destination byte */ |
| add %ecx, %eax /* eax = one past the last source byte */ |
| BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| /* CFI only, no instruction is emitted: this section ran after the POP (%edi) in L(shl_0_gobble), but L(shl_1) below is entered from the L(shl_table) dispatch with edi still pushed (it ends in POP (%edi)). Restate the push so the unwind information matches the stack again. */ |
| CFI_PUSH (%edi) |
| |
| .p2align 4 |
| L(shl_1): /* Forward copy built from aligned loads at src - 1 that are recombined with palignr $1; presumably selected through L(shl_table) (outside this view) when src & 15 == 1. */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -1(%eax), %xmm1 /* xmm1 = 16-byte block at src - 1, the first carry */ |
| #else |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movaps -1(%eax), %xmm1 /* xmm1 = 16-byte block at src - 1, the first carry */ |
| movdqu %xmm0, (%edi) /* store the 16 head bytes held in xmm0 */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_1_no_prefetch) /* count below the data-cache threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */ |
| |
| .p2align 4 |
| L(Shl1LoopStart): /* 64 bytes per iteration, with prefetch */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 15(%eax), %xmm2 /* the four 16-byte blocks that follow the carry block */ |
| movaps 31(%eax), %xmm3 |
| movaps 47(%eax), %xmm4 |
| movaps 63(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* keep the last block as the next carry */ |
| palignr $1, %xmm4, %xmm5 /* palignr: concatenate dst:src, shift right by 1 byte, keep the low 16 bytes */ |
| palignr $1, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $1, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $1, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl1LoopStart) /* continue while the biased count is still above zero (unsigned, no borrow) */ |
| |
| L(Shl1LoopLeave): |
| add $32, %ecx |
| jle L(shl_end_0) /* signed result <= 0: leave through L(shl_end_0), which is outside this view */ |
| |
| movaps 15(%eax), %xmm2 /* one more 32-byte step */ |
| movaps 31(%eax), %xmm3 |
| palignr $1, %xmm2, %xmm3 |
| palignr $1, %xmm1, %xmm2 |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* skip the 32 bytes just stored plus the ecx bytes still to copy */ |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the code below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(sh_1_no_prefetch): |
| lea -32(%ecx), %ecx /* bias the count by one 32-byte step */ |
| lea -1(%eax), %eax /* eax = src - 1, the address the movaps above loaded from */ |
| xor %edi, %edi /* edi = running byte offset */ |
| |
| .p2align 4 |
| L(sh_1_no_prefetch_loop): /* two 32-byte steps per pass; the carry alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags for the jb below; nothing in between writes flags */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* carry for the second step */ |
| palignr $1, %xmm2, %xmm3 |
| palignr $1, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_1_end_no_prefetch_loop) /* borrow: fewer than 32 bytes remain */ |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* carry for the next pass */ |
| palignr $1, %xmm2, %xmm3 |
| palignr $1, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_1_no_prefetch_loop) |
| |
| L(sh_1_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi |
| add %edi, %edx /* edx = one past the last destination byte */ |
| lea 1(%edi, %eax), %eax /* eax = one past the last source byte; the +1 undoes the lea -1 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the next section is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_2): /* Forward copy built from aligned loads at src - 2 that are recombined with palignr $2; presumably selected through L(shl_table) (outside this view) when src & 15 == 2. */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -2(%eax), %xmm1 /* xmm1 = 16-byte block at src - 2, the first carry */ |
| #else |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movaps -2(%eax), %xmm1 /* xmm1 = 16-byte block at src - 2, the first carry */ |
| movdqu %xmm0, (%edi) /* store the 16 head bytes held in xmm0 */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_2_no_prefetch) /* count below the data-cache threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */ |
| |
| .p2align 4 |
| L(Shl2LoopStart): /* 64 bytes per iteration, with prefetch */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 14(%eax), %xmm2 /* the four 16-byte blocks that follow the carry block */ |
| movaps 30(%eax), %xmm3 |
| movaps 46(%eax), %xmm4 |
| movaps 62(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* keep the last block as the next carry */ |
| palignr $2, %xmm4, %xmm5 /* palignr: concatenate dst:src, shift right by 2 bytes, keep the low 16 bytes */ |
| palignr $2, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $2, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $2, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl2LoopStart) /* continue while the biased count is still above zero (unsigned, no borrow) */ |
| |
| L(Shl2LoopLeave): |
| add $32, %ecx |
| jle L(shl_end_0) /* signed result <= 0: leave through L(shl_end_0), which is outside this view */ |
| |
| movaps 14(%eax), %xmm2 /* one more 32-byte step */ |
| movaps 30(%eax), %xmm3 |
| palignr $2, %xmm2, %xmm3 |
| palignr $2, %xmm1, %xmm2 |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* skip the 32 bytes just stored plus the ecx bytes still to copy */ |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the code below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(sh_2_no_prefetch): |
| lea -32(%ecx), %ecx /* bias the count by one 32-byte step */ |
| lea -2(%eax), %eax /* eax = src - 2, the address the movaps above loaded from */ |
| xor %edi, %edi /* edi = running byte offset */ |
| |
| .p2align 4 |
| L(sh_2_no_prefetch_loop): /* two 32-byte steps per pass; the carry alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags for the jb below; nothing in between writes flags */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* carry for the second step */ |
| palignr $2, %xmm2, %xmm3 |
| palignr $2, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_2_end_no_prefetch_loop) /* borrow: fewer than 32 bytes remain */ |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* carry for the next pass */ |
| palignr $2, %xmm2, %xmm3 |
| palignr $2, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_2_no_prefetch_loop) |
| |
| L(sh_2_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi |
| add %edi, %edx /* edx = one past the last destination byte */ |
| lea 2(%edi, %eax), %eax /* eax = one past the last source byte; the +2 undoes the lea -2 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the next section is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_3): /* Forward copy built from aligned loads at src - 3 that are recombined with palignr $3; presumably selected through L(shl_table) (outside this view) when src & 15 == 3. */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -3(%eax), %xmm1 /* xmm1 = 16-byte block at src - 3, the first carry */ |
| #else |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movaps -3(%eax), %xmm1 /* xmm1 = 16-byte block at src - 3, the first carry */ |
| movdqu %xmm0, (%edi) /* store the 16 head bytes held in xmm0 */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_3_no_prefetch) /* count below the data-cache threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */ |
| |
| .p2align 4 |
| L(Shl3LoopStart): /* 64 bytes per iteration, with prefetch */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 13(%eax), %xmm2 /* the four 16-byte blocks that follow the carry block */ |
| movaps 29(%eax), %xmm3 |
| movaps 45(%eax), %xmm4 |
| movaps 61(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* keep the last block as the next carry */ |
| palignr $3, %xmm4, %xmm5 /* palignr: concatenate dst:src, shift right by 3 bytes, keep the low 16 bytes */ |
| palignr $3, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $3, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $3, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl3LoopStart) /* continue while the biased count is still above zero (unsigned, no borrow) */ |
| |
| L(Shl3LoopLeave): |
| add $32, %ecx |
| jle L(shl_end_0) /* signed result <= 0: leave through L(shl_end_0), which is outside this view */ |
| |
| movaps 13(%eax), %xmm2 /* one more 32-byte step */ |
| movaps 29(%eax), %xmm3 |
| palignr $3, %xmm2, %xmm3 |
| palignr $3, %xmm1, %xmm2 |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* skip the 32 bytes just stored plus the ecx bytes still to copy */ |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the code below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(sh_3_no_prefetch): |
| lea -32(%ecx), %ecx /* bias the count by one 32-byte step */ |
| lea -3(%eax), %eax /* eax = src - 3, the address the movaps above loaded from */ |
| xor %edi, %edi /* edi = running byte offset */ |
| |
| .p2align 4 |
| L(sh_3_no_prefetch_loop): /* two 32-byte steps per pass; the carry alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags for the jb below; nothing in between writes flags */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* carry for the second step */ |
| palignr $3, %xmm2, %xmm3 |
| palignr $3, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| |
| jb L(sh_3_end_no_prefetch_loop) /* borrow: fewer than 32 bytes remain */ |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* carry for the next pass */ |
| palignr $3, %xmm2, %xmm3 |
| palignr $3, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| |
| jae L(sh_3_no_prefetch_loop) |
| |
| L(sh_3_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi |
| add %edi, %edx /* edx = one past the last destination byte */ |
| lea 3(%edi, %eax), %eax /* eax = one past the last source byte; the +3 undoes the lea -3 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the next section is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_4): /* Forward copy built from aligned loads at src - 4 that are recombined with palignr $4; presumably selected through L(shl_table) (outside this view) when src & 15 == 4. */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -4(%eax), %xmm1 /* xmm1 = 16-byte block at src - 4, the first carry */ |
| #else |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movaps -4(%eax), %xmm1 /* xmm1 = 16-byte block at src - 4, the first carry */ |
| movdqu %xmm0, (%edi) /* store the 16 head bytes held in xmm0 */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_4_no_prefetch) /* count below the data-cache threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */ |
| |
| .p2align 4 |
| L(Shl4LoopStart): /* 64 bytes per iteration, with prefetch */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 12(%eax), %xmm2 /* the four 16-byte blocks that follow the carry block */ |
| movaps 28(%eax), %xmm3 |
| movaps 44(%eax), %xmm4 |
| movaps 60(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* keep the last block as the next carry */ |
| palignr $4, %xmm4, %xmm5 /* palignr: concatenate dst:src, shift right by 4 bytes, keep the low 16 bytes */ |
| palignr $4, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $4, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $4, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl4LoopStart) /* continue while the biased count is still above zero (unsigned, no borrow) */ |
| |
| L(Shl4LoopLeave): |
| add $32, %ecx |
| jle L(shl_end_0) /* signed result <= 0: leave through L(shl_end_0), which is outside this view */ |
| |
| movaps 12(%eax), %xmm2 /* one more 32-byte step */ |
| movaps 28(%eax), %xmm3 |
| palignr $4, %xmm2, %xmm3 |
| palignr $4, %xmm1, %xmm2 |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* skip the 32 bytes just stored plus the ecx bytes still to copy */ |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the code below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(sh_4_no_prefetch): |
| lea -32(%ecx), %ecx /* bias the count by one 32-byte step */ |
| lea -4(%eax), %eax /* eax = src - 4, the address the movaps above loaded from */ |
| xor %edi, %edi /* edi = running byte offset */ |
| |
| .p2align 4 |
| L(sh_4_no_prefetch_loop): /* two 32-byte steps per pass; the carry alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags for the jb below; nothing in between writes flags */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* carry for the second step */ |
| palignr $4, %xmm2, %xmm3 |
| palignr $4, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| |
| jb L(sh_4_end_no_prefetch_loop) /* borrow: fewer than 32 bytes remain */ |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* carry for the next pass */ |
| palignr $4, %xmm2, %xmm3 |
| palignr $4, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| |
| jae L(sh_4_no_prefetch_loop) |
| |
| L(sh_4_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi |
| add %edi, %edx /* edx = one past the last destination byte */ |
| lea 4(%edi, %eax), %eax /* eax = one past the last source byte; the +4 undoes the lea -4 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the next section is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_5): /* Forward copy built from aligned loads at src - 5 that are recombined with palignr $5; presumably selected through L(shl_table) (outside this view) when src & 15 == 5. */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -5(%eax), %xmm1 /* xmm1 = 16-byte block at src - 5, the first carry */ |
| #else |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movaps -5(%eax), %xmm1 /* xmm1 = 16-byte block at src - 5, the first carry */ |
| movdqu %xmm0, (%edi) /* store the 16 head bytes held in xmm0 */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_5_no_prefetch) /* count below the data-cache threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */ |
| |
| .p2align 4 |
| L(Shl5LoopStart): /* 64 bytes per iteration, with prefetch */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 11(%eax), %xmm2 /* the four 16-byte blocks that follow the carry block */ |
| movaps 27(%eax), %xmm3 |
| movaps 43(%eax), %xmm4 |
| movaps 59(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* keep the last block as the next carry */ |
| palignr $5, %xmm4, %xmm5 /* palignr: concatenate dst:src, shift right by 5 bytes, keep the low 16 bytes */ |
| palignr $5, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $5, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $5, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl5LoopStart) /* continue while the biased count is still above zero (unsigned, no borrow) */ |
| |
| L(Shl5LoopLeave): |
| add $32, %ecx |
| jle L(shl_end_0) /* signed result <= 0: leave through L(shl_end_0), which is outside this view */ |
| |
| movaps 11(%eax), %xmm2 /* one more 32-byte step */ |
| movaps 27(%eax), %xmm3 |
| palignr $5, %xmm2, %xmm3 |
| palignr $5, %xmm1, %xmm2 |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* skip the 32 bytes just stored plus the ecx bytes still to copy */ |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the code below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(sh_5_no_prefetch): |
| lea -32(%ecx), %ecx /* bias the count by one 32-byte step */ |
| lea -5(%eax), %eax /* eax = src - 5, the address the movaps above loaded from */ |
| xor %edi, %edi /* edi = running byte offset */ |
| |
| .p2align 4 |
| L(sh_5_no_prefetch_loop): /* two 32-byte steps per pass; the carry alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags for the jb below; nothing in between writes flags */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* carry for the second step */ |
| palignr $5, %xmm2, %xmm3 |
| palignr $5, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| |
| jb L(sh_5_end_no_prefetch_loop) /* borrow: fewer than 32 bytes remain */ |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* carry for the next pass */ |
| palignr $5, %xmm2, %xmm3 |
| palignr $5, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| |
| jae L(sh_5_no_prefetch_loop) |
| |
| L(sh_5_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi |
| add %edi, %edx /* edx = one past the last destination byte */ |
| lea 5(%edi, %eax), %eax /* eax = one past the last source byte; the +5 undoes the lea -5 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the next section is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_6): /* Forward copy built from aligned loads at src - 6 that are recombined with palignr $6; presumably selected through L(shl_table) (outside this view) when src & 15 == 6. */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -6(%eax), %xmm1 /* xmm1 = 16-byte block at src - 6, the first carry */ |
| #else |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movaps -6(%eax), %xmm1 /* xmm1 = 16-byte block at src - 6, the first carry */ |
| movdqu %xmm0, (%edi) /* store the 16 head bytes held in xmm0 */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_6_no_prefetch) /* count below the data-cache threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */ |
| |
| .p2align 4 |
| L(Shl6LoopStart): /* 64 bytes per iteration, with prefetch */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 10(%eax), %xmm2 /* the four 16-byte blocks that follow the carry block */ |
| movaps 26(%eax), %xmm3 |
| movaps 42(%eax), %xmm4 |
| movaps 58(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* keep the last block as the next carry */ |
| palignr $6, %xmm4, %xmm5 /* palignr: concatenate dst:src, shift right by 6 bytes, keep the low 16 bytes */ |
| palignr $6, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $6, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $6, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl6LoopStart) /* continue while the biased count is still above zero (unsigned, no borrow) */ |
| |
| L(Shl6LoopLeave): |
| add $32, %ecx |
| jle L(shl_end_0) /* signed result <= 0: leave through L(shl_end_0), which is outside this view */ |
| |
| movaps 10(%eax), %xmm2 /* one more 32-byte step */ |
| movaps 26(%eax), %xmm3 |
| palignr $6, %xmm2, %xmm3 |
| palignr $6, %xmm1, %xmm2 |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* skip the 32 bytes just stored plus the ecx bytes still to copy */ |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the code below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(sh_6_no_prefetch): |
| lea -32(%ecx), %ecx /* bias the count by one 32-byte step */ |
| lea -6(%eax), %eax /* eax = src - 6, the address the movaps above loaded from */ |
| xor %edi, %edi /* edi = running byte offset */ |
| |
| .p2align 4 |
| L(sh_6_no_prefetch_loop): /* two 32-byte steps per pass; the carry alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags for the jb below; nothing in between writes flags */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* carry for the second step */ |
| palignr $6, %xmm2, %xmm3 |
| palignr $6, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| |
| jb L(sh_6_end_no_prefetch_loop) /* borrow: fewer than 32 bytes remain */ |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* carry for the next pass */ |
| palignr $6, %xmm2, %xmm3 |
| palignr $6, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| |
| jae L(sh_6_no_prefetch_loop) |
| |
| L(sh_6_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi |
| add %edi, %edx /* edx = one past the last destination byte */ |
| lea 6(%edi, %eax), %eax /* eax = one past the last source byte; the +6 undoes the lea -6 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the next section is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_7): /* Forward copy built from aligned loads at src - 7 that are recombined with palignr $7; presumably selected through L(shl_table) (outside this view) when src & 15 == 7. */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -7(%eax), %xmm1 /* xmm1 = 16-byte block at src - 7, the first carry */ |
| #else |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movaps -7(%eax), %xmm1 /* xmm1 = 16-byte block at src - 7, the first carry */ |
| movdqu %xmm0, (%edi) /* store the 16 head bytes held in xmm0 */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_7_no_prefetch) /* count below the data-cache threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */ |
| |
| .p2align 4 |
| L(Shl7LoopStart): /* 64 bytes per iteration, with prefetch */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 9(%eax), %xmm2 /* the four 16-byte blocks that follow the carry block */ |
| movaps 25(%eax), %xmm3 |
| movaps 41(%eax), %xmm4 |
| movaps 57(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* keep the last block as the next carry */ |
| palignr $7, %xmm4, %xmm5 /* palignr: concatenate dst:src, shift right by 7 bytes, keep the low 16 bytes */ |
| palignr $7, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $7, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $7, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl7LoopStart) /* continue while the biased count is still above zero (unsigned, no borrow) */ |
| |
| L(Shl7LoopLeave): |
| add $32, %ecx |
| jle L(shl_end_0) /* signed result <= 0: leave through L(shl_end_0), which is outside this view */ |
| |
| movaps 9(%eax), %xmm2 /* one more 32-byte step */ |
| movaps 25(%eax), %xmm3 |
| palignr $7, %xmm2, %xmm3 |
| palignr $7, %xmm1, %xmm2 |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* skip the 32 bytes just stored plus the ecx bytes still to copy */ |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the code below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(sh_7_no_prefetch): |
| lea -32(%ecx), %ecx /* bias the count by one 32-byte step */ |
| lea -7(%eax), %eax /* eax = src - 7, the address the movaps above loaded from */ |
| xor %edi, %edi /* edi = running byte offset */ |
| |
| .p2align 4 |
| L(sh_7_no_prefetch_loop): /* two 32-byte steps per pass; the carry alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags for the jb below; nothing in between writes flags */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* carry for the second step */ |
| palignr $7, %xmm2, %xmm3 |
| palignr $7, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_7_end_no_prefetch_loop) /* borrow: fewer than 32 bytes remain */ |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* carry for the next pass */ |
| palignr $7, %xmm2, %xmm3 |
| palignr $7, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_7_no_prefetch_loop) |
| |
| L(sh_7_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi |
| add %edi, %edx /* edx = one past the last destination byte */ |
| lea 7(%edi, %eax), %eax /* eax = one past the last source byte; the +7 undoes the lea -7 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the next section is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_8): /* Forward copy built from aligned loads at src - 8 that are recombined with palignr $8; presumably selected through L(shl_table) (outside this view) when src & 15 == 8. */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -8(%eax), %xmm1 /* xmm1 = 16-byte block at src - 8, the first carry */ |
| #else |
| movl DEST+4(%esp), %edi /* original dest; the +4 skips the saved edi */ |
| movaps -8(%eax), %xmm1 /* xmm1 = 16-byte block at src - 8, the first carry */ |
| movdqu %xmm0, (%edi) /* store the 16 head bytes held in xmm0 */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_8_no_prefetch) /* count below the data-cache threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by one 64-byte iteration */ |
| |
| .p2align 4 |
| L(Shl8LoopStart): /* 64 bytes per iteration, with prefetch */ |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 8(%eax), %xmm2 /* the four 16-byte blocks that follow the carry block */ |
| movaps 24(%eax), %xmm3 |
| movaps 40(%eax), %xmm4 |
| movaps 56(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* keep the last block as the next carry */ |
| palignr $8, %xmm4, %xmm5 /* palignr: concatenate dst:src, shift right by 8 bytes, keep the low 16 bytes */ |
| palignr $8, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $8, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $8, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl8LoopStart) /* continue while the biased count is still above zero (unsigned, no borrow) */ |
| |
| L(LoopLeave8): /* same role as the ShlNLoopLeave labels of the sibling sections, spelled differently */ |
| add $32, %ecx |
| jle L(shl_end_0) /* signed result <= 0: leave through L(shl_end_0), which is outside this view */ |
| |
| movaps 8(%eax), %xmm2 /* one more 32-byte step */ |
| movaps 24(%eax), %xmm3 |
| palignr $8, %xmm2, %xmm3 |
| palignr $8, %xmm1, %xmm2 |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* skip the 32 bytes just stored plus the ecx bytes still to copy */ |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the code below is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(sh_8_no_prefetch): |
| lea -32(%ecx), %ecx /* bias the count by one 32-byte step */ |
| lea -8(%eax), %eax /* eax = src - 8, the address the movaps above loaded from */ |
| xor %edi, %edi /* edi = running byte offset */ |
| |
| .p2align 4 |
| L(sh_8_no_prefetch_loop): /* two 32-byte steps per pass; the carry alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags for the jb below; nothing in between writes flags */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* carry for the second step */ |
| palignr $8, %xmm2, %xmm3 |
| palignr $8, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_8_end_no_prefetch_loop) /* borrow: fewer than 32 bytes remain */ |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* carry for the next pass */ |
| palignr $8, %xmm2, %xmm3 |
| palignr $8, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_8_no_prefetch_loop) |
| |
| L(sh_8_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the bias: ecx = bytes still to copy */ |
| add %ecx, %edi |
| add %edi, %edx /* edx = one past the last destination byte */ |
| lea 8(%edi, %eax), %eax /* eax = one past the last source byte; the +8 undoes the lea -8 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* tail dispatch on ecx; the table is outside this view */ |
| |
| CFI_PUSH (%edi) /* CFI only: the next section is entered with edi still pushed */ |
| |
| .p2align 4 |
| L(shl_9): |
| #ifndef USE_AS_MEMMOVE |
| movaps -9(%eax), %xmm1 |
| #else |
| movl DEST+4(%esp), %edi |
| movaps -9(%eax), %xmm1 |
| movdqu %xmm0, (%edi) |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
| # else |
| cmp __x86_data_cache_size_half, %ecx |
| # endif |
| #endif |
| jb L(sh_9_no_prefetch) |
| |
| lea -64(%ecx), %ecx |
| |
| .p2align 4 |
| L(Shl9LoopStart): |
| prefetcht0 0x1c0(%eax) |
| prefetcht0 0x1c0(%edx) |
| movaps 7(%eax), %xmm2 |
| movaps 23(%eax), %xmm3 |
| movaps 39(%eax), %xmm4 |
| movaps 55(%eax), %xmm5 |
| movaps %xmm5, %xmm7 |
| palignr $9, %xmm4, %xmm5 |
| palignr $9, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $9, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $9, %xmm1, %xmm2 |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl9LoopStart) |
| |
| L(Shl9LoopLeave): |
| add $32, %ecx |
| jle L(shl_end_0) |
| |
| movaps 7(%eax), %xmm2 |
| movaps 23(%eax), %xmm3 |
| palignr $9, %xmm2, %xmm3 |
| palignr $9, %xmm1, %xmm2 |
| |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx |
| lea 32(%eax, %ecx), %eax |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(sh_9_no_prefetch): /* palignr $9 copy without prefetch, 32 bytes per step */ |
| lea -32(%ecx), %ecx /* bias the count by -32 for the sub/jb and sub/jae tests below */ |
| lea -9(%eax), %eax /* eax = base 9 bytes below the source pointer; the movdqa loads require it to be 16-byte aligned */ |
| xor %edi, %edi /* edi = running byte offset, starts at 0 */ |
| |
| .p2align 4 |
| L(sh_9_no_prefetch_loop): /* unrolled twice; the carry-in chunk alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags tested by jb below; the SSE moves and lea in between leave them intact */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* keep the upper chunk as carry-in for the second half */ |
| palignr $9, %xmm2, %xmm3 |
| palignr $9, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) /* offsets -32/-16 because edi was already advanced */ |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_9_end_no_prefetch_loop) |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* keep the upper chunk as carry-in for the first half */ |
| palignr $9, %xmm2, %xmm3 |
| palignr $9, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_9_no_prefetch_loop) |
| |
| L(sh_9_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the -32 bias: ecx = tail byte count */ |
| add %ecx, %edi /* edi = bytes copied by the loop + tail */ |
| add %edi, %edx /* edx advanced to the end of dst */ |
| lea 9(%edi, %eax), %eax /* eax advanced likewise; the +9 undoes the -9 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(shl_10): /* forward copy using palignr $10; the movaps accesses below require (%eax - 10) and %edx to be 16-byte aligned */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -10(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| #else |
| movl DEST+4(%esp), %edi /* NOTE(review): presumably the dst argument, +4 for the pushed %edi - confirm */ |
| movaps -10(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| movdqu %xmm0, (%edi) /* unaligned store of xmm0, which is loaded before this chunk */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx /* compare the byte count in ecx with the compile-time threshold */ |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = GOT base for the GOTOFF access below */ |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* compare the byte count with the run-time threshold */ |
| # else |
| cmp __x86_data_cache_size_half, %ecx /* compare the byte count with the run-time threshold */ |
| # endif |
| #endif |
| jb L(sh_10_no_prefetch) /* below the threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by -64 so the loop can end on the flags of its own sub */ |
| |
| .p2align 4 |
| L(Shl10LoopStart): /* copies 64 bytes per iteration */ |
| prefetcht0 0x1c0(%eax) /* prefetch 0x1c0 (448) bytes ahead of src */ |
| prefetcht0 0x1c0(%edx) /* and the same distance ahead of dst */ |
| movaps 6(%eax), %xmm2 /* four consecutive aligned 16-byte source chunks */ |
| movaps 22(%eax), %xmm3 |
| movaps 38(%eax), %xmm4 |
| movaps 54(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* save the last chunk; the palignr below overwrites xmm5 */ |
| palignr $10, %xmm4, %xmm5 /* xmm5 = 16 bytes starting at byte 10 of the xmm4 (low), xmm5 (high) pair */ |
| palignr $10, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $10, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $10, %xmm1, %xmm2 /* first 16 output bytes use the chunk carried in xmm1 */ |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 /* the saved last chunk is the carry-in for the next 16 bytes */ |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl10LoopStart) /* repeat while the biased count stays above zero */ |
| |
| L(Shl10LoopLeave): |
| add $32, %ecx /* reduce the bias from -64 to -32 */ |
| jle L(shl_end_0) /* result <= 0: no further 32-byte block, finish in shl_end_0 */ |
| |
| movaps 6(%eax), %xmm2 /* otherwise copy one more 32-byte block */ |
| movaps 22(%eax), %xmm3 |
| palignr $10, %xmm2, %xmm3 |
| palignr $10, %xmm1, %xmm2 |
| |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* edx += 32 just stored + ecx tail bytes */ |
| lea 32(%eax, %ecx), %eax /* eax advanced by the same amount */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(sh_10_no_prefetch): /* palignr $10 copy without prefetch, 32 bytes per step */ |
| lea -32(%ecx), %ecx /* bias the count by -32 for the sub/jb and sub/jae tests below */ |
| lea -10(%eax), %eax /* eax = base 10 bytes below the source pointer; the movdqa loads require it to be 16-byte aligned */ |
| xor %edi, %edi /* edi = running byte offset, starts at 0 */ |
| |
| .p2align 4 |
| L(sh_10_no_prefetch_loop): /* unrolled twice; the carry-in chunk alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags tested by jb below; the SSE moves and lea in between leave them intact */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* keep the upper chunk as carry-in for the second half */ |
| palignr $10, %xmm2, %xmm3 |
| palignr $10, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) /* offsets -32/-16 because edi was already advanced */ |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_10_end_no_prefetch_loop) |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* keep the upper chunk as carry-in for the first half */ |
| palignr $10, %xmm2, %xmm3 |
| palignr $10, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_10_no_prefetch_loop) |
| |
| L(sh_10_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the -32 bias: ecx = tail byte count */ |
| add %ecx, %edi /* edi = bytes copied by the loop + tail */ |
| add %edi, %edx /* edx advanced to the end of dst */ |
| lea 10(%edi, %eax), %eax /* eax advanced likewise; the +10 undoes the -10 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(shl_11): /* forward copy using palignr $11; the movaps accesses below require (%eax - 11) and %edx to be 16-byte aligned */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -11(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| #else |
| movl DEST+4(%esp), %edi /* NOTE(review): presumably the dst argument, +4 for the pushed %edi - confirm */ |
| movaps -11(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| movdqu %xmm0, (%edi) /* unaligned store of xmm0, which is loaded before this chunk */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx /* compare the byte count in ecx with the compile-time threshold */ |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = GOT base for the GOTOFF access below */ |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* compare the byte count with the run-time threshold */ |
| # else |
| cmp __x86_data_cache_size_half, %ecx /* compare the byte count with the run-time threshold */ |
| # endif |
| #endif |
| jb L(sh_11_no_prefetch) /* below the threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by -64 so the loop can end on the flags of its own sub */ |
| |
| .p2align 4 |
| L(Shl11LoopStart): /* copies 64 bytes per iteration */ |
| prefetcht0 0x1c0(%eax) /* prefetch 0x1c0 (448) bytes ahead of src */ |
| prefetcht0 0x1c0(%edx) /* and the same distance ahead of dst */ |
| movaps 5(%eax), %xmm2 /* four consecutive aligned 16-byte source chunks */ |
| movaps 21(%eax), %xmm3 |
| movaps 37(%eax), %xmm4 |
| movaps 53(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* save the last chunk; the palignr below overwrites xmm5 */ |
| palignr $11, %xmm4, %xmm5 /* xmm5 = 16 bytes starting at byte 11 of the xmm4 (low), xmm5 (high) pair */ |
| palignr $11, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $11, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $11, %xmm1, %xmm2 /* first 16 output bytes use the chunk carried in xmm1 */ |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 /* the saved last chunk is the carry-in for the next 16 bytes */ |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl11LoopStart) /* repeat while the biased count stays above zero */ |
| |
| L(Shl11LoopLeave): |
| add $32, %ecx /* reduce the bias from -64 to -32 */ |
| jle L(shl_end_0) /* result <= 0: no further 32-byte block, finish in shl_end_0 */ |
| |
| movaps 5(%eax), %xmm2 /* otherwise copy one more 32-byte block */ |
| movaps 21(%eax), %xmm3 |
| palignr $11, %xmm2, %xmm3 |
| palignr $11, %xmm1, %xmm2 |
| |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* edx += 32 just stored + ecx tail bytes */ |
| lea 32(%eax, %ecx), %eax /* eax advanced by the same amount */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(sh_11_no_prefetch): /* palignr $11 copy without prefetch, 32 bytes per step */ |
| lea -32(%ecx), %ecx /* bias the count by -32 for the sub/jb and sub/jae tests below */ |
| lea -11(%eax), %eax /* eax = base 11 bytes below the source pointer; the movdqa loads require it to be 16-byte aligned */ |
| xor %edi, %edi /* edi = running byte offset, starts at 0 */ |
| |
| .p2align 4 |
| L(sh_11_no_prefetch_loop): /* unrolled twice; the carry-in chunk alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags tested by jb below; the SSE moves and lea in between leave them intact */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* keep the upper chunk as carry-in for the second half */ |
| palignr $11, %xmm2, %xmm3 |
| palignr $11, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) /* offsets -32/-16 because edi was already advanced */ |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_11_end_no_prefetch_loop) |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* keep the upper chunk as carry-in for the first half */ |
| palignr $11, %xmm2, %xmm3 |
| palignr $11, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_11_no_prefetch_loop) |
| |
| L(sh_11_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the -32 bias: ecx = tail byte count */ |
| add %ecx, %edi /* edi = bytes copied by the loop + tail */ |
| add %edi, %edx /* edx advanced to the end of dst */ |
| lea 11(%edi, %eax), %eax /* eax advanced likewise; the +11 undoes the -11 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(shl_12): /* forward copy using palignr $12; the movaps accesses below require (%eax - 12) and %edx to be 16-byte aligned */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -12(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| #else |
| movl DEST+4(%esp), %edi /* NOTE(review): presumably the dst argument, +4 for the pushed %edi - confirm */ |
| movaps -12(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| movdqu %xmm0, (%edi) /* unaligned store of xmm0, which is loaded before this chunk */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx /* compare the byte count in ecx with the compile-time threshold */ |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = GOT base for the GOTOFF access below */ |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* compare the byte count with the run-time threshold */ |
| # else |
| cmp __x86_data_cache_size_half, %ecx /* compare the byte count with the run-time threshold */ |
| # endif |
| #endif |
| jb L(sh_12_no_prefetch) /* below the threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by -64 so the loop can end on the flags of its own sub */ |
| |
| .p2align 4 |
| L(Shl12LoopStart): /* copies 64 bytes per iteration */ |
| prefetcht0 0x1c0(%eax) /* prefetch 0x1c0 (448) bytes ahead of src */ |
| prefetcht0 0x1c0(%edx) /* and the same distance ahead of dst */ |
| movaps 4(%eax), %xmm2 /* four consecutive aligned 16-byte source chunks */ |
| movaps 20(%eax), %xmm3 |
| movaps 36(%eax), %xmm4 |
| movaps 52(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* save the last chunk; the palignr below overwrites xmm5 */ |
| palignr $12, %xmm4, %xmm5 /* xmm5 = 16 bytes starting at byte 12 of the xmm4 (low), xmm5 (high) pair */ |
| palignr $12, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $12, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $12, %xmm1, %xmm2 /* first 16 output bytes use the chunk carried in xmm1 */ |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 /* the saved last chunk is the carry-in for the next 16 bytes */ |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl12LoopStart) /* repeat while the biased count stays above zero */ |
| |
| L(Shl12LoopLeave): |
| add $32, %ecx /* reduce the bias from -64 to -32 */ |
| jle L(shl_end_0) /* result <= 0: no further 32-byte block, finish in shl_end_0 */ |
| |
| movaps 4(%eax), %xmm2 /* otherwise copy one more 32-byte block */ |
| movaps 20(%eax), %xmm3 |
| palignr $12, %xmm2, %xmm3 |
| palignr $12, %xmm1, %xmm2 |
| |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* edx += 32 just stored + ecx tail bytes */ |
| lea 32(%eax, %ecx), %eax /* eax advanced by the same amount */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(sh_12_no_prefetch): /* palignr $12 copy without prefetch, 32 bytes per step */ |
| lea -32(%ecx), %ecx /* bias the count by -32 for the sub/jb and sub/jae tests below */ |
| lea -12(%eax), %eax /* eax = base 12 bytes below the source pointer; the movdqa loads require it to be 16-byte aligned */ |
| xor %edi, %edi /* edi = running byte offset, starts at 0 */ |
| |
| .p2align 4 |
| L(sh_12_no_prefetch_loop): /* unrolled twice; the carry-in chunk alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags tested by jb below; the SSE moves and lea in between leave them intact */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* keep the upper chunk as carry-in for the second half */ |
| palignr $12, %xmm2, %xmm3 |
| palignr $12, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) /* offsets -32/-16 because edi was already advanced */ |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_12_end_no_prefetch_loop) |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* keep the upper chunk as carry-in for the first half */ |
| palignr $12, %xmm2, %xmm3 |
| palignr $12, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_12_no_prefetch_loop) |
| |
| L(sh_12_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the -32 bias: ecx = tail byte count */ |
| add %ecx, %edi /* edi = bytes copied by the loop + tail */ |
| add %edi, %edx /* edx advanced to the end of dst */ |
| lea 12(%edi, %eax), %eax /* eax advanced likewise; the +12 undoes the -12 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(shl_13): /* forward copy using palignr $13; the movaps accesses below require (%eax - 13) and %edx to be 16-byte aligned */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -13(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| #else |
| movl DEST+4(%esp), %edi /* NOTE(review): presumably the dst argument, +4 for the pushed %edi - confirm */ |
| movaps -13(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| movdqu %xmm0, (%edi) /* unaligned store of xmm0, which is loaded before this chunk */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx /* compare the byte count in ecx with the compile-time threshold */ |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = GOT base for the GOTOFF access below */ |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* compare the byte count with the run-time threshold */ |
| # else |
| cmp __x86_data_cache_size_half, %ecx /* compare the byte count with the run-time threshold */ |
| # endif |
| #endif |
| jb L(sh_13_no_prefetch) /* below the threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by -64 so the loop can end on the flags of its own sub */ |
| |
| .p2align 4 |
| L(Shl13LoopStart): /* copies 64 bytes per iteration */ |
| prefetcht0 0x1c0(%eax) /* prefetch 0x1c0 (448) bytes ahead of src */ |
| prefetcht0 0x1c0(%edx) /* and the same distance ahead of dst */ |
| movaps 3(%eax), %xmm2 /* four consecutive aligned 16-byte source chunks */ |
| movaps 19(%eax), %xmm3 |
| movaps 35(%eax), %xmm4 |
| movaps 51(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* save the last chunk; the palignr below overwrites xmm5 */ |
| palignr $13, %xmm4, %xmm5 /* xmm5 = 16 bytes starting at byte 13 of the xmm4 (low), xmm5 (high) pair */ |
| palignr $13, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $13, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $13, %xmm1, %xmm2 /* first 16 output bytes use the chunk carried in xmm1 */ |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 /* the saved last chunk is the carry-in for the next 16 bytes */ |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl13LoopStart) /* repeat while the biased count stays above zero */ |
| |
| L(Shl13LoopLeave): |
| add $32, %ecx /* reduce the bias from -64 to -32 */ |
| jle L(shl_end_0) /* result <= 0: no further 32-byte block, finish in shl_end_0 */ |
| |
| movaps 3(%eax), %xmm2 /* otherwise copy one more 32-byte block */ |
| movaps 19(%eax), %xmm3 |
| palignr $13, %xmm2, %xmm3 |
| palignr $13, %xmm1, %xmm2 |
| |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* edx += 32 just stored + ecx tail bytes */ |
| lea 32(%eax, %ecx), %eax /* eax advanced by the same amount */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(sh_13_no_prefetch): /* palignr $13 copy without prefetch, 32 bytes per step */ |
| lea -32(%ecx), %ecx /* bias the count by -32 for the sub/jb and sub/jae tests below */ |
| lea -13(%eax), %eax /* eax = base 13 bytes below the source pointer; the movdqa loads require it to be 16-byte aligned */ |
| xor %edi, %edi /* edi = running byte offset, starts at 0 */ |
| |
| .p2align 4 |
| L(sh_13_no_prefetch_loop): /* unrolled twice; the carry-in chunk alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags tested by jb below; the SSE moves and lea in between leave them intact */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* keep the upper chunk as carry-in for the second half */ |
| palignr $13, %xmm2, %xmm3 |
| palignr $13, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) /* offsets -32/-16 because edi was already advanced */ |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_13_end_no_prefetch_loop) |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* keep the upper chunk as carry-in for the first half */ |
| palignr $13, %xmm2, %xmm3 |
| palignr $13, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_13_no_prefetch_loop) |
| |
| L(sh_13_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the -32 bias: ecx = tail byte count */ |
| add %ecx, %edi /* edi = bytes copied by the loop + tail */ |
| add %edi, %edx /* edx advanced to the end of dst */ |
| lea 13(%edi, %eax), %eax /* eax advanced likewise; the +13 undoes the -13 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(shl_14): /* forward copy using palignr $14; the movaps accesses below require (%eax - 14) and %edx to be 16-byte aligned */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -14(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| #else |
| movl DEST+4(%esp), %edi /* NOTE(review): presumably the dst argument, +4 for the pushed %edi - confirm */ |
| movaps -14(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| movdqu %xmm0, (%edi) /* unaligned store of xmm0, which is loaded before this chunk */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx /* compare the byte count in ecx with the compile-time threshold */ |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = GOT base for the GOTOFF access below */ |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* compare the byte count with the run-time threshold */ |
| # else |
| cmp __x86_data_cache_size_half, %ecx /* compare the byte count with the run-time threshold */ |
| # endif |
| #endif |
| jb L(sh_14_no_prefetch) /* below the threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by -64 so the loop can end on the flags of its own sub */ |
| |
| .p2align 4 |
| L(Shl14LoopStart): /* copies 64 bytes per iteration */ |
| prefetcht0 0x1c0(%eax) /* prefetch 0x1c0 (448) bytes ahead of src */ |
| prefetcht0 0x1c0(%edx) /* and the same distance ahead of dst */ |
| movaps 2(%eax), %xmm2 /* four consecutive aligned 16-byte source chunks */ |
| movaps 18(%eax), %xmm3 |
| movaps 34(%eax), %xmm4 |
| movaps 50(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* save the last chunk; the palignr below overwrites xmm5 */ |
| palignr $14, %xmm4, %xmm5 /* xmm5 = 16 bytes starting at byte 14 of the xmm4 (low), xmm5 (high) pair */ |
| palignr $14, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $14, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $14, %xmm1, %xmm2 /* first 16 output bytes use the chunk carried in xmm1 */ |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 /* the saved last chunk is the carry-in for the next 16 bytes */ |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl14LoopStart) /* repeat while the biased count stays above zero */ |
| |
| L(Shl14LoopLeave): |
| add $32, %ecx /* reduce the bias from -64 to -32 */ |
| jle L(shl_end_0) /* result <= 0: no further 32-byte block, finish in shl_end_0 */ |
| |
| movaps 2(%eax), %xmm2 /* otherwise copy one more 32-byte block */ |
| movaps 18(%eax), %xmm3 |
| palignr $14, %xmm2, %xmm3 |
| palignr $14, %xmm1, %xmm2 |
| |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* edx += 32 just stored + ecx tail bytes */ |
| lea 32(%eax, %ecx), %eax /* eax advanced by the same amount */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(sh_14_no_prefetch): /* palignr $14 copy without prefetch, 32 bytes per step */ |
| lea -32(%ecx), %ecx /* bias the count by -32 for the sub/jb and sub/jae tests below */ |
| lea -14(%eax), %eax /* eax = base 14 bytes below the source pointer; the movdqa loads require it to be 16-byte aligned */ |
| xor %edi, %edi /* edi = running byte offset, starts at 0 */ |
| |
| .p2align 4 |
| L(sh_14_no_prefetch_loop): /* unrolled twice; the carry-in chunk alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags tested by jb below; the SSE moves and lea in between leave them intact */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* keep the upper chunk as carry-in for the second half */ |
| palignr $14, %xmm2, %xmm3 |
| palignr $14, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) /* offsets -32/-16 because edi was already advanced */ |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_14_end_no_prefetch_loop) |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* keep the upper chunk as carry-in for the first half */ |
| palignr $14, %xmm2, %xmm3 |
| palignr $14, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_14_no_prefetch_loop) |
| |
| L(sh_14_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the -32 bias: ecx = tail byte count */ |
| add %ecx, %edi /* edi = bytes copied by the loop + tail */ |
| add %edi, %edx /* edx advanced to the end of dst */ |
| lea 14(%edi, %eax), %eax /* eax advanced likewise; the +14 undoes the -14 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(shl_15): /* forward copy using palignr $15; the movaps accesses below require (%eax - 15) and %edx to be 16-byte aligned */ |
| #ifndef USE_AS_MEMMOVE |
| movaps -15(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| #else |
| movl DEST+4(%esp), %edi /* NOTE(review): presumably the dst argument, +4 for the pushed %edi - confirm */ |
| movaps -15(%eax), %xmm1 /* xmm1 = aligned chunk holding the first source bytes (carry-in for palignr) */ |
| movdqu %xmm0, (%edi) /* unaligned store of xmm0, which is loaded before this chunk */ |
| #endif |
| #ifdef DATA_CACHE_SIZE_HALF |
| cmp $DATA_CACHE_SIZE_HALF, %ecx /* compare the byte count in ecx with the compile-time threshold */ |
| #else |
| # if (defined SHARED || defined __PIC__) |
| SETUP_PIC_REG(bx) |
| add $_GLOBAL_OFFSET_TABLE_, %ebx /* ebx = GOT base for the GOTOFF access below */ |
| cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx /* compare the byte count with the run-time threshold */ |
| # else |
| cmp __x86_data_cache_size_half, %ecx /* compare the byte count with the run-time threshold */ |
| # endif |
| #endif |
| jb L(sh_15_no_prefetch) /* below the threshold: use the loop without prefetch */ |
| |
| lea -64(%ecx), %ecx /* bias the count by -64 so the loop can end on the flags of its own sub */ |
| |
| .p2align 4 |
| L(Shl15LoopStart): /* copies 64 bytes per iteration */ |
| prefetcht0 0x1c0(%eax) /* prefetch 0x1c0 (448) bytes ahead of src */ |
| prefetcht0 0x1c0(%edx) /* and the same distance ahead of dst */ |
| movaps 1(%eax), %xmm2 /* four consecutive aligned 16-byte source chunks */ |
| movaps 17(%eax), %xmm3 |
| movaps 33(%eax), %xmm4 |
| movaps 49(%eax), %xmm5 |
| movaps %xmm5, %xmm7 /* save the last chunk; the palignr below overwrites xmm5 */ |
| palignr $15, %xmm4, %xmm5 /* xmm5 = 16 bytes starting at byte 15 of the xmm4 (low), xmm5 (high) pair */ |
| palignr $15, %xmm3, %xmm4 |
| movaps %xmm5, 48(%edx) |
| palignr $15, %xmm2, %xmm3 |
| lea 64(%eax), %eax |
| palignr $15, %xmm1, %xmm2 /* first 16 output bytes use the chunk carried in xmm1 */ |
| movaps %xmm4, 32(%edx) |
| movaps %xmm3, 16(%edx) |
| movaps %xmm7, %xmm1 /* the saved last chunk is the carry-in for the next 16 bytes */ |
| movaps %xmm2, (%edx) |
| lea 64(%edx), %edx |
| sub $64, %ecx |
| ja L(Shl15LoopStart) /* repeat while the biased count stays above zero */ |
| |
| L(Shl15LoopLeave): |
| add $32, %ecx /* reduce the bias from -64 to -32 */ |
| jle L(shl_end_0) /* result <= 0: no further 32-byte block, finish in shl_end_0 */ |
| |
| movaps 1(%eax), %xmm2 /* otherwise copy one more 32-byte block */ |
| movaps 17(%eax), %xmm3 |
| palignr $15, %xmm2, %xmm3 |
| palignr $15, %xmm1, %xmm2 |
| |
| movaps %xmm2, (%edx) |
| movaps %xmm3, 16(%edx) |
| lea 32(%edx, %ecx), %edx /* edx += 32 just stored + ecx tail bytes */ |
| lea 32(%eax, %ecx), %eax /* eax advanced by the same amount */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(sh_15_no_prefetch): /* palignr $15 copy without prefetch, 32 bytes per step */ |
| lea -32(%ecx), %ecx /* bias the count by -32 for the sub/jb and sub/jae tests below */ |
| lea -15(%eax), %eax /* eax = base 15 bytes below the source pointer; the movdqa loads require it to be 16-byte aligned */ |
| xor %edi, %edi /* edi = running byte offset, starts at 0 */ |
| |
| .p2align 4 |
| L(sh_15_no_prefetch_loop): /* unrolled twice; the carry-in chunk alternates between xmm1 and xmm4 */ |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx /* sets the flags tested by jb below; the SSE moves and lea in between leave them intact */ |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm4 /* keep the upper chunk as carry-in for the second half */ |
| palignr $15, %xmm2, %xmm3 |
| palignr $15, %xmm1, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) /* offsets -32/-16 because edi was already advanced */ |
| movdqa %xmm3, -16(%edx, %edi) |
| jb L(sh_15_end_no_prefetch_loop) |
| |
| movdqa 16(%eax, %edi), %xmm2 |
| sub $32, %ecx |
| movdqa 32(%eax, %edi), %xmm3 |
| movdqa %xmm3, %xmm1 /* keep the upper chunk as carry-in for the first half */ |
| palignr $15, %xmm2, %xmm3 |
| palignr $15, %xmm4, %xmm2 |
| lea 32(%edi), %edi |
| movdqa %xmm2, -32(%edx, %edi) |
| movdqa %xmm3, -16(%edx, %edi) |
| jae L(sh_15_no_prefetch_loop) |
| |
| L(sh_15_end_no_prefetch_loop): |
| lea 32(%ecx), %ecx /* undo the -32 bias: ecx = tail byte count */ |
| add %ecx, %edi /* edi = bytes copied by the loop + tail */ |
| add %edi, %edx /* edx advanced to the end of dst */ |
| lea 15(%edi, %eax), %eax /* eax advanced likewise; the +15 undoes the -15 above */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(shl_end_0): /* shared exit of the ShlNLoopLeave paths, reached when their add $32 result is <= 0 */ |
| lea 32(%ecx), %ecx /* undo the remaining -32 bias: ecx = tail byte count */ |
| lea (%edx, %ecx), %edx /* advance dst past the tail; the tail handlers index backwards from here */ |
| lea (%eax, %ecx), %eax /* advance src the same way */ |
| POP (%edi) |
| BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| .p2align 4 |
| L(fwd_write_44bytes): /* forward tail copy: offsets are negative from eax (src) and edx (dst); each entry moves 8 bytes and falls through to the entry 8 bytes shorter */ |
| movq -44(%eax), %xmm0 |
| movq %xmm0, -44(%edx) |
| L(fwd_write_36bytes): |
| movq -36(%eax), %xmm0 |
| movq %xmm0, -36(%edx) |
| L(fwd_write_28bytes): |
| movq -28(%eax), %xmm0 |
| movq %xmm0, -28(%edx) |
| L(fwd_write_20bytes): |
| movq -20(%eax), %xmm0 |
| movq %xmm0, -20(%edx) |
| L(fwd_write_12bytes): |
| movq -12(%eax), %xmm0 |
| movq %xmm0, -12(%edx) |
| L(fwd_write_4bytes): |
| movl -4(%eax), %ecx /* the last 4 bytes go through ecx */ |
| movl %ecx, -4(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_40bytes): /* forward tail copy for multiples of 8: 8 bytes per entry at negative offsets from eax/edx, falling through to the shorter entries */ |
| movq -40(%eax), %xmm0 |
| movq %xmm0, -40(%edx) |
| L(fwd_write_32bytes): |
| movq -32(%eax), %xmm0 |
| movq %xmm0, -32(%edx) |
| L(fwd_write_24bytes): |
| movq -24(%eax), %xmm0 |
| movq %xmm0, -24(%edx) |
| L(fwd_write_16bytes): |
| movq -16(%eax), %xmm0 |
| movq %xmm0, -16(%edx) |
| L(fwd_write_8bytes): |
| movq -8(%eax), %xmm0 |
| movq %xmm0, -8(%edx) |
| L(fwd_write_0bytes): /* nothing left to copy: only set the return value */ |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_5bytes): /* 5-byte tail done as two overlapping 4-byte moves */ |
| movl -5(%eax), %ecx |
| movl -4(%eax), %eax /* last read through eax, so the source pointer can be overwritten */ |
| movl %ecx, -5(%edx) |
| movl %eax, -4(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_45bytes): /* forward tail copy for lengths 13 + 8k: 8 bytes per entry at negative offsets from eax/edx, falling through */ |
| movq -45(%eax), %xmm0 |
| movq %xmm0, -45(%edx) |
| L(fwd_write_37bytes): |
| movq -37(%eax), %xmm0 |
| movq %xmm0, -37(%edx) |
| L(fwd_write_29bytes): |
| movq -29(%eax), %xmm0 |
| movq %xmm0, -29(%edx) |
| L(fwd_write_21bytes): |
| movq -21(%eax), %xmm0 |
| movq %xmm0, -21(%edx) |
| L(fwd_write_13bytes): /* final 13 bytes as 8 + 4 + 1 */ |
| movq -13(%eax), %xmm0 |
| movq %xmm0, -13(%edx) |
| movl -5(%eax), %ecx |
| movl %ecx, -5(%edx) |
| movzbl -1(%eax), %ecx |
| movb %cl, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_41bytes): /* forward tail copy for lengths 1 + 8k: 8 bytes per entry at negative offsets from eax/edx, falling through */ |
| movq -41(%eax), %xmm0 |
| movq %xmm0, -41(%edx) |
| L(fwd_write_33bytes): |
| movq -33(%eax), %xmm0 |
| movq %xmm0, -33(%edx) |
| L(fwd_write_25bytes): |
| movq -25(%eax), %xmm0 |
| movq %xmm0, -25(%edx) |
| L(fwd_write_17bytes): |
| movq -17(%eax), %xmm0 |
| movq %xmm0, -17(%edx) |
| L(fwd_write_9bytes): |
| movq -9(%eax), %xmm0 |
| movq %xmm0, -9(%edx) |
| L(fwd_write_1bytes): |
| movzbl -1(%eax), %ecx /* the last byte goes through cl */ |
| movb %cl, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_46bytes): /* forward tail copy for lengths 6 + 8k: 8 bytes per entry at negative offsets from eax/edx, falling through */ |
| movq -46(%eax), %xmm0 |
| movq %xmm0, -46(%edx) |
| L(fwd_write_38bytes): |
| movq -38(%eax), %xmm0 |
| movq %xmm0, -38(%edx) |
| L(fwd_write_30bytes): |
| movq -30(%eax), %xmm0 |
| movq %xmm0, -30(%edx) |
| L(fwd_write_22bytes): |
| movq -22(%eax), %xmm0 |
| movq %xmm0, -22(%edx) |
| L(fwd_write_14bytes): |
| movq -14(%eax), %xmm0 |
| movq %xmm0, -14(%edx) |
| L(fwd_write_6bytes): /* final 6 bytes as 4 + 2 */ |
| movl -6(%eax), %ecx |
| movl %ecx, -6(%edx) |
| movzwl -2(%eax), %ecx |
| movw %cx, -2(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_42bytes): /* forward tail copy for lengths 2 + 8k: 8 bytes per entry at negative offsets from eax/edx, falling through */ |
| movq -42(%eax), %xmm0 |
| movq %xmm0, -42(%edx) |
| L(fwd_write_34bytes): |
| movq -34(%eax), %xmm0 |
| movq %xmm0, -34(%edx) |
| L(fwd_write_26bytes): |
| movq -26(%eax), %xmm0 |
| movq %xmm0, -26(%edx) |
| L(fwd_write_18bytes): |
| movq -18(%eax), %xmm0 |
| movq %xmm0, -18(%edx) |
| L(fwd_write_10bytes): |
| movq -10(%eax), %xmm0 |
| movq %xmm0, -10(%edx) |
| L(fwd_write_2bytes): |
| movzwl -2(%eax), %ecx /* the last 2 bytes go through cx */ |
| movw %cx, -2(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_47bytes): /* forward tail copy for lengths 7 + 8k: 8 bytes per entry at negative offsets from eax/edx, falling through */ |
| movq -47(%eax), %xmm0 |
| movq %xmm0, -47(%edx) |
| L(fwd_write_39bytes): |
| movq -39(%eax), %xmm0 |
| movq %xmm0, -39(%edx) |
| L(fwd_write_31bytes): |
| movq -31(%eax), %xmm0 |
| movq %xmm0, -31(%edx) |
| L(fwd_write_23bytes): |
| movq -23(%eax), %xmm0 |
| movq %xmm0, -23(%edx) |
| L(fwd_write_15bytes): |
| movq -15(%eax), %xmm0 |
| movq %xmm0, -15(%edx) |
| L(fwd_write_7bytes): /* final 7 bytes as 4 + 2 + 1 */ |
| movl -7(%eax), %ecx |
| movl %ecx, -7(%edx) |
| movzwl -3(%eax), %ecx |
| movzbl -1(%eax), %eax /* last read through eax, so the source pointer can be overwritten */ |
| movw %cx, -3(%edx) |
| movb %al, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_43bytes): /* forward tail copy for lengths 3 + 8k: 8 bytes per entry at negative offsets from eax/edx, falling through */ |
| movq -43(%eax), %xmm0 |
| movq %xmm0, -43(%edx) |
| L(fwd_write_35bytes): |
| movq -35(%eax), %xmm0 |
| movq %xmm0, -35(%edx) |
| L(fwd_write_27bytes): |
| movq -27(%eax), %xmm0 |
| movq %xmm0, -27(%edx) |
| L(fwd_write_19bytes): |
| movq -19(%eax), %xmm0 |
| movq %xmm0, -19(%edx) |
| L(fwd_write_11bytes): |
| movq -11(%eax), %xmm0 |
| movq %xmm0, -11(%edx) |
| L(fwd_write_3bytes): /* final 3 bytes as 2 + 1 */ |
| movzwl -3(%eax), %ecx |
| movzbl -1(%eax), %eax /* last read through eax, so the source pointer can be overwritten */ |
| movw %cx, -3(%edx) |
| movb %al, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_40bytes_align): /* aligned forward tail: movdqa requires (%eax - 40) and (%edx - 40) to be 16-byte aligned; 16 bytes per entry, falling through */ |
| movdqa -40(%eax), %xmm0 |
| movdqa %xmm0, -40(%edx) |
| L(fwd_write_24bytes_align): |
| movdqa -24(%eax), %xmm0 |
| movdqa %xmm0, -24(%edx) |
| L(fwd_write_8bytes_align): |
| movq -8(%eax), %xmm0 /* final 8 bytes with a 64-bit move */ |
| movq %xmm0, -8(%edx) |
| L(fwd_write_0bytes_align): /* nothing left to copy: only set the return value */ |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_32bytes_align): /* aligned forward tail: movdqa requires (%eax - 32) and (%edx - 32) to be 16-byte aligned */ |
| movdqa -32(%eax), %xmm0 |
| movdqa %xmm0, -32(%edx) |
| L(fwd_write_16bytes_align): |
| movdqa -16(%eax), %xmm0 |
| movdqa %xmm0, -16(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_5bytes_align): /* 5-byte tail done as two overlapping 4-byte moves */ |
| movl -5(%eax), %ecx |
| movl -4(%eax), %eax /* last read through eax, so the source pointer can be overwritten */ |
| movl %ecx, -5(%edx) |
| movl %eax, -4(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_45bytes_align): /* aligned forward tail: movdqa requires (%eax - 45) and (%edx - 45) to be 16-byte aligned; 16 bytes per entry, falling through */ |
| movdqa -45(%eax), %xmm0 |
| movdqa %xmm0, -45(%edx) |
| L(fwd_write_29bytes_align): |
| movdqa -29(%eax), %xmm0 |
| movdqa %xmm0, -29(%edx) |
| L(fwd_write_13bytes_align): /* final 13 bytes as 8 + 4 + 1 */ |
| movq -13(%eax), %xmm0 |
| movq %xmm0, -13(%edx) |
| movl -5(%eax), %ecx |
| movl %ecx, -5(%edx) |
| movzbl -1(%eax), %ecx |
| movb %cl, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_37bytes_align): /* aligned forward tail: movdqa requires (%eax - 37) and (%edx - 37) to be 16-byte aligned */ |
| movdqa -37(%eax), %xmm0 |
| movdqa %xmm0, -37(%edx) |
| L(fwd_write_21bytes_align): /* 16 aligned bytes, then the final 5 as 4 + 1 */ |
| movdqa -21(%eax), %xmm0 |
| movdqa %xmm0, -21(%edx) |
| movl -5(%eax), %ecx |
| movl %ecx, -5(%edx) |
| movzbl -1(%eax), %ecx |
| movb %cl, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_41bytes_align): /* aligned forward tail: movdqa requires (%eax - 41) and (%edx - 41) to be 16-byte aligned; 16 bytes per entry, falling through */ |
| movdqa -41(%eax), %xmm0 |
| movdqa %xmm0, -41(%edx) |
| L(fwd_write_25bytes_align): |
| movdqa -25(%eax), %xmm0 |
| movdqa %xmm0, -25(%edx) |
| L(fwd_write_9bytes_align): |
| movq -9(%eax), %xmm0 |
| movq %xmm0, -9(%edx) |
| L(fwd_write_1bytes_align): |
| movzbl -1(%eax), %ecx /* the last byte goes through cl */ |
| movb %cl, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_33bytes_align): /* aligned forward tail: movdqa requires (%eax - 33) and (%edx - 33) to be 16-byte aligned */ |
| movdqa -33(%eax), %xmm0 |
| movdqa %xmm0, -33(%edx) |
| L(fwd_write_17bytes_align): /* 16 aligned bytes, then the final byte */ |
| movdqa -17(%eax), %xmm0 |
| movdqa %xmm0, -17(%edx) |
| movzbl -1(%eax), %ecx |
| movb %cl, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_46bytes_align): /* aligned forward tail: movdqa requires (%eax - 46) and (%edx - 46) to be 16-byte aligned; 16 bytes per entry, falling through */ |
| movdqa -46(%eax), %xmm0 |
| movdqa %xmm0, -46(%edx) |
| L(fwd_write_30bytes_align): |
| movdqa -30(%eax), %xmm0 |
| movdqa %xmm0, -30(%edx) |
| L(fwd_write_14bytes_align): |
| movq -14(%eax), %xmm0 |
| movq %xmm0, -14(%edx) |
| L(fwd_write_6bytes_align): /* final 6 bytes as 4 + 2 */ |
| movl -6(%eax), %ecx |
| movl %ecx, -6(%edx) |
| movzwl -2(%eax), %ecx |
| movw %cx, -2(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_38bytes_align): /* aligned forward tail: movdqa requires (%eax - 38) and (%edx - 38) to be 16-byte aligned */ |
| movdqa -38(%eax), %xmm0 |
| movdqa %xmm0, -38(%edx) |
| L(fwd_write_22bytes_align): /* 16 aligned bytes, then the final 6 as 4 + 2 */ |
| movdqa -22(%eax), %xmm0 |
| movdqa %xmm0, -22(%edx) |
| movl -6(%eax), %ecx |
| movl %ecx, -6(%edx) |
| movzwl -2(%eax), %ecx |
| movw %cx, -2(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_42bytes_align): /* aligned forward tail: movdqa requires (%eax - 42) and (%edx - 42) to be 16-byte aligned; 16 bytes per entry, falling through */ |
| movdqa -42(%eax), %xmm0 |
| movdqa %xmm0, -42(%edx) |
| L(fwd_write_26bytes_align): |
| movdqa -26(%eax), %xmm0 |
| movdqa %xmm0, -26(%edx) |
| L(fwd_write_10bytes_align): |
| movq -10(%eax), %xmm0 |
| movq %xmm0, -10(%edx) |
| L(fwd_write_2bytes_align): |
| movzwl -2(%eax), %ecx /* the last 2 bytes go through cx */ |
| movw %cx, -2(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_34bytes_align): /* aligned forward tail: movdqa requires (%eax - 34) and (%edx - 34) to be 16-byte aligned */ |
| movdqa -34(%eax), %xmm0 |
| movdqa %xmm0, -34(%edx) |
| L(fwd_write_18bytes_align): /* 16 aligned bytes, then the final 2 */ |
| movdqa -18(%eax), %xmm0 |
| movdqa %xmm0, -18(%edx) |
| movzwl -2(%eax), %ecx |
| movw %cx, -2(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_47bytes_align): /* aligned forward tail: movdqa requires (%eax - 47) and (%edx - 47) to be 16-byte aligned; 16 bytes per entry, falling through */ |
| movdqa -47(%eax), %xmm0 |
| movdqa %xmm0, -47(%edx) |
| L(fwd_write_31bytes_align): |
| movdqa -31(%eax), %xmm0 |
| movdqa %xmm0, -31(%edx) |
| L(fwd_write_15bytes_align): |
| movq -15(%eax), %xmm0 |
| movq %xmm0, -15(%edx) |
| L(fwd_write_7bytes_align): /* final 7 bytes as 4 + 2 + 1 */ |
| movl -7(%eax), %ecx |
| movl %ecx, -7(%edx) |
| movzwl -3(%eax), %ecx |
| movzbl -1(%eax), %eax /* last read through eax, so the source pointer can be overwritten */ |
| movw %cx, -3(%edx) |
| movb %al, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_39bytes_align): /* aligned forward tail: movdqa requires (%eax - 39) and (%edx - 39) to be 16-byte aligned */ |
| movdqa -39(%eax), %xmm0 |
| movdqa %xmm0, -39(%edx) |
| L(fwd_write_23bytes_align): /* 16 aligned bytes, then the final 7 as 4 + 2 + 1 */ |
| movdqa -23(%eax), %xmm0 |
| movdqa %xmm0, -23(%edx) |
| movl -7(%eax), %ecx |
| movl %ecx, -7(%edx) |
| movzwl -3(%eax), %ecx |
| movzbl -1(%eax), %eax /* last read through eax, so the source pointer can be overwritten */ |
| movw %cx, -3(%edx) |
| movb %al, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_43bytes_align): /* aligned forward tail: movdqa requires (%eax - 43) and (%edx - 43) to be 16-byte aligned; 16 bytes per entry, falling through */ |
| movdqa -43(%eax), %xmm0 |
| movdqa %xmm0, -43(%edx) |
| L(fwd_write_27bytes_align): |
| movdqa -27(%eax), %xmm0 |
| movdqa %xmm0, -27(%edx) |
| L(fwd_write_11bytes_align): |
| movq -11(%eax), %xmm0 |
| movq %xmm0, -11(%edx) |
| L(fwd_write_3bytes_align): /* final 3 bytes as 2 + 1 */ |
| movzwl -3(%eax), %ecx |
| movzbl -1(%eax), %eax /* last read through eax, so the source pointer can be overwritten */ |
| movw %cx, -3(%edx) |
| movb %al, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_35bytes_align): /* aligned forward tail: movdqa requires (%eax - 35) and (%edx - 35) to be 16-byte aligned */ |
| movdqa -35(%eax), %xmm0 |
| movdqa %xmm0, -35(%edx) |
| L(fwd_write_19bytes_align): /* 16 aligned bytes, then the final 3 as 2 + 1 */ |
| movdqa -19(%eax), %xmm0 |
| movdqa %xmm0, -19(%edx) |
| movzwl -3(%eax), %ecx |
| movzbl -1(%eax), %eax /* last read through eax, so the source pointer can be overwritten */ |
| movw %cx, -3(%edx) |
| movb %al, -1(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_44bytes_align): /* aligned forward tail: movdqa requires (%eax - 44) and (%edx - 44) to be 16-byte aligned; 16 bytes per entry, falling through */ |
| movdqa -44(%eax), %xmm0 |
| movdqa %xmm0, -44(%edx) |
| L(fwd_write_28bytes_align): |
| movdqa -28(%eax), %xmm0 |
| movdqa %xmm0, -28(%edx) |
| L(fwd_write_12bytes_align): |
| movq -12(%eax), %xmm0 |
| movq %xmm0, -12(%edx) |
| L(fwd_write_4bytes_align): |
| movl -4(%eax), %ecx /* the last 4 bytes go through ecx */ |
| movl %ecx, -4(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(fwd_write_36bytes_align): /* aligned forward tail: movdqa requires (%eax - 36) and (%edx - 36) to be 16-byte aligned */ |
| movdqa -36(%eax), %xmm0 |
| movdqa %xmm0, -36(%edx) |
| L(fwd_write_20bytes_align): /* 16 aligned bytes, then the final 4 */ |
| movdqa -20(%eax), %xmm0 |
| movdqa %xmm0, -20(%edx) |
| movl -4(%eax), %ecx |
| movl %ecx, -4(%edx) |
| #ifdef USE_AS_MEMPCPY |
| movl %edx, %eax /* mempcpy build: return edx */ |
| #else |
| movl DEST(%esp), %eax /* otherwise return the DEST stack slot (presumably the dst argument) */ |
| #endif |
| RETURN_END /* NOTE(review): RETURN_END is defined outside this chunk; this is the only tail here that uses it instead of RETURN */ |
| |
| CFI_PUSH (%edi) /* NOTE(review): CFI_PUSH is defined outside this chunk; presumably re-records %edi as pushed for unwind info - confirm */ |
| |
| .p2align 4 |
| L(large_page): /* copy path that loads with movdqu and writes with non-temporal movntdq stores */ |
| movdqu (%eax), %xmm1 /* first 16 source bytes, unaligned load */ |
| #ifdef USE_AS_MEMMOVE |
| movl DEST+4(%esp), %edi /* NOTE(review): presumably the dst argument, +4 for the pushed %edi - confirm */ |
| movdqu %xmm0, (%edi) /* unaligned store of xmm0, which is loaded before this chunk */ |
| #endif |
| lea 16(%eax), %eax |
| movntdq %xmm1, (%edx) /* movntdq requires a 16-byte aligned address, so edx is aligned here */ |
| lea 16(%edx), %edx |
| lea -0x90(%ecx), %ecx /* NOTE(review): 0x90 looks like 16 (block just copied) + 0x80 (loop bias) - confirm against the caller */ |
| POP (%edi) |
| |
| .p2align 4 |
| L(large_page_loop): /* 128 bytes per iteration: 8 unaligned loads, then 8 non-temporal stores */ |
| movdqu (%eax), %xmm0 |
| movdqu 0x10(%eax), %xmm1 |
| movdqu 0x20(%eax), %xmm2 |
| movdqu 0x30(%eax), %xmm3 |
| movdqu 0x40(%eax), %xmm4 |
| movdqu 0x50(%eax), %xmm5 |
| movdqu 0x60(%eax), %xmm6 |
| movdqu 0x70(%eax), %xmm7 |
| lea 0x80(%eax), %eax |
| |
| sub $0x80, %ecx /* sets the flags for jae below; the stores and lea in between leave them intact */ |
| movntdq %xmm0, (%edx) |
| movntdq %xmm1, 0x10(%edx) |
| movntdq %xmm2, 0x20(%edx) |
| movntdq %xmm3, 0x30(%edx) |
| movntdq %xmm4, 0x40(%edx) |
| movntdq %xmm5, 0x50(%edx) |
| movntdq %xmm6, 0x60(%edx) |
| movntdq %xmm7, 0x70(%edx) |
| lea 0x80(%edx), %edx |
| jae L(large_page_loop) /* continue while the subtraction did not borrow */ |
| cmp $-0x40, %ecx /* ecx still carries the -0x80 bias, so -0x40 corresponds to 64 bytes left */ |
| lea 0x80(%ecx), %ecx /* remove the bias; lea leaves the cmp flags intact for jl */ |
| jl L(large_page_less_64bytes) |
| |
| movdqu (%eax), %xmm0 /* at least 64 bytes left: copy one 64-byte block */ |
| movdqu 0x10(%eax), %xmm1 |
| movdqu 0x20(%eax), %xmm2 |
| movdqu 0x30(%eax), %xmm3 |
| lea 0x40(%eax), %eax |
| |
| movntdq %xmm0, (%edx) |
| movntdq %xmm1, 0x10(%edx) |
| movntdq %xmm2, 0x20(%edx) |
| movntdq %xmm3, 0x30(%edx) |
| lea 0x40(%edx), %edx |
| sub $0x40, %ecx |
| L(large_page_less_64bytes): |
| cmp $32, %ecx |
| jb L(large_page_less_32bytes) |
| movdqu (%eax), %xmm0 /* at least 32 bytes left: copy one 32-byte block */ |
| movdqu 0x10(%eax), %xmm1 |
| lea 0x20(%eax), %eax |
| movntdq %xmm0, (%edx) |
| movntdq %xmm1, 0x10(%edx) |
| lea 0x20(%edx), %edx |
| sub $0x20, %ecx |
| L(large_page_less_32bytes): |
| add %ecx, %edx /* advance both pointers past the tail; the tail handlers index backwards */ |
| add %ecx, %eax |
| sfence /* store fence after the non-temporal stores, before the ordinary tail stores */ |
| BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* dispatch on the tail length in ecx (macro and table are defined outside this chunk) */ |
| |
| .p2align 4 |
| L(bk_write_44bytes): /* backward tail copy: offsets are positive from eax (src) and edx (dst); highest 8 bytes first, falling through to the shorter entries */ |
| movq 36(%eax), %xmm0 |
| movq %xmm0, 36(%edx) |
| L(bk_write_36bytes): |
| movq 28(%eax), %xmm0 |
| movq %xmm0, 28(%edx) |
| L(bk_write_28bytes): |
| movq 20(%eax), %xmm0 |
| movq %xmm0, 20(%edx) |
| L(bk_write_20bytes): |
| movq 12(%eax), %xmm0 |
| movq %xmm0, 12(%edx) |
| L(bk_write_12bytes): |
| movq 4(%eax), %xmm0 |
| movq %xmm0, 4(%edx) |
| L(bk_write_4bytes): |
| movl (%eax), %ecx /* the lowest 4 bytes go through ecx */ |
| movl %ecx, (%edx) |
| L(bk_write_0bytes): /* nothing left to copy: only set the return value */ |
| movl DEST(%esp), %eax /* return the DEST stack slot (presumably the dst argument) */ |
| #ifdef USE_AS_MEMPCPY |
| movl LEN(%esp), %ecx |
| add %ecx, %eax /* mempcpy build: return DEST + LEN */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(bk_write_40bytes): /* backward tail copy for multiples of 8: positive offsets from eax/edx, highest 8 bytes first, falling through */ |
| movq 32(%eax), %xmm0 |
| movq %xmm0, 32(%edx) |
| L(bk_write_32bytes): |
| movq 24(%eax), %xmm0 |
| movq %xmm0, 24(%edx) |
| L(bk_write_24bytes): |
| movq 16(%eax), %xmm0 |
| movq %xmm0, 16(%edx) |
| L(bk_write_16bytes): |
| movq 8(%eax), %xmm0 |
| movq %xmm0, 8(%edx) |
| L(bk_write_8bytes): |
| movq (%eax), %xmm0 |
| movq %xmm0, (%edx) |
| movl DEST(%esp), %eax /* return the DEST stack slot (presumably the dst argument) */ |
| #ifdef USE_AS_MEMPCPY |
| movl LEN(%esp), %ecx |
| add %ecx, %eax /* mempcpy build: return DEST + LEN */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(bk_write_45bytes): /* backward tail copy for lengths 5 + 8k: positive offsets from eax/edx, highest 8 bytes first, falling through */ |
| movq 37(%eax), %xmm0 |
| movq %xmm0, 37(%edx) |
| L(bk_write_37bytes): |
| movq 29(%eax), %xmm0 |
| movq %xmm0, 29(%edx) |
| L(bk_write_29bytes): |
| movq 21(%eax), %xmm0 |
| movq %xmm0, 21(%edx) |
| L(bk_write_21bytes): |
| movq 13(%eax), %xmm0 |
| movq %xmm0, 13(%edx) |
| L(bk_write_13bytes): |
| movq 5(%eax), %xmm0 |
| movq %xmm0, 5(%edx) |
| L(bk_write_5bytes): |
| movl 1(%eax), %ecx /* bytes 1..4 */ |
| movl %ecx, 1(%edx) |
| L(bk_write_1bytes): |
| movzbl (%eax), %ecx /* the lowest byte goes through cl */ |
| movb %cl, (%edx) |
| movl DEST(%esp), %eax /* return the DEST stack slot (presumably the dst argument) */ |
| #ifdef USE_AS_MEMPCPY |
| movl LEN(%esp), %ecx |
| add %ecx, %eax /* mempcpy build: return DEST + LEN */ |
| #endif |
| RETURN |
| |
| .p2align 4 |
| L(bk_write_41bytes): /* backward tail copy for lengths 1 + 8k (9 and up): positive offsets from eax/edx, highest 8 bytes first, falling through */ |
| movq 33(%eax), %xmm0 |
| movq %xmm0, 33(%edx) |
| L(bk_write_33bytes): |
| movq 25(%eax), %xmm0 |
| movq %xmm0, 25(%edx) |
| L(bk_write_25bytes): |
| movq 17(%eax), %xmm0 |
| movq %xmm0, 17(%edx) |
| L(bk_write_17bytes): |
| movq 9(%eax), %xmm0 |
| movq %xmm0, 9(%edx) |
| L(bk_write_9bytes): /* final 9 bytes as 8 + 1 */ |
| movq 1(%eax), %xmm0 |
| movq %xmm0, 1(%edx) |
| movzbl (%eax), %ecx |
| movb %cl, (%edx) |
| movl DEST(%esp), %eax /* return the DEST stack slot (presumably the dst argument) */ |
| #ifdef USE_AS_MEMPCPY |
| movl LEN(%esp), %ecx |
| add %ecx, %eax /* mempcpy build: return DEST + LEN */ |
| #endif |
| RETURN |
| |
| /* Backward tail copies for 46, 38, 30, 22, 14 and 6 bytes (targets |
| of L(table_48_bytes_bwd)). EAX = source, EDX = destination. |
| 8-byte chunks are moved from the highest offset downwards, each |
| label falling through to the one for 8 bytes fewer; the last 6 |
| bytes are a 4-byte move at offset 2 followed by a 2-byte move at |
| offset 0. */ |
| .p2align 4 |
| L(bk_write_46bytes): |
| movq 38(%eax), %xmm0 |
| movq %xmm0, 38(%edx) |
| L(bk_write_38bytes): |
| movq 30(%eax), %xmm0 |
| movq %xmm0, 30(%edx) |
| L(bk_write_30bytes): |
| movq 22(%eax), %xmm0 |
| movq %xmm0, 22(%edx) |
| L(bk_write_22bytes): |
| movq 14(%eax), %xmm0 |
| movq %xmm0, 14(%edx) |
| L(bk_write_14bytes): |
| movq 6(%eax), %xmm0 |
| movq %xmm0, 6(%edx) |
| L(bk_write_6bytes): |
| /* Bytes 2..5, then bytes 0..1; ECX is only a scratch register. */ |
| movl 2(%eax), %ecx |
| movl %ecx, 2(%edx) |
| movzwl (%eax), %ecx |
| movw %cx, (%edx) |
| /* Return value: EAX is reloaded from the DEST stack slot; a |
| USE_AS_MEMPCPY build also adds the LEN stack slot. */ |
| movl DEST(%esp), %eax |
| #ifdef USE_AS_MEMPCPY |
| movl LEN(%esp), %ecx |
| add %ecx, %eax |
| #endif |
| RETURN |
| |
| /* Backward tail copies for 42, 34, 26, 18, 10 and 2 bytes (targets |
| of L(table_48_bytes_bwd)). EAX = source, EDX = destination. |
| 8-byte chunks at offsets 34, 26, 18, 10 and 2 are moved highest |
| first, each label falling through to the one for 8 bytes fewer; |
| the 2 bytes at offset 0 are copied last. */ |
| .p2align 4 |
| L(bk_write_42bytes): |
| movq 34(%eax), %xmm0 |
| movq %xmm0, 34(%edx) |
| L(bk_write_34bytes): |
| movq 26(%eax), %xmm0 |
| movq %xmm0, 26(%edx) |
| L(bk_write_26bytes): |
| movq 18(%eax), %xmm0 |
| movq %xmm0, 18(%edx) |
| L(bk_write_18bytes): |
| movq 10(%eax), %xmm0 |
| movq %xmm0, 10(%edx) |
| L(bk_write_10bytes): |
| movq 2(%eax), %xmm0 |
| movq %xmm0, 2(%edx) |
| L(bk_write_2bytes): |
| /* Bytes 0..1, loaded zero-extended and stored through CX. */ |
| movzwl (%eax), %ecx |
| movw %cx, (%edx) |
| /* Return value: EAX is reloaded from the DEST stack slot; a |
| USE_AS_MEMPCPY build also adds the LEN stack slot. */ |
| movl DEST(%esp), %eax |
| #ifdef USE_AS_MEMPCPY |
| movl LEN(%esp), %ecx |
| add %ecx, %eax |
| #endif |
| RETURN |
| |
| /* Backward tail copies for 47, 39, 31, 23, 15 and 7 bytes (targets |
| of L(table_48_bytes_bwd)). EAX = source, EDX = destination. |
| 8-byte chunks are moved from the highest offset downwards, each |
| label falling through to the one for 8 bytes fewer; the last 7 |
| bytes are split into 4 bytes at offset 3, 2 bytes at offset 1 and |
| 1 byte at offset 0. */ |
| .p2align 4 |
| L(bk_write_47bytes): |
| movq 39(%eax), %xmm0 |
| movq %xmm0, 39(%edx) |
| L(bk_write_39bytes): |
| movq 31(%eax), %xmm0 |
| movq %xmm0, 31(%edx) |
| L(bk_write_31bytes): |
| movq 23(%eax), %xmm0 |
| movq %xmm0, 23(%edx) |
| L(bk_write_23bytes): |
| movq 15(%eax), %xmm0 |
| movq %xmm0, 15(%edx) |
| L(bk_write_15bytes): |
| movq 7(%eax), %xmm0 |
| movq %xmm0, 7(%edx) |
| L(bk_write_7bytes): |
| movl 3(%eax), %ecx |
| movl %ecx, 3(%edx) |
| movzwl 1(%eax), %ecx |
| movw %cx, 1(%edx) |
| /* The final load overwrites the source pointer in EAX. Nothing |
| reads the source after this, and EAX is reloaded from DEST just |
| below. */ |
| movzbl (%eax), %eax |
| movb %al, (%edx) |
| /* Return value: EAX is reloaded from the DEST stack slot; a |
| USE_AS_MEMPCPY build also adds the LEN stack slot. */ |
| movl DEST(%esp), %eax |
| #ifdef USE_AS_MEMPCPY |
| movl LEN(%esp), %ecx |
| add %ecx, %eax |
| #endif |
| RETURN |
| |
| /* Backward tail copies for 43, 35, 27, 19, 11 and 3 bytes (targets |
| of L(table_48_bytes_bwd)). EAX = source, EDX = destination. |
| 8-byte chunks at offsets 35, 27, 19, 11 and 3 are moved highest |
| first, each label falling through to the one for 8 bytes fewer; |
| the last 3 bytes are 2 bytes at offset 1 and 1 byte at offset 0. */ |
| .p2align 4 |
| L(bk_write_43bytes): |
| movq 35(%eax), %xmm0 |
| movq %xmm0, 35(%edx) |
| L(bk_write_35bytes): |
| movq 27(%eax), %xmm0 |
| movq %xmm0, 27(%edx) |
| L(bk_write_27bytes): |
| movq 19(%eax), %xmm0 |
| movq %xmm0, 19(%edx) |
| L(bk_write_19bytes): |
| movq 11(%eax), %xmm0 |
| movq %xmm0, 11(%edx) |
| L(bk_write_11bytes): |
| movq 3(%eax), %xmm0 |
| movq %xmm0, 3(%edx) |
| L(bk_write_3bytes): |
| movzwl 1(%eax), %ecx |
| movw %cx, 1(%edx) |
| /* The final load overwrites the source pointer in EAX. Nothing |
| reads the source after this, and EAX is reloaded from DEST just |
| below. */ |
| movzbl (%eax), %eax |
| movb %al, (%edx) |
| /* Return value: EAX is reloaded from the DEST stack slot; a |
| USE_AS_MEMPCPY build also adds the LEN stack slot. */ |
| movl DEST(%esp), %eax |
| #ifdef USE_AS_MEMPCPY |
| movl LEN(%esp), %ecx |
| add %ecx, %eax |
| #endif |
| /* This is the last return before the jump tables and it uses |
| RETURN_END instead of RETURN. NOTE(review): both macros are |
| defined outside this section; presumably RETURN_END omits the |
| trailing CFI state that RETURN emits for following code - |
| confirm against their definitions. */ |
| RETURN_END |
| |
| |
| /* Jump tables. They are emitted into the .rodata.ssse3 section |
| (flags "a", type @progbits); the section is closed again by the |
| .popsection after the last table. |
| L(table_48bytes_fwd): entry N (N = 0..47) is the JMPTBL value for |
| L(fwd_write_Nbytes). Entries are 4 bytes wide (.int), matching |
| the scale of 4 that BRANCH_TO_JMPTBL_ENTRY is given together with |
| this table. NOTE(review): JMPTBL and the L(fwd_write_*) targets |
| are defined outside this section; JMPTBL presumably yields a |
| table-relative offset in PIC builds and an absolute address |
| otherwise - confirm against its definition. */ |
| .pushsection .rodata.ssse3,"a",@progbits |
| .p2align 2 |
| L(table_48bytes_fwd): |
| .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) |
| .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) |
| |
| /* L(table_48bytes_fwd_align): 48 four-byte entries; entry N is the |
| JMPTBL value for L(fwd_write_Nbytes_align), N = 0..47. |
| NOTE(review): the L(fwd_write_*_align) targets and the dispatch |
| that indexes this table are outside this section; presumably the |
| index is the remaining byte count, as for the other 48-entry |
| tables - confirm at the use site. */ |
| .p2align 2 |
| L(table_48bytes_fwd_align): |
| .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) |
| .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) |
| |
| /* L(shl_table): 16 four-byte entries; entry N is the JMPTBL value |
| for L(shl_N), N = 0..15. NOTE(review): the L(shl_N) routines and |
| the dispatch that indexes this table are outside this section; |
| presumably the index is the source offset modulo 16 once the |
| destination has been aligned - confirm at the use site. */ |
| .p2align 2 |
| L(shl_table): |
| .int JMPTBL (L(shl_0), L(shl_table)) |
| .int JMPTBL (L(shl_1), L(shl_table)) |
| .int JMPTBL (L(shl_2), L(shl_table)) |
| .int JMPTBL (L(shl_3), L(shl_table)) |
| .int JMPTBL (L(shl_4), L(shl_table)) |
| .int JMPTBL (L(shl_5), L(shl_table)) |
| .int JMPTBL (L(shl_6), L(shl_table)) |
| .int JMPTBL (L(shl_7), L(shl_table)) |
| .int JMPTBL (L(shl_8), L(shl_table)) |
| .int JMPTBL (L(shl_9), L(shl_table)) |
| .int JMPTBL (L(shl_10), L(shl_table)) |
| .int JMPTBL (L(shl_11), L(shl_table)) |
| .int JMPTBL (L(shl_12), L(shl_table)) |
| .int JMPTBL (L(shl_13), L(shl_table)) |
| .int JMPTBL (L(shl_14), L(shl_table)) |
| .int JMPTBL (L(shl_15), L(shl_table)) |
| |
| /* L(table_48_bytes_bwd): 48 four-byte entries; entry N is the JMPTBL |
| value for L(bk_write_Nbytes), N = 0..47. It is indexed with the |
| remaining byte count in ECX by the BRANCH_TO_JMPTBL_ENTRY in the |
| backward-copy code. Note the spelling: this table is named |
| table_48_bytes_bwd, while the forward tables are named |
| table_48bytes_fwd*. The .popsection after the last entry closes |
| the jump-table section. */ |
| .p2align 2 |
| L(table_48_bytes_bwd): |
| .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) |
| .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) |
| |
| .popsection |
| |
| #ifdef USE_AS_MEMMOVE |
| /* Backward (high address to low address) copy; assembled only when |
| USE_AS_MEMMOVE is defined. |
| Register use below: EAX = source, EDX = destination, ECX = byte |
| count on entry (NOTE(review): inferred from how the registers are |
| used; the jump that reaches this label is outside this section - |
| confirm). EDI is saved and becomes the running source pointer. |
| EDI and EDX are first moved to one past the last byte and then |
| walk downwards while ECX counts the bytes still to copy. */ |
| .p2align 4 |
| L(copy_backward): |
| PUSH (%edi) |
| movl %eax, %edi |
| /* EDX = destination end, EDI = source end. */ |
| lea (%ecx,%edx,1),%edx |
| lea (%ecx,%edi,1),%edi |
| /* Align the destination end to 4 bytes first if necessary. */ |
| testl $0x3, %edx |
| jnz L(bk_align) |
| |
| L(bk_aligned_4): |
| cmp $64, %ecx |
| jae L(bk_write_more64bytes) |
| |
| L(bk_write_64bytesless): |
| cmp $32, %ecx |
| jb L(bk_write_less32bytes) |
| |
| L(bk_write_more32bytes): |
| /* Copy one 32-byte chunk as four 8-byte moves, highest first. |
| This is not a loop: ECX is in [32, 64) on every path into this |
| label, so fewer than 32 bytes remain afterwards. */ |
| sub $32, %ecx |
| movq -8(%edi), %xmm0 |
| movq %xmm0, -8(%edx) |
| movq -16(%edi), %xmm0 |
| movq %xmm0, -16(%edx) |
| movq -24(%edi), %xmm0 |
| movq %xmm0, -24(%edx) |
| movq -32(%edi), %xmm0 |
| movq %xmm0, -32(%edx) |
| sub $32, %edx |
| sub $32, %edi |
| |
| L(bk_write_less32bytes): |
| /* Fewer than 32 bytes remain. Turn the end pointers into |
| pointers to the start of the remainder (EAX = source, EDX = |
| destination), restore EDI and dispatch on ECX to the matching |
| L(bk_write_Nbytes) tail. */ |
| movl %edi, %eax |
| sub %ecx, %edx |
| sub %ecx, %eax |
| POP (%edi) |
| /* L(bk_write_less32bytes_2) is not referenced in this section. |
| NOTE(review): presumably an entry for code elsewhere that already |
| has EAX, EDX and ECX set up and has not pushed EDI - confirm. */ |
| L(bk_write_less32bytes_2): |
| BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) |
| |
| /* The code below is reached only by jumps and runs with EDI still |
| pushed. NOTE(review): CFI_PUSH is defined outside this section; |
| presumably it emits unwind directives only, re-declaring EDI as |
| saved after the POP above - confirm. */ |
| CFI_PUSH (%edi) |
| |
| /* Destination end (EDX) is not 4-byte aligned. With 8 bytes or |
| fewer left, skip the alignment and go straight to the tail |
| dispatch; otherwise copy 1 and/or 2 bytes until EDX is 4-byte |
| aligned. */ |
| .p2align 4 |
| L(bk_align): |
| cmp $8, %ecx |
| jbe L(bk_write_less32bytes) |
| testl $1, %edx |
| /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0 |
| then (EDX & 2) must be != 0. */ |
| jz L(bk_got2) |
| /* EDX is odd: copy a single byte. */ |
| sub $1, %edi |
| sub $1, %ecx |
| sub $1, %edx |
| movzbl (%edi), %eax |
| movb %al, (%edx) |
| |
| /* EDX is now even; if bit 1 is clear it is already 4-byte aligned. */ |
| testl $2, %edx |
| jz L(bk_aligned_4) |
| |
| L(bk_got2): |
| /* EDX is 2 modulo 4: copy two bytes to reach 4-byte alignment. */ |
| sub $2, %edi |
| sub $2, %ecx |
| sub $2, %edx |
| movzwl (%edi), %eax |
| movw %ax, (%edx) |
| jmp L(bk_aligned_4) |
| |
| .p2align 4 |
| L(bk_write_more64bytes): |
| /* At least 64 bytes remain. Check whether the destination end is |
| already 16-byte aligned. */ |
| testl $15, %edx |
| jz L(bk_ssse3_cpy_pre) |
| |
| /* EDX is 4-byte aligned but not 16-byte aligned: copy up to three |
| 4-byte words, re-testing after each of the first two. */ |
| L(bk_ssse3_align): |
| sub $4, %edi |
| sub $4, %ecx |
| sub $4, %edx |
| movl (%edi), %eax |
| movl %eax, (%edx) |
| |
| testl $15, %edx |
| jz L(bk_ssse3_cpy_pre) |
| |
| sub $4, %edi |
| sub $4, %ecx |
| sub $4, %edx |
| movl (%edi), %eax |
| movl %eax, (%edx) |
| |
| testl $15, %edx |
| jz L(bk_ssse3_cpy_pre) |
| |
| /* Third word: no test is needed afterwards, a 4-byte aligned |
| pointer is 16-byte aligned after at most three steps of 4. */ |
| sub $4, %edi |
| sub $4, %ecx |
| sub $4, %edx |
| movl (%edi), %eax |
| movl %eax, (%edx) |
| |
| L(bk_ssse3_cpy_pre): |
| /* The alignment above may have consumed up to 12 bytes; if fewer |
| than 64 remain, finish with the 32-byte step and the tail. */ |
| cmp $64, %ecx |
| jb L(bk_write_more32bytes) |
| |
| /* Main loop: 64 bytes per iteration, highest 16 bytes first, using |
| unaligned loads (movdqu) from the source and aligned stores |
| (movdqa) to the 16-byte aligned destination. */ |
| .p2align 4 |
| L(bk_ssse3_cpy): |
| sub $64, %edi |
| sub $64, %ecx |
| sub $64, %edx |
| movdqu 0x30(%edi), %xmm3 |
| movdqa %xmm3, 0x30(%edx) |
| movdqu 0x20(%edi), %xmm2 |
| movdqa %xmm2, 0x20(%edx) |
| movdqu 0x10(%edi), %xmm1 |
| movdqa %xmm1, 0x10(%edx) |
| movdqu (%edi), %xmm0 |
| movdqa %xmm0, (%edx) |
| cmp $64, %ecx |
| jae L(bk_ssse3_cpy) |
| jmp L(bk_write_64bytesless) |
| |
| #endif |
| |
| END (MEMCPY) |