/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
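
/*
 * wcscpy_ssse3: copy the null-terminated wide-character string at
 * STR2 to the buffer at STR1 and return STR1.  wchar_t is 4 bytes
 * here, so the terminator is a zero dword, located with
 * pcmpeqd/pmovmskb.  Data moves in 16-byte SSE blocks; when source
 * and destination cannot share 16-byte alignment, the Shl4/Shl8/
 * Shl12 paths realign loads with SSSE3 palignr.
 *
 * When USE_AS_WCSCAT is defined, the macro scaffolding and the
 * function prologue below are expected to be provided by the file
 * that includes this one.
 */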
#ifndef USE_AS_WCSCAT

# ifndef L
# define L(label) .L##label
# endif

# ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
# endif

# ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
# endif

# ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
# endif

# ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
# endif

# ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
# endif

# ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
# endif

# ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
# endif
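
/* CFI_PUSH/CFI_POP pair with pushl/popl to keep the call-frame
   information (used by unwinders and debuggers) in step with the
   actual stack pointer.  */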
# define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (4); \
        cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-4); \
        cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

# define PARMS 4
# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
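
/* Argument offsets from %esp at entry: STR1 is the destination and
   STR2 the source.  LEN is defined for variants assembled from this
   source; it is not referenced by the plain wcscpy path below.  */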
# define STR1 PARMS
# define STR2 STR1+4
# define LEN STR2+4

        .text
ENTRY (wcscpy_ssse3)
        mov STR1(%esp), %edx
        mov STR2(%esp), %ecx
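
/* If the terminating null falls within the first four wide chars,
   copy 4, 8, 12 or 16 bytes (null included) and return without
   saving any callee-saved registers.  */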
        cmpl $0, (%ecx)
        jz L(ExitTail4)
        cmpl $0, 4(%ecx)
        jz L(ExitTail8)
        cmpl $0, 8(%ecx)
        jz L(ExitTail12)
        cmpl $0, 12(%ecx)
        jz L(ExitTail16)

        PUSH (%edi)
        mov %edx, %edi
#endif /* !USE_AS_WCSCAT */
        PUSH (%esi)
        lea 16(%ecx), %esi

        and $-16, %esi
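
/* The first 16 bytes are known non-null: copy them with unaligned
   loads/stores and probe the first 16-byte-aligned source block for
   a null wide char.  %xmm0 starts all-zero, and on the no-null path
   the pcmpeqd result is also all-zero, so %xmm0 keeps doubling as
   the zero register for later compares.  %esi ends up holding the
   distance from the source to its next 16-byte boundary.  */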
        pxor %xmm0, %xmm0
        pcmpeqd (%esi), %xmm0
        movdqu (%ecx), %xmm1
        movdqu %xmm1, (%edx)

        pmovmskb %xmm0, %eax
        sub %ecx, %esi

        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)
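
/* No null yet: round the destination up to a 16-byte boundary and
   advance the source by the same amount.  Dispatch on (source & 15):
   0 means both pointers are now aligned; 4, 8 or 12 selects a
   palignr-based loop for that misalignment.  */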
        mov %edx, %eax
        lea 16(%edx), %edx
        and $-16, %edx
        sub %edx, %eax

        sub %eax, %ecx
        mov %ecx, %eax
        and $0xf, %eax
        mov $0, %esi

        jz L(Align16Both)
        cmp $4, %eax
        je L(Shl4)
        cmp $8, %eax
        je L(Shl8)
        jmp L(Shl12)
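
/* Both pointers 16-byte aligned: copy one block at a time, always
   probing the next block for a null before it is stored, then drop
   into the 64-byte loop once the source reaches a 64-byte boundary.
   %esi tracks the offset of the most recently probed block, so the
   tail code can index straight to the null.  */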
L(Align16Both):
        movaps (%ecx), %xmm1
        movaps 16(%ecx), %xmm2
        movaps %xmm1, (%edx)
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %eax
        lea 16(%esi), %esi

        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%ecx, %esi), %xmm3
        movaps %xmm2, (%edx, %esi)
        pcmpeqd %xmm3, %xmm0
        pmovmskb %xmm0, %eax
        lea 16(%esi), %esi

        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%ecx, %esi), %xmm4
        movaps %xmm3, (%edx, %esi)
        pcmpeqd %xmm4, %xmm0
        pmovmskb %xmm0, %eax
        lea 16(%esi), %esi

        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%ecx, %esi), %xmm1
        movaps %xmm4, (%edx, %esi)
        pcmpeqd %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        lea 16(%esi), %esi

        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%ecx, %esi), %xmm2
        movaps %xmm1, (%edx, %esi)
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %eax
        lea 16(%esi), %esi

        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%ecx, %esi), %xmm3
        movaps %xmm2, (%edx, %esi)
        pcmpeqd %xmm3, %xmm0
        pmovmskb %xmm0, %eax
        lea 16(%esi), %esi

        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        movaps %xmm3, (%edx, %esi)
        mov %ecx, %eax
        lea 16(%ecx, %esi), %ecx
        and $-0x40, %ecx
        sub %ecx, %eax
        sub %eax, %edx

        mov $-0x40, %esi
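
/* Main loop: read 64 source bytes per iteration.  pminub folds the
   four 16-byte blocks into one vector, so a null wide char anywhere
   forces the corresponding dword of the minimum to zero; pcmpeqd
   against the zero register %xmm0 then exposes it.  Stores are
   issued only once the whole 64 bytes are known null-free.  %esi
   stays at -0x40 so the exit path can step back to the start of the
   block.  */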
L(Aligned64Loop):
        movaps (%ecx), %xmm2
        movaps 32(%ecx), %xmm3
        movaps %xmm2, %xmm4
        movaps 16(%ecx), %xmm5
        movaps %xmm3, %xmm6
        movaps 48(%ecx), %xmm7
        pminub %xmm5, %xmm2
        pminub %xmm7, %xmm3
        pminub %xmm2, %xmm3
        lea 64(%edx), %edx
        pcmpeqd %xmm0, %xmm3
        lea 64(%ecx), %ecx
        pmovmskb %xmm3, %eax

        test %eax, %eax
        jnz L(Aligned64Leave)
        movaps %xmm4, -64(%edx)
        movaps %xmm5, -48(%edx)
        movaps %xmm6, -32(%edx)
        movaps %xmm7, -16(%edx)
        jmp L(Aligned64Loop)
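
/* A null lies somewhere in the 64 bytes just read.  Re-test the four
   blocks in order, storing each null-free block, and hand the block
   containing the null to the common tail copy.  */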
L(Aligned64Leave):
        pcmpeqd %xmm4, %xmm0
        pmovmskb %xmm0, %eax
        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        pcmpeqd %xmm5, %xmm0
        pmovmskb %xmm0, %eax
        movaps %xmm4, -64(%edx)
        lea 16(%esi), %esi
        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        pcmpeqd %xmm6, %xmm0
        pmovmskb %xmm0, %eax
        movaps %xmm5, -48(%edx)
        lea 16(%esi), %esi
        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        movaps %xmm6, -32(%edx)
        pcmpeqd %xmm7, %xmm0
        pmovmskb %xmm0, %eax
        lea 16(%esi), %esi
        test %eax, %eax
        jnz L(CopyFrom1To16Bytes)

        mov $-0x40, %esi
        movaps %xmm7, -16(%edx)
        jmp L(Aligned64Loop)
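
/* Source is 4 bytes past 16-byte alignment: load aligned blocks and
   recombine each adjacent pair with palignr $4 before the aligned
   store.  The unrolled preamble runs until the source reaches a
   64-byte boundary; Shl8 and Shl12 below repeat the same pattern for
   misalignments of 8 and 12 bytes.  */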
        .p2align 4
L(Shl4):
        movaps -4(%ecx), %xmm1
        movaps 12(%ecx), %xmm2
L(Shl4Start):
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %eax
        movaps %xmm2, %xmm3

        test %eax, %eax
        jnz L(Shl4LoopExit)

        palignr $4, %xmm1, %xmm2
        movaps %xmm2, (%edx)
        movaps 28(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx
        movaps %xmm2, %xmm1

        test %eax, %eax
        jnz L(Shl4LoopExit)

        palignr $4, %xmm3, %xmm2
        movaps %xmm2, (%edx)
        movaps 28(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx
        movaps %xmm2, %xmm3

        test %eax, %eax
        jnz L(Shl4LoopExit)

        palignr $4, %xmm1, %xmm2
        movaps %xmm2, (%edx)
        movaps 28(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx

        test %eax, %eax
        jnz L(Shl4LoopExit)

        palignr $4, %xmm3, %xmm2
        movaps %xmm2, (%edx)
        lea 28(%ecx), %ecx
        lea 16(%edx), %edx

        mov %ecx, %eax
        and $-0x40, %ecx
        sub %ecx, %eax
        lea -12(%ecx), %ecx
        sub %eax, %edx

        movaps -4(%ecx), %xmm1
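
/* Steady state for the 4-byte case: check 64 source bytes for a null
   first, then shift and store all four blocks.  On a hit, restart at
   Shl4Start to locate the exact block.  */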
L(Shl4LoopStart):
        movaps 12(%ecx), %xmm2
        movaps 28(%ecx), %xmm3
        movaps %xmm3, %xmm6
        movaps 44(%ecx), %xmm4
        movaps %xmm4, %xmm7
        movaps 60(%ecx), %xmm5
        pminub %xmm2, %xmm6
        pminub %xmm5, %xmm7
        pminub %xmm6, %xmm7
        pcmpeqd %xmm0, %xmm7
        pmovmskb %xmm7, %eax
        movaps %xmm5, %xmm7
        palignr $4, %xmm4, %xmm5
        palignr $4, %xmm3, %xmm4
        test %eax, %eax
        jnz L(Shl4Start)

        palignr $4, %xmm2, %xmm3
        lea 64(%ecx), %ecx
        palignr $4, %xmm1, %xmm2
        movaps %xmm7, %xmm1
        movaps %xmm5, 48(%edx)
        movaps %xmm4, 32(%edx)
        movaps %xmm3, 16(%edx)
        movaps %xmm2, (%edx)
        lea 64(%edx), %edx
        jmp L(Shl4LoopStart)
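
/* A null sits in the next 16 source bytes.  For this case the 12
   bytes up to the aligned source boundary are known good: copy them,
   then finish with the shared 4/8/12/16-byte tail logic.  */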
L(Shl4LoopExit):
        movlpd (%ecx), %xmm0
        movl 8(%ecx), %esi
        movlpd %xmm0, (%edx)
        movl %esi, 8(%edx)
        POP (%esi)
        add $12, %edx
        add $12, %ecx
        test %al, %al
        jz L(ExitHigh)
        test $0x01, %al
        jnz L(Exit4)
        movlpd (%ecx), %xmm0
        movlpd %xmm0, (%edx)
        movl %edi, %eax
        RETURN

        CFI_PUSH (%esi)
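
/* Source 8 bytes past alignment: same structure as Shl4, with
   palignr $8 and an 8-byte prefix copy on exit.  */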
        .p2align 4
L(Shl8):
        movaps -8(%ecx), %xmm1
        movaps 8(%ecx), %xmm2
L(Shl8Start):
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %eax
        movaps %xmm2, %xmm3

        test %eax, %eax
        jnz L(Shl8LoopExit)

        palignr $8, %xmm1, %xmm2
        movaps %xmm2, (%edx)
        movaps 24(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx
        movaps %xmm2, %xmm1

        test %eax, %eax
        jnz L(Shl8LoopExit)

        palignr $8, %xmm3, %xmm2
        movaps %xmm2, (%edx)
        movaps 24(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx
        movaps %xmm2, %xmm3

        test %eax, %eax
        jnz L(Shl8LoopExit)

        palignr $8, %xmm1, %xmm2
        movaps %xmm2, (%edx)
        movaps 24(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx

        test %eax, %eax
        jnz L(Shl8LoopExit)

        palignr $8, %xmm3, %xmm2
        movaps %xmm2, (%edx)
        lea 24(%ecx), %ecx
        lea 16(%edx), %edx

        mov %ecx, %eax
        and $-0x40, %ecx
        sub %ecx, %eax
        lea -8(%ecx), %ecx
        sub %eax, %edx

        movaps -8(%ecx), %xmm1

L(Shl8LoopStart):
        movaps 8(%ecx), %xmm2
        movaps 24(%ecx), %xmm3
        movaps %xmm3, %xmm6
        movaps 40(%ecx), %xmm4
        movaps %xmm4, %xmm7
        movaps 56(%ecx), %xmm5
        pminub %xmm2, %xmm6
        pminub %xmm5, %xmm7
        pminub %xmm6, %xmm7
        pcmpeqd %xmm0, %xmm7
        pmovmskb %xmm7, %eax
        movaps %xmm5, %xmm7
        palignr $8, %xmm4, %xmm5
        palignr $8, %xmm3, %xmm4
        test %eax, %eax
        jnz L(Shl8Start)

        palignr $8, %xmm2, %xmm3
        lea 64(%ecx), %ecx
        palignr $8, %xmm1, %xmm2
        movaps %xmm7, %xmm1
        movaps %xmm5, 48(%edx)
        movaps %xmm4, 32(%edx)
        movaps %xmm3, 16(%edx)
        movaps %xmm2, (%edx)
        lea 64(%edx), %edx
        jmp L(Shl8LoopStart)

L(Shl8LoopExit):
        movlpd (%ecx), %xmm0
        movlpd %xmm0, (%edx)
        POP (%esi)
        add $8, %edx
        add $8, %ecx
        test %al, %al
        jz L(ExitHigh)
        test $0x01, %al
        jnz L(Exit4)
        movlpd (%ecx), %xmm0
        movlpd %xmm0, (%edx)
        movl %edi, %eax
        RETURN

        CFI_PUSH (%esi)
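
/* Source 12 bytes past alignment: same structure again, with
   palignr $12 and a 4-byte prefix copy on exit.  */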
        .p2align 4
L(Shl12):
        movaps -12(%ecx), %xmm1
        movaps 4(%ecx), %xmm2
L(Shl12Start):
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %eax
        movaps %xmm2, %xmm3

        test %eax, %eax
        jnz L(Shl12LoopExit)

        palignr $12, %xmm1, %xmm2
        movaps %xmm2, (%edx)
        movaps 20(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx
        movaps %xmm2, %xmm1

        test %eax, %eax
        jnz L(Shl12LoopExit)

        palignr $12, %xmm3, %xmm2
        movaps %xmm2, (%edx)
        movaps 20(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx
        movaps %xmm2, %xmm3

        test %eax, %eax
        jnz L(Shl12LoopExit)

        palignr $12, %xmm1, %xmm2
        movaps %xmm2, (%edx)
        movaps 20(%ecx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%edx), %edx
        pmovmskb %xmm0, %eax
        lea 16(%ecx), %ecx

        test %eax, %eax
        jnz L(Shl12LoopExit)

        palignr $12, %xmm3, %xmm2
        movaps %xmm2, (%edx)
        lea 20(%ecx), %ecx
        lea 16(%edx), %edx

        mov %ecx, %eax
        and $-0x40, %ecx
        sub %ecx, %eax
        lea -4(%ecx), %ecx
        sub %eax, %edx

        movaps -12(%ecx), %xmm1

L(Shl12LoopStart):
        movaps 4(%ecx), %xmm2
        movaps 20(%ecx), %xmm3
        movaps %xmm3, %xmm6
        movaps 36(%ecx), %xmm4
        movaps %xmm4, %xmm7
        movaps 52(%ecx), %xmm5
        pminub %xmm2, %xmm6
        pminub %xmm5, %xmm7
        pminub %xmm6, %xmm7
        pcmpeqd %xmm0, %xmm7
        pmovmskb %xmm7, %eax
        movaps %xmm5, %xmm7
        palignr $12, %xmm4, %xmm5
        palignr $12, %xmm3, %xmm4
        test %eax, %eax
        jnz L(Shl12Start)

        palignr $12, %xmm2, %xmm3
        lea 64(%ecx), %ecx
        palignr $12, %xmm1, %xmm2
        movaps %xmm7, %xmm1
        movaps %xmm5, 48(%edx)
        movaps %xmm4, 32(%edx)
        movaps %xmm3, 16(%edx)
        movaps %xmm2, (%edx)
        lea 64(%edx), %edx
        jmp L(Shl12LoopStart)

L(Shl12LoopExit):
        movl (%ecx), %esi
        movl %esi, (%edx)
        mov $4, %esi
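
/* Shared tail: %esi is the offset of the 16-byte block holding the
   null and %eax its pmovmskb mask (bit k set when byte k is zero).
   %al covers the low 8 bytes of the block and %ah the high 8; bit 0
   of the non-zero half tells whether the null is the first or the
   second dword of that half.  The final copy of 4, 8, 12 or 16 bytes
   includes the terminating null, and %edi (the original destination)
   is returned.  */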
        .p2align 4
L(CopyFrom1To16Bytes):
        add %esi, %edx
        add %esi, %ecx

        POP (%esi)
        test %al, %al
        jz L(ExitHigh)
        test $0x01, %al
        jnz L(Exit4)
L(Exit8):
        movlpd (%ecx), %xmm0
        movlpd %xmm0, (%edx)
        movl %edi, %eax
        RETURN

        .p2align 4
L(ExitHigh):
        test $0x01, %ah
        jnz L(Exit12)
L(Exit16):
        movdqu (%ecx), %xmm0
        movdqu %xmm0, (%edx)
        movl %edi, %eax
        RETURN

        .p2align 4
L(Exit4):
        movl (%ecx), %eax
        movl %eax, (%edx)
        movl %edi, %eax
        RETURN

        .p2align 4
L(Exit12):
        movlpd (%ecx), %xmm0
        movlpd %xmm0, (%edx)
        movl 8(%ecx), %eax
        movl %eax, 8(%edx)
        movl %edi, %eax
        RETURN

        CFI_POP (%edi)
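
/* Tails for strings of at most four wide chars, reached before %edi
   and %esi were pushed: copy including the null and return %edx, the
   untouched destination pointer.  */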
        .p2align 4
L(ExitTail4):
        movl (%ecx), %eax
        movl %eax, (%edx)
        movl %edx, %eax
        ret

        .p2align 4
L(ExitTail8):
        movlpd (%ecx), %xmm0
        movlpd %xmm0, (%edx)
        movl %edx, %eax
        ret

        .p2align 4
L(ExitTail12):
        movlpd (%ecx), %xmm0
        movlpd %xmm0, (%edx)
        movl 8(%ecx), %eax
        movl %eax, 8(%edx)
        movl %edx, %eax
        ret

        .p2align 4
L(ExitTail16):
        movdqu (%ecx), %xmm0
        movdqu %xmm0, (%edx)
        movl %edx, %eax
        ret

#ifndef USE_AS_WCSCAT
END (wcscpy_ssse3)
#endif