Revert "Add 64-bit slm optimized strlcpy and srlcat."

This reverts commit 2e7145c048e15bf6646befd70aa08d1bfe7b6c26.

When src is near the end of a page, the SSE2-optimized strlcpy can issue
a movdqu instruction that crosses the page boundary.  If the next page
is not mapped into the process, this leads to a segmentation fault.
This is rare but has been caught multiple times during robustness
testing.

We isolated a way to reproduce the issue outside of an Android device
and were able to resolve this particular case.  However, we ran
additional compliance and robustness tests and found several other
similar page-crossing issues in this implementation.
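
For reference, a minimal sketch of the kind of reproducer described
above (assuming a bionic/BSD libc that exposes strlcpy in <string.h>;
the layout and names are illustrative, not the exact internal test):

    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void) {
        long page = sysconf(_SC_PAGESIZE);

        /* Reserve two adjacent pages, then drop the second one. */
        char *base = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED) return 1;
        munmap(base + page, page);

        /* "abc\0" occupies the last 4 bytes of the mapped page. */
        char *src = base + page - 4;
        memcpy(src, "abc", 4);

        char dst[16];
        /* A 16-byte unaligned (movdqu) load of src would cross into the
         * unmapped page and fault, even though the string itself fits
         * entirely in the mapped page. */
        return strlcpy(dst, src, sizeof(dst)) == 3 ? 0 : 1;
    }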

In conclusion, this optimization needs to be rewritten from scratch
because the problem lies in its design.  In the meantime, it is better
to remove it.

Bug: http://b/78355649
Change-Id: If90450de430ba9b7cd9282a422783beabd701f3d
Signed-off-by: Jeremy Compostella <jeremy.compostella@intel.com>

(cherry picked from commit 611ad621c670a94dadf51562b6256b24f1247981)
diff --git a/libc/Android.bp b/libc/Android.bp
index 7454d03..008c01a 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -628,8 +628,6 @@
                 "upstream-openbsd/lib/libc/string/stpncpy.c",
                 "upstream-openbsd/lib/libc/string/strcat.c",
                 "upstream-openbsd/lib/libc/string/strcpy.c",
-                "upstream-openbsd/lib/libc/string/strlcat.c",
-                "upstream-openbsd/lib/libc/string/strlcpy.c",
                 "upstream-openbsd/lib/libc/string/strncat.c",
                 "upstream-openbsd/lib/libc/string/strncmp.c",
                 "upstream-openbsd/lib/libc/string/strncpy.c",
@@ -1182,8 +1180,6 @@
                 "arch-x86_64/string/sse2-stpncpy-slm.S",
                 "arch-x86_64/string/sse2-strcat-slm.S",
                 "arch-x86_64/string/sse2-strcpy-slm.S",
-                "arch-x86_64/string/sse2-strlcat-slm.S",
-                "arch-x86_64/string/sse2-strlcpy-slm.S",
                 "arch-x86_64/string/sse2-strlen-slm.S",
                 "arch-x86_64/string/sse2-strncat-slm.S",
                 "arch-x86_64/string/sse2-strncpy-slm.S",
diff --git a/libc/arch-x86_64/string/sse2-strlcat-slm.S b/libc/arch-x86_64/string/sse2-strlcat-slm.S
deleted file mode 100644
index d79e8c1..0000000
--- a/libc/arch-x86_64/string/sse2-strlcat-slm.S
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
-Copyright (c) 2014, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define USE_AS_STRLCAT
-
-#ifndef STRLCPY
-# define STRLCPY	strlcat
-#endif
-
-#include "sse2-strlcpy-slm.S"
diff --git a/libc/arch-x86_64/string/sse2-strlcpy-slm.S b/libc/arch-x86_64/string/sse2-strlcpy-slm.S
deleted file mode 100755
index 9d4b52f..0000000
--- a/libc/arch-x86_64/string/sse2-strlcpy-slm.S
+++ /dev/null
@@ -1,1062 +0,0 @@
-/*
-Copyright (c) 2014, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef L
-# define L(label)	.L##label
-#endif
-
-#ifndef cfi_startproc
-# define cfi_startproc	.cfi_startproc
-#endif
-
-#ifndef cfi_endproc
-# define cfi_endproc	.cfi_endproc
-#endif
-
-#ifndef ENTRY
-# define ENTRY(name)	\
-	.type name, @function;	\
-	.globl name;	\
-	.p2align 4;	\
-name:	\
-	cfi_startproc
-#endif
-
-#ifndef END
-# define END(name)	\
-       cfi_endproc;	\
-       .size name, .-name
-#endif
-
-
-#ifndef STRLCPY
-# define STRLCPY	strlcpy
-#endif
-
-#define JMPTBL(I, B)	I - B
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
-	lea	TABLE(%rip), %r11;	\
-	movslq	(%r11, INDEX, SCALE), %rcx;	\
-	lea	(%r11, %rcx), %rcx;	\
-	jmp	*%rcx
-
-#define RETURN	\
-	add	%r9, %rax;	\
-	ret
-
-.text
-ENTRY (STRLCPY)
-	xor	%rax, %rax
-	xor	%r9, %r9
-	mov	%rdx, %r8
-	cmp	$0, %r8
-	jz	L(CalculateSrcLen)
-
-#ifdef USE_AS_STRLCAT
-	xor	%rcx, %rcx
-	pxor	%xmm0, %xmm0
-
-	movdqu	(%rdi), %xmm1
-	pcmpeqb %xmm1, %xmm0
-	pmovmskb %xmm0, %rdx
-
-	cmp	$17, %r8
-	jb	L(SizeEndCase1)
-	test	%rdx, %rdx
-	jnz	L(StringEndCase1)
-
-	add	$16, %rax
-	movdqu	16(%rdi), %xmm1
-	pcmpeqb %xmm1, %xmm0
-	pmovmskb %xmm0, %rdx
-
-	cmp	$33, %r8
-	jb	L(SizeEndCase1)
-	test	%rdx, %rdx
-	jnz	L(StringEndCase1)
-
-	mov	%rdi, %rcx
-	and	$15, %rcx
-	and	$-16, %rdi
-
-	add	%rcx, %r8
-	sub	$16, %r8
-
-L(DstLenLoop):
-	movdqa	(%rdi, %rax), %xmm1
-	pcmpeqb %xmm1, %xmm0
-	pmovmskb %xmm0, %rdx
-	sub	$16, %r8
-	jbe	L(SizeEndCase2)
-	test	%rdx, %rdx
-	jnz	L(StringEndCase2)
-	add	$16, %rax
-	jmp	L(DstLenLoop)
-
-L(StringEndCase2):
-	add	$16, %r8
-	bsf	%rdx, %rdx
-	sub	%rdx, %r8
-	add	%rdx, %rax
-	sub	%rcx, %r9
-	add	%rax, %rdi
-	jmp	 L(CopySrcString)
-
-L(SizeEndCase1):
-	test	%rdx, %rdx
-	jz	L(SizeEnd)
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	cmp	%r8, %rax
-	jb	L(StringEnd)
-L(SizeEnd):
-	mov	%r8, %r9
-	jmp	L(CalculateSrcLenCase1)
-
-L(SizeEndCase2):
-	add	$16, %r8
-	test	%rdx, %rdx
-	jz	L(StringEndCase4)
-	bsf	%rdx, %rdx
-	cmp	%r8, %rdx
-	jb	L(StringEndCase3)
-L(StringEndCase4):
-	add	%r8, %rax
-	sub	%rcx, %rax
-	mov	%rax, %r9
-	jmp	L(CalculateSrcLenCase1)
-
-L(StringEndCase3):
-	add	%rdx, %rax
-	sub	%rcx, %r9
-	add	%rax, %rdi
-	sub	%rdx, %r8
-	jmp	L(CopySrcString)
-
-L(StringEndCase1):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	sub	%rcx, %rax
-L(StringEnd):
-	add	%rax, %rdi
-	sub	%rax, %r8
-#endif
-
-	mov	%rsi, %rcx
-	and	$63, %rcx
-	cmp	$32, %rcx
-	jbe	L(CopySrcString)
-
-	and	$-16, %rsi
-	and	$15, %rcx
-	pxor	%xmm0, %xmm0
-	pxor	%xmm1, %xmm1
-
-	pcmpeqb	(%rsi), %xmm1
-	pmovmskb %xmm1, %rdx
-	shr	%cl, %rdx
-	mov	$16, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To16BytesTail)
-
-	pcmpeqb	16(%rsi), %xmm0
-	pmovmskb %xmm0, %rdx
-	add	$16, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyFrom1To32BytesCase2OrCase3)
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To32Bytes)
-
-	movdqu	(%rsi, %rcx), %xmm1
-	movdqu	%xmm1, (%rdi)
-#ifdef USE_AS_STRLCAT
-	add	%rax, %r9
-#endif
-	jmp	L(LoopStart)
-
-	.p2align 4
-L(CopySrcString):
-#ifdef USE_AS_STRLCAT
-	add	%rax, %r9
-	xor	%rax, %rax
-#endif
-	pxor	%xmm0, %xmm0
-	movdqu	(%rsi), %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %rdx
-
-	cmp	$17, %r8
-	jb	L(CopyFrom1To16BytesTail1Case2OrCase3)
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To16BytesTail1)
-
-	movdqu	16(%rsi), %xmm2
-	pcmpeqb	%xmm2, %xmm0
-	movdqu	%xmm1, (%rdi)
-	pmovmskb %xmm0, %rdx
-	add	$16, %rax
-
-	cmp	$33, %r8
-	jb	L(CopyFrom1To32Bytes1Case2OrCase3)
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To32Bytes1)
-
-	mov	%rsi, %rcx
-	and	$15, %rcx
-	and	$-16, %rsi
-
-L(LoopStart):
-	sub	%rcx, %rdi
-	add	%rcx, %r8
-	sub	$16, %r8
-	mov	$16, %rax
-
-L(16Loop):
-	movdqa	(%rsi, %rax), %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %rdx
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To16BytesXmmExit)
-	movdqu	%xmm1, (%rdi, %rax)
-	add	$16, %rax
-	jmp	L(16Loop)
-
-/*------End of main part with loops---------------------*/
-
-/* Case1 */
-	.p2align 4
-L(CopyFrom1To16Bytes):
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
-
-	.p2align 4
-L(CopyFrom1To16BytesTail):
-	add	%rcx, %rsi
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
-
-	.p2align 4
-L(CopyFrom1To32Bytes1):
-	add	$16, %rsi
-	add	$16, %rdi
-	sub	$16, %r8
-L(CopyFrom1To16BytesTail1):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
-
-	.p2align 4
-L(CopyFrom1To32Bytes):
-	bsf	%rdx, %rdx
-	add	%rcx, %rsi
-	add	$16, %rdx
-	sub	%rcx, %rdx
-	add	%rdx, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
-
-	.p2align 4
-L(CopyFrom1To16BytesExit):
-	add	%rdx, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
-
-/* Case2 */
-
-	.p2align 4
-L(CopyFrom1To16BytesCase2):
-	add	$16, %r8
-	add	%rax, %rdi
-	add	%rax, %rsi
-	bsf	%rdx, %rdx
-	sub	%rcx, %rax
-	cmp	%r8, %rdx
-	jb	L(CopyFrom1To16BytesExit)
-	add	%r8, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
-
-	.p2align 4
-L(CopyFrom1To32BytesCase2):
-	add	%rcx, %rsi
-	bsf	%rdx, %rdx
-	add	$16, %rdx
-	sub	%rcx, %rdx
-	cmp	%r8, %rdx
-	jb	L(CopyFrom1To16BytesExit)
-	add	%r8, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
-
-L(CopyFrom1To16BytesTailCase2):
-	add	%rcx, %rsi
-	bsf	%rdx, %rdx
-	cmp	%r8, %rdx
-	jb	L(CopyFrom1To16BytesExit)
-	add	%r8, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
-
-	.p2align 4
-L(CopyFrom1To16BytesTail1Case2):
-	bsf	%rdx, %rdx
-	cmp	%r8, %rdx
-	jb	L(CopyFrom1To16BytesExit)
-	add	%r8, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyFrom1To16BytesCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To16BytesCase2)
-	add	$16, %r8
-	add	%rax, %rdi
-	add	%rax, %rsi
-	add	%r8, %rax
-	sub	%rcx, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
-
-	.p2align 4
-L(CopyFrom1To32BytesCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To32BytesCase2)
-	add	%rcx, %rsi
-	add	%r8, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
-
-	.p2align 4
-L(CopyFrom1To16BytesTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To16BytesTailCase2)
-	add	%rcx, %rsi
-	add	%r8, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
-
-	.p2align 4
-L(CopyFrom1To32Bytes1Case2OrCase3):
-	add	$16, %rdi
-	add	$16, %rsi
-	sub	$16, %r8
-L(CopyFrom1To16BytesTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyFrom1To16BytesTail1Case2)
-	add	%r8, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
-
-	.p2align 4
-L(CopyFrom1To16BytesXmmExit):
-	bsf	%rdx, %rdx
-	add	%rax, %rdi
-	add	%rax, %rsi
-	add	%rdx, %rax
-	sub	%rcx, %rax
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
-
-/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
-
-
-	.p2align 4
-L(Exit0):
-	RETURN
-
-	.p2align 4
-L(Exit1):
-	movb	$0, (%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit2):
-	movb	(%rsi), %dh
-	movb	%dh, (%rdi)
-	movb	$0, 1(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit3):
-	movw	(%rsi), %dx
-	movw	%dx, (%rdi)
-	movb	$0, 2(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit4):
-	movw	(%rsi), %cx
-	movb	2(%rsi), %dh
-	movw	%cx, (%rdi)
-	movb	%dh, 2(%rdi)
-	movb	$0, 3(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit5):
-	movl	(%rsi), %edx
-	movl	%edx, (%rdi)
-	movb	$0, 4(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit6):
-	movl	(%rsi), %ecx
-	movb	4(%rsi), %dh
-	movl	%ecx, (%rdi)
-	movb	%dh, 4(%rdi)
-	movb	$0, 5(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit7):
-	movl	(%rsi), %ecx
-	movw	4(%rsi), %dx
-	movl	%ecx, (%rdi)
-	movw	%dx, 4(%rdi)
-	movb	$0, 6(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit8):
-	movl	(%rsi), %ecx
-	movl	3(%rsi), %edx
-	movl	%ecx, (%rdi)
-	movl	%edx, 3(%rdi)
-	movb	$0, 7(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit9):
-	movq	(%rsi), %rdx
-	movq	%rdx, (%rdi)
-	movb	$0, 8(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit10):
-	movq	(%rsi), %rcx
-	movb	8(%rsi), %dh
-	movq	%rcx, (%rdi)
-	movb	%dh, 8(%rdi)
-	movb	$0, 9(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit11):
-	movq	(%rsi), %rcx
-	movw	8(%rsi), %dx
-	movq	%rcx, (%rdi)
-	movw	%dx, 8(%rdi)
-	movb	$0, 10(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit12):
-	movq	(%rsi), %rcx
-	movl	7(%rsi), %edx
-	movq	%rcx, (%rdi)
-	movl	%edx, 7(%rdi)
-	movb	$0, 11(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit13):
-	movq	(%rsi), %rcx
-	movl	8(%rsi), %edx
-	movq	%rcx, (%rdi)
-	movl	%edx, 8(%rdi)
-	movb	$0, 12(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit14):
-	movq	(%rsi), %rcx
-	movq	5(%rsi), %rdx
-	movq	%rcx, (%rdi)
-	movq	%rdx, 5(%rdi)
-	movb	$0, 13(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit15):
-	movq	(%rsi), %rcx
-	movq	6(%rsi), %rdx
-	movq	%rcx, (%rdi)
-	movq	%rdx, 6(%rdi)
-	movb	$0, 14(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit16):
-	movq	(%rsi), %rcx
-	movq	7(%rsi), %rdx
-	movq	%rcx, (%rdi)
-	movq	%rdx, 7(%rdi)
-	movb	$0, 15(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit17):
-	movdqu	(%rsi), %xmm0
-	movdqu	%xmm0, (%rdi)
-	movb	$0, 16(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit18):
-	movdqu	(%rsi), %xmm0
-	movb	16(%rsi), %dh
-	movdqu	%xmm0, (%rdi)
-	movb	%dh, 16(%rdi)
-	movb	$0, 17(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit19):
-	movdqu	(%rsi), %xmm0
-	movw	16(%rsi), %cx
-	movdqu	%xmm0, (%rdi)
-	movw	%cx, 16(%rdi)
-	movb	$0, 18(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit20):
-	movdqu	(%rsi), %xmm0
-	movl	15(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
-	movl	%ecx, 15(%rdi)
-	movb	$0, 19(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit21):
-	movdqu	(%rsi), %xmm0
-	movl	16(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
-	movl	%ecx, 16(%rdi)
-	movb	$0, 20(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit22):
-	movdqu	(%rsi), %xmm0
-	movl	16(%rsi), %ecx
-	movb	20(%rsi), %dh
-	movdqu	%xmm0, (%rdi)
-	movl	%ecx, 16(%rdi)
-	movb	%dh, 20(%rdi)
-	movb	$0, 21(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit23):
-	movdqu	(%rsi), %xmm0
-	movq	14(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
-	movq	%rcx, 14(%rdi)
-	movb	$0, 22(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit24):
-	movdqu	(%rsi), %xmm0
-	movq	15(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
-	movq	%rcx, 15(%rdi)
-	movb	$0, 23(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit25):
-	movdqu	(%rsi), %xmm0
-	movq	16(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
-	movq	%rcx, 16(%rdi)
-	movb	$0, 24(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit26):
-	movdqu	(%rsi), %xmm0
-	movq	16(%rsi), %rcx
-	movb	24(%rsi), %dh
-	movdqu	%xmm0, (%rdi)
-	movq	%rcx, 16(%rdi)
-	mov	%dh, 24(%rdi)
-	movb	$0, 25(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit27):
-	movdqu	(%rsi), %xmm0
-	movq	16(%rsi), %rdx
-	movw	24(%rsi), %cx
-	movdqu	%xmm0, (%rdi)
-	movq	%rdx, 16(%rdi)
-	movw	%cx, 24(%rdi)
-	movb	$0, 26(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit28):
-	movdqu	(%rsi), %xmm0
-	movq	16(%rsi), %rdx
-	movl	23(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
-	movq	%rdx, 16(%rdi)
-	movl	%ecx, 23(%rdi)
-	movb	$0, 27(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit29):
-	movdqu	(%rsi), %xmm0
-	movq	16(%rsi), %rdx
-	movl	24(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
-	movq	%rdx, 16(%rdi)
-	movl	%ecx, 24(%rdi)
-	movb	$0, 28(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit30):
-	movdqu	(%rsi), %xmm0
-	movdqu	13(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 13(%rdi)
-	movb	$0, 29(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit31):
-	movdqu	(%rsi), %xmm0
-	movdqu	14(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 14(%rdi)
-	movb	$0, 30(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(Exit32):
-	movdqu	(%rsi), %xmm0
-	movdqu	15(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 15(%rdi)
-	movb	$0, 31(%rdi)
-	jmp	L(CalculateSrcLen)
-
-	.p2align 4
-L(StringTail0):
-	mov	(%rsi), %dl
-	mov	%dl, (%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail1):
-	mov	(%rsi), %dx
-	mov	%dx, (%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail2):
-	mov	(%rsi), %cx
-	mov	2(%rsi), %dl
-	mov	%cx, (%rdi)
-	mov	%dl, 2(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail4):
-	mov	(%rsi), %ecx
-	mov	4(%rsi), %dl
-	mov	%ecx, (%rdi)
-	mov	%dl, 4(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail5):
-	mov	(%rsi), %ecx
-	mov	4(%rsi), %dx
-	mov	%ecx, (%rdi)
-	mov	%dx, 4(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail6):
-	mov	(%rsi), %ecx
-	mov	3(%rsi), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, 3(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail7):
-	mov	(%rsi), %rdx
-	mov	%rdx, (%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail8):
-	mov	(%rsi), %rcx
-	mov	8(%rsi), %dl
-	mov	%rcx, (%rdi)
-	mov	%dl, 8(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail9):
-	mov	(%rsi), %rcx
-	mov	8(%rsi), %dx
-	mov	%rcx, (%rdi)
-	mov	%dx, 8(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail10):
-	mov	(%rsi), %rcx
-	mov	7(%rsi), %edx
-	mov	%rcx, (%rdi)
-	mov	%edx, 7(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail11):
-	mov	(%rsi), %rcx
-	mov	8(%rsi), %edx
-	mov	%rcx, (%rdi)
-	mov	%edx, 8(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail12):
-	mov	(%rsi), %rcx
-	mov	5(%rsi), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, 5(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail13):
-	mov	(%rsi), %rcx
-	mov	6(%rsi), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, 6(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail14):
-	mov	(%rsi), %rcx
-	mov	7(%rsi), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, 7(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail15):
-	movdqu	(%rsi), %xmm0
-	movdqu	%xmm0, (%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail16):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %cl
-	movdqu	%xmm0, (%rdi)
-	mov	%cl, 16(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail17):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %cx
-	movdqu	%xmm0, (%rdi)
-	mov	%cx, 16(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail18):
-	movdqu	(%rsi), %xmm0
-	mov	15(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
-	mov	%ecx, 15(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail19):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
-	mov	%ecx, 16(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail20):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %ecx
-	mov	20(%rsi), %dl
-	movdqu	%xmm0, (%rdi)
-	mov	%ecx, 16(%rdi)
-	mov	%dl, 20(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail21):
-	movdqu	(%rsi), %xmm0
-	mov	14(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
-	mov	%rcx, 14(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail22):
-	movdqu	(%rsi), %xmm0
-	mov	15(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
-	mov	%rcx, 15(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail23):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
-	mov	%rcx, 16(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail24):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	24(%rsi), %cl
-	movdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%cl, 24(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail25):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	24(%rsi), %cx
-	movdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%cx, 24(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail26):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	23(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%ecx, 23(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail27):
-	movdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	24(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%ecx, 24(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail28):
-	movdqu	(%rsi), %xmm0
-	movdqu	13(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 13(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail29):
-	movdqu	(%rsi), %xmm0
-	movdqu	14(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 14(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail30):
-	movdqu	(%rsi), %xmm0
-	movdqu	15(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 15(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail31):
-	movdqu	(%rsi), %xmm0
-	movdqu	16(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 16(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail32):
-	movdqu	(%rsi), %xmm0
-	movdqu	16(%rsi), %xmm2
-	mov	32(%rsi), %cl
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 16(%rdi)
-	mov	%cl, 32(%rdi)
-	RETURN
-
-	.p2align 4
-L(StringTail33):
-	movdqu	(%rsi), %xmm0
-	movdqu	16(%rsi), %xmm2
-	mov	32(%rsi), %cl
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 16(%rdi)
-	mov	%cl, 32(%rdi)
-	RETURN
-
-	.p2align 4
-L(CalculateSrcLenCase1):
-	xor	%r8, %r8
-	xor	%rax, %rax
-L(CalculateSrcLen):
-	pxor	%xmm0, %xmm0
-	xor	%rcx, %rcx
-	add	%r8, %rsi
-	movdqu	(%rsi), %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %rdx
-	test	%rdx, %rdx
-	jnz	L(SrcLenLoopEnd)
-
-	add	%rax, %r9
-	mov	$16, %rax
-	mov	%rsi, %rcx
-	and	$15, %rcx
-	and	$-16, %rsi
-L(SrcLenLoop):
-	movdqa	(%rsi, %rax), %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %rdx
-	test	%rdx, %rdx
-	jnz	L(SrcLenLoopEnd)
-	add	$16, %rax
-	jmp	L(SrcLenLoop)
-
-	.p2align 4
-L(SrcLenLoopEnd):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	sub	%rcx, %rax
-	RETURN
-
-END (STRLCPY)
-
-	.p2align 4
-	.section .rodata
-L(ExitTable):
-	.int	JMPTBL(L(Exit0), L(ExitTable))
-	.int	JMPTBL(L(Exit1), L(ExitTable))
-	.int	JMPTBL(L(Exit2), L(ExitTable))
-	.int	JMPTBL(L(Exit3), L(ExitTable))
-	.int	JMPTBL(L(Exit4), L(ExitTable))
-	.int	JMPTBL(L(Exit5), L(ExitTable))
-	.int	JMPTBL(L(Exit6), L(ExitTable))
-	.int	JMPTBL(L(Exit7), L(ExitTable))
-	.int	JMPTBL(L(Exit8), L(ExitTable))
-	.int	JMPTBL(L(Exit9), L(ExitTable))
-	.int	JMPTBL(L(Exit10), L(ExitTable))
-	.int	JMPTBL(L(Exit11), L(ExitTable))
-	.int	JMPTBL(L(Exit12), L(ExitTable))
-	.int	JMPTBL(L(Exit13), L(ExitTable))
-	.int	JMPTBL(L(Exit14), L(ExitTable))
-	.int	JMPTBL(L(Exit15), L(ExitTable))
-	.int	JMPTBL(L(Exit16), L(ExitTable))
-	.int	JMPTBL(L(Exit17), L(ExitTable))
-	.int	JMPTBL(L(Exit18), L(ExitTable))
-	.int	JMPTBL(L(Exit19), L(ExitTable))
-	.int	JMPTBL(L(Exit20), L(ExitTable))
-	.int	JMPTBL(L(Exit21), L(ExitTable))
-	.int	JMPTBL(L(Exit22), L(ExitTable))
-	.int	JMPTBL(L(Exit23), L(ExitTable))
-	.int	JMPTBL(L(Exit24), L(ExitTable))
-	.int	JMPTBL(L(Exit25), L(ExitTable))
-	.int	JMPTBL(L(Exit26), L(ExitTable))
-	.int	JMPTBL(L(Exit27), L(ExitTable))
-	.int	JMPTBL(L(Exit28), L(ExitTable))
-	.int	JMPTBL(L(Exit29), L(ExitTable))
-	.int	JMPTBL(L(Exit30), L(ExitTable))
-	.int	JMPTBL(L(Exit31), L(ExitTable))
-	.int	JMPTBL(L(Exit32), L(ExitTable))
-L(ExitStringTailTable):
-	.int	JMPTBL(L(StringTail0), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail1), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail2), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail3), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail4), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail5), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail6), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail7), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail8), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail9), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail10), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail11), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail12), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail13), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail14), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail15), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail16), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail17), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail18), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail19), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail20), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail21), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail22), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail23), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail24), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail25), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail26), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail27), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail28), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail29), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail30), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail31), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail32), L(ExitStringTailTable))
-	.int	JMPTBL(L(StringTail33), L(ExitStringTailTable))