Merge "bionic: call stdio cleanup on exit"
diff --git a/libc/Android.mk b/libc/Android.mk
index 4ebd656..ab09089 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -288,7 +288,6 @@
     bionic/sysconf.cpp \
     bionic/thread_atexit.cpp \
     bionic/tdestroy.cpp \
-    bionic/__thread_entry.cpp \
     bionic/timer.cpp \
     bionic/tmpfile.cpp \
     bionic/unlink.cpp \
diff --git a/libc/SYSCALLS.TXT b/libc/SYSCALLS.TXT
index 7c15297..a027024 100644
--- a/libc/SYSCALLS.TXT
+++ b/libc/SYSCALLS.TXT
@@ -276,6 +276,7 @@
 long    perf_event_open(struct perf_event_attr* attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags) all
 
 pid_t __clone:clone(int, void*, int*, void*, int*)  all
+int __set_tid_address:set_tid_address(int*) all
 
 int epoll_create1(int)  all
 int epoll_ctl(int, int op, int, struct epoll_event*)  all
@@ -304,4 +305,3 @@
 
 # MIPS-specific
 int     _flush_cache:cacheflush(char* addr, const int nbytes, const int op) mips
-int     syscall(int number, ...) mips
diff --git a/libc/arch-aarch64/syscalls.mk b/libc/arch-aarch64/syscalls.mk
index 995d44a..890a6d8 100644
--- a/libc/arch-aarch64/syscalls.mk
+++ b/libc/arch-aarch64/syscalls.mk
@@ -19,6 +19,7 @@
 syscall_src += arch-aarch64/syscalls/__rt_sigsuspend.S
 syscall_src += arch-aarch64/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-aarch64/syscalls/__sched_getaffinity.S
+syscall_src += arch-aarch64/syscalls/__set_tid_address.S
 syscall_src += arch-aarch64/syscalls/__syslog.S
 syscall_src += arch-aarch64/syscalls/__timer_create.S
 syscall_src += arch-aarch64/syscalls/__timer_delete.S
diff --git a/libc/arch-aarch64/syscalls/__set_tid_address.S b/libc/arch-aarch64/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..b7541fc
--- /dev/null
+++ b/libc/arch-aarch64/syscalls/__set_tid_address.S
@@ -0,0 +1,22 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <private/bionic_asm.h>
+
+ENTRY(__set_tid_address)
+    stp     x29, x30, [sp, #-16]!
+    mov     x29,  sp
+    str     x8,       [sp, #-16]!
+
+    mov     x8, __NR_set_tid_address
+    svc     #0
+
+    ldr     x8,       [sp], #16
+    ldp     x29, x30, [sp], #16
+
+    cmn     x0, #(MAX_ERRNO + 1)
+    cneg    x0, x0, hi
+    b.hi    __set_errno
+
+    ret
+END(__set_tid_address)
+.hidden _C_LABEL(__set_tid_address)
diff --git a/libc/arch-arm/bionic/_exit_with_stack_teardown.S b/libc/arch-arm/bionic/_exit_with_stack_teardown.S
index c430edb..0d97f06 100644
--- a/libc/arch-arm/bionic/_exit_with_stack_teardown.S
+++ b/libc/arch-arm/bionic/_exit_with_stack_teardown.S
@@ -33,12 +33,11 @@
 ENTRY(_exit_with_stack_teardown)
     mov     lr, r2
     ldr     r7, =__NR_munmap
-    swi     #0              // the stack is destroyed by this call
+    swi     #0
+    // If munmap failed, we ignore the failure and exit anyway.
+
     mov     r0, lr
     ldr     r7, =__NR_exit
     swi     #0
-
-    // exit() should never return, cause a crash if it does
-    mov     r0, #0
-    ldr     r0, [r0]
+    // The exit syscall does not return.
 END(_exit_with_stack_teardown)
diff --git a/libc/arch-arm/bionic/clone.S b/libc/arch-arm/bionic/clone.S
index 3556b8e..0782abe 100644
--- a/libc/arch-arm/bionic/clone.S
+++ b/libc/arch-arm/bionic/clone.S
@@ -28,53 +28,7 @@
 
 #include <private/bionic_asm.h>
 
-// int  __pthread_clone(void* (*fn)(void*), void* child_stack, int flags, void* arg);
-ENTRY(__pthread_clone)
-    # Push 'fn' and 'arg' onto 'child_stack'.
-    stmdb   r1!, {r0, r3}
-
-    # The sys_clone system call only takes two arguments: 'flags' and 'child_stack'.
-    # 'child_stack' is already in r1, but we need to move 'flags' into position.
-    mov     r0, r2
-
-    # System call.
-    mov     ip, r7
-    ldr     r7, =__NR_clone
-    swi     #0
-
-    # Child?
-    movs    r0, r0
-    beq     1f
-
-    # Parent.
-    mov     r7, ip
-    cmn     r0, #(MAX_ERRNO + 1)
-    bxls    lr
-    neg     r0, r0
-    b       __set_errno
-
-1:  # Child.
-    # Pop 'fn' and 'arg' back off the stack and call __thread_entry.
-    pop     {r0, r1}
-    # __thread_entry also needs our stack pointer.
-    mov     r2, sp
-    b       __thread_entry
-END(__pthread_clone)
-
-
-    #
-    # This function is defined as:
-    #
-    #   pid_t  __bionic_clone( int  flags, void *child_stack,
-    #                          pid_t *pid, void *tls, pid_t *ctid,
-    #                          int  (*fn)(void *), void* arg );
-    #
-    # NOTE: This is not the same signature as the glibc
-    #       __clone function. Placing 'fn' and 'arg'
-    #       at the end of the parameter list makes the
-    #       implementation much simpler.
-    #
-
+// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 ENTRY(__bionic_clone)
     mov     ip, sp
     .save   {r4, r5, r6, r7}
diff --git a/libc/arch-arm/syscalls.mk b/libc/arch-arm/syscalls.mk
index 75b6133..f8bb15a 100644
--- a/libc/arch-arm/syscalls.mk
+++ b/libc/arch-arm/syscalls.mk
@@ -23,6 +23,7 @@
 syscall_src += arch-arm/syscalls/__rt_sigsuspend.S
 syscall_src += arch-arm/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-arm/syscalls/__sched_getaffinity.S
+syscall_src += arch-arm/syscalls/__set_tid_address.S
 syscall_src += arch-arm/syscalls/__set_tls.S
 syscall_src += arch-arm/syscalls/__sigaction.S
 syscall_src += arch-arm/syscalls/__statfs64.S
diff --git a/libc/arch-arm/syscalls/__set_tid_address.S b/libc/arch-arm/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..b4b42e7
--- /dev/null
+++ b/libc/arch-arm/syscalls/__set_tid_address.S
@@ -0,0 +1,14 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <private/bionic_asm.h>
+
+ENTRY(__set_tid_address)
+    mov     ip, r7
+    ldr     r7, =__NR_set_tid_address
+    swi     #0
+    mov     r7, ip
+    cmn     r0, #(MAX_ERRNO + 1)
+    bxls    lr
+    neg     r0, r0
+    b       __set_errno
+END(__set_tid_address)
diff --git a/libc/arch-mips/bionic/_exit_with_stack_teardown.S b/libc/arch-mips/bionic/_exit_with_stack_teardown.S
index 8351e2e..9cab52b 100644
--- a/libc/arch-mips/bionic/_exit_with_stack_teardown.S
+++ b/libc/arch-mips/bionic/_exit_with_stack_teardown.S
@@ -30,7 +30,7 @@
 
 	.text
 
-// void _exit_with_stack_teardown(void * stackBase, size_t stackSize, int status)
+// void _exit_with_stack_teardown(void* stackBase, size_t stackSize, int status)
 
 	.type	_exit_with_stack_teardown, @function
 	.global	_exit_with_stack_teardown
@@ -40,12 +40,11 @@
 	move	$s0,$a2		/* preserve status for exit() call */
 
 	li	$v0,__NR_munmap
-	syscall			/* the stack is destroyed by this call */
+	syscall
+	// If munmap failed, we ignore the failure and exit anyway.
+
 	move	$a0,$s0
 	li	$v0,__NR_exit
 	syscall
-
-	/* exit() should never return, cause a crash if it does */
-	move	$a0,$0
-	lw	$a0,($a0)
+	// The exit syscall does not return.
 	.end	_exit_with_stack_teardown
diff --git a/libc/arch-mips/bionic/clone.S b/libc/arch-mips/bionic/clone.S
index 94e18a2..8970b6e 100644
--- a/libc/arch-mips/bionic/clone.S
+++ b/libc/arch-mips/bionic/clone.S
@@ -30,78 +30,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 
-	.text
-	.type __pthread_clone, @function
-	.global __pthread_clone
-	.align 4
-        .ent __pthread_clone
-
-/*
- * int __pthread_clone(void* (*fn)(void*), void *child_stack,
- *			 int flags, void *arg);
- */
-
-__pthread_clone:
-        .set	noreorder
-        .cpload $t9
-        .set	reorder
-
-	# set up child stack
-	subu	$a1,16
-	sw	$a0,0($a1)	# fn
-	sw	$a3,4($a1)	# arg
-#	sw	$a1+16,8($a1)	# tls
-
-	/*
-	 * int sys_clone(int flags, void *child_stack, int *parent_tidptr,
-	 *	 struct user_desc *newtls, int *child_tidptr);
-	 */
-
-	move	$a0,$a2		# flags
-#	move	$a1,$a1		# child_stack
-	move	$a2,$0		# parent_tidptr
-	move	$a3,$0		# user_desc
-	and	$a0,~(CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)
-				# make sure the kernel doesn't access child_tidptr
-
-        li	$v0,__NR_clone
-        syscall
-
-        bnez	$a3,.L__error
-
-        beqz	$v0,.L__thread_start
-
-        j $ra
-
-.L__thread_start:
-        lw	$a0,0($sp)	#  fn
-        lw	$a1,4($sp)	#  arg
-        addu	$a2,$sp,16	#  tls
-
-	# void __thread_entry(void* (*func)(void*), void *arg, void *tls)
-        la	$t9, __thread_entry
-        j	$t9
-
-.L__error:
-	move	$a0,$v0
-	la	$t9,__set_errno
-	j	$t9
-
-        .end __pthread_clone
-
-
-    #
-    # This function is defined as:
-    #
-    #   pid_t  __bionic_clone( int  flags, void *child_stack,
-    #                          pid_t *pid, void *tls, pid_t *ctid,
-    #                          int  (*fn)(void *), void* arg );
-    #
-    # NOTE: This is not the same signature than the GLibc
-    #       __clone function here !! Placing 'fn' and 'arg'
-    #       at the end of the parameter list makes the
-    #       implementation much simpler.
-    #
+// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 	.text
 	.type __bionic_clone, @function
 	.global __bionic_clone
diff --git a/libc/arch-mips/bionic/syscall.S b/libc/arch-mips/bionic/syscall.S
new file mode 100644
index 0000000..60754e8
--- /dev/null
+++ b/libc/arch-mips/bionic/syscall.S
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <asm/unistd.h>
+    .text
+    .globl syscall
+    .align 4
+    .ent syscall
+// int syscall(int number, ...): issues system call 'number' with the remaining arguments.
+syscall:
+    .set noreorder
+    .cpload $t9
+    move    $v0, $a0          // the syscall number goes in $v0
+    move    $a0, $a1          // shift the register arguments down by one
+    move    $a1, $a2
+    move    $a2, $a3
+    lw      $a3, 16($sp)      // 4th syscall argument is fetched from the stack
+    lw      $t0, 20($sp)      // 5th and 6th syscall arguments ...
+    lw      $t1, 24($sp)
+    sw      $t0, 16($sp)      // ... are moved down one stack slot each
+    sw      $t1, 20($sp)
+    syscall
+    bnez    $a3, 1f           // non-zero $a3 after the trap selects the error path
+    move    $a0, $v0          // delay slot (noreorder): runs on both paths, $a0 = $v0 for __set_errno
+    j       $ra
+    nop
+1:
+    la      $t9,__set_errno
+    j       $t9               // jump, not call: $ra still holds the address of our caller
+    nop
+    .set reorder
+    .end syscall
diff --git a/libc/arch-mips/mips.mk b/libc/arch-mips/mips.mk
index 3ebedd0..c895357 100644
--- a/libc/arch-mips/mips.mk
+++ b/libc/arch-mips/mips.mk
@@ -11,6 +11,7 @@
     arch-mips/bionic/setjmp.S \
     arch-mips/bionic/__set_tls.c \
     arch-mips/bionic/sigsetjmp.S \
+    arch-mips/bionic/syscall.S \
     arch-mips/bionic/vfork.S \
     arch-mips/string/memcpy.S \
     arch-mips/string/memset.S \
diff --git a/libc/arch-mips/syscalls.mk b/libc/arch-mips/syscalls.mk
index 9be6197..6b72e70 100644
--- a/libc/arch-mips/syscalls.mk
+++ b/libc/arch-mips/syscalls.mk
@@ -24,6 +24,7 @@
 syscall_src += arch-mips/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-mips/syscalls/__sched_getaffinity.S
 syscall_src += arch-mips/syscalls/__set_thread_area.S
+syscall_src += arch-mips/syscalls/__set_tid_address.S
 syscall_src += arch-mips/syscalls/__sigaction.S
 syscall_src += arch-mips/syscalls/__statfs64.S
 syscall_src += arch-mips/syscalls/__syslog.S
@@ -175,7 +176,6 @@
 syscall_src += arch-mips/syscalls/swapon.S
 syscall_src += arch-mips/syscalls/symlinkat.S
 syscall_src += arch-mips/syscalls/sync.S
-syscall_src += arch-mips/syscalls/syscall.S
 syscall_src += arch-mips/syscalls/sysinfo.S
 syscall_src += arch-mips/syscalls/tgkill.S
 syscall_src += arch-mips/syscalls/timerfd_create.S
diff --git a/libc/arch-mips/syscalls/syscall.S b/libc/arch-mips/syscalls/__set_tid_address.S
similarity index 65%
rename from libc/arch-mips/syscalls/syscall.S
rename to libc/arch-mips/syscalls/__set_tid_address.S
index 2b2b707..4fcc82a 100644
--- a/libc/arch-mips/syscalls/syscall.S
+++ b/libc/arch-mips/syscalls/__set_tid_address.S
@@ -2,14 +2,14 @@
 
 #include <asm/unistd.h>
     .text
-    .globl syscall
+    .globl __set_tid_address
     .align 4
-    .ent syscall
+    .ent __set_tid_address
 
-syscall:
+__set_tid_address:
     .set noreorder
     .cpload $t9
-    li $v0, __NR_syscall
+    li $v0, __NR_set_tid_address
     syscall
     bnez $a3, 1f
     move $a0, $v0
@@ -20,4 +20,4 @@
     j $t9
     nop
     .set reorder
-    .end syscall
+    .end __set_tid_address
diff --git a/libc/arch-x86/bionic/_exit_with_stack_teardown.S b/libc/arch-x86/bionic/_exit_with_stack_teardown.S
index 1c6d48a..9128f10 100644
--- a/libc/arch-x86/bionic/_exit_with_stack_teardown.S
+++ b/libc/arch-x86/bionic/_exit_with_stack_teardown.S
@@ -6,17 +6,15 @@
     // We can trash %ebx here since this call should never return.
     // We can also take advantage of the fact that the linux syscall trap
     // handler saves all the registers, so we don't need a stack to keep
-    // the status argument for exit while doing the munmap */
+    // the status argument for exit while doing the munmap.
     mov     4(%esp), %ebx             // stackBase
     mov     8(%esp), %ecx             // stackSize
     mov     $__NR_munmap, %eax
     int     $0x80
-
     // If munmap failed, we ignore the failure and exit anyway.
 
     mov     %edx, %ebx                // status
     movl    $__NR_exit, %eax
     int     $0x80
-
     // The exit syscall does not return.
 END(_exit_with_stack_teardown)
diff --git a/libc/arch-x86/bionic/clone.S b/libc/arch-x86/bionic/clone.S
index 457cb4a..eb9f545 100644
--- a/libc/arch-x86/bionic/clone.S
+++ b/libc/arch-x86/bionic/clone.S
@@ -1,68 +1,7 @@
 #include <asm/unistd.h>
 #include <machine/asm.h>
 
-// int  __pthread_clone(void* (*fn)(void*), void* tls, int flags, void* arg);
-ENTRY(__pthread_clone)
-        pushl   %ebx
-        pushl   %ecx
-        movl    16(%esp), %ecx
-
-        # save tls
-        movl    %ecx, %ebx
-        # 16-byte alignment on child stack
-        andl    $~15, %ecx
-
-        # insert arguments onto the child stack
-        movl    12(%esp), %eax
-        movl    %eax, -16(%ecx)
-        movl    24(%esp), %eax
-        movl    %eax, -12(%ecx)
-        movl    %ebx, -8(%ecx)
-
-        subl    $16, %ecx
-        movl    20(%esp), %ebx
-
-        # make system call
-        movl    $__NR_clone, %eax
-        int     $0x80
-
-        cmpl    $0, %eax
-        je      pc_child
-        jg      pc_parent
-
-        # an error occurred, set errno and return -1
-        negl    %eax
-        pushl   %eax
-        call    __set_errno
-        addl    $4, %esp
-        orl     $-1, %eax
-        jmp     pc_return
-
-pc_child:
-        # we're in the child thread now, call __thread_entry
-        # with the appropriate arguments on the child stack
-        # we already placed most of them
-        call    __thread_entry
-        hlt
-
-pc_parent:
-        # we're the parent; nothing to do.
-pc_return:
-        popl    %ecx
-        popl    %ebx
-        ret
-END(__pthread_clone)
-
-
-/*
- * int  __bionic_clone(unsigned long clone_flags,
- *                     void*         newsp,
- *                     int           *parent_tidptr,
- *                     void          *new_tls,
- *                     int           *child_tidptr,
- *                     int           (*fn)(void *),
- *                     void          *arg);
- */
+// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 ENTRY(__bionic_clone)
         pushl   %ebx
         pushl   %esi
diff --git a/libc/arch-x86/syscalls.mk b/libc/arch-x86/syscalls.mk
index 70e6e39..b6a1e38 100644
--- a/libc/arch-x86/syscalls.mk
+++ b/libc/arch-x86/syscalls.mk
@@ -24,6 +24,7 @@
 syscall_src += arch-x86/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-x86/syscalls/__sched_getaffinity.S
 syscall_src += arch-x86/syscalls/__set_thread_area.S
+syscall_src += arch-x86/syscalls/__set_tid_address.S
 syscall_src += arch-x86/syscalls/__sigaction.S
 syscall_src += arch-x86/syscalls/__statfs64.S
 syscall_src += arch-x86/syscalls/__syslog.S
diff --git a/libc/arch-x86/syscalls/__set_tid_address.S b/libc/arch-x86/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..0c66d47
--- /dev/null
+++ b/libc/arch-x86/syscalls/__set_tid_address.S
@@ -0,0 +1,20 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <private/bionic_asm.h>
+
+ENTRY(__set_tid_address)
+    pushl   %ebx
+    mov     8(%esp), %ebx
+    movl    $__NR_set_tid_address, %eax
+    int     $0x80
+    cmpl    $-MAX_ERRNO, %eax
+    jb      1f
+    negl    %eax
+    pushl   %eax
+    call    __set_errno
+    addl    $4, %esp
+    orl     $-1, %eax
+1:
+    popl    %ebx
+    ret
+END(__set_tid_address)
diff --git a/libc/arch-x86_64/bionic/_exit_with_stack_teardown.S b/libc/arch-x86_64/bionic/_exit_with_stack_teardown.S
index a09babe..eca3b68 100644
--- a/libc/arch-x86_64/bionic/_exit_with_stack_teardown.S
+++ b/libc/arch-x86_64/bionic/_exit_with_stack_teardown.S
@@ -36,12 +36,10 @@
     // the status argument for exit(2) while doing the munmap(2).
     mov     $__NR_munmap, %eax
     syscall
-
-    // If munmap failed, ignore the failure and exit anyway.
+    // If munmap failed, we ignore the failure and exit anyway.
 
     mov     %rdx, %rdi        // status
     mov     $__NR_exit, %eax
     syscall
-
     // The exit syscall does not return.
 END(_exit_with_stack_teardown)
diff --git a/libc/arch-x86_64/bionic/clone.S b/libc/arch-x86_64/bionic/clone.S
index 7511e86..b37416b 100644
--- a/libc/arch-x86_64/bionic/clone.S
+++ b/libc/arch-x86_64/bionic/clone.S
@@ -29,53 +29,7 @@
 #include <asm/unistd.h>
 #include <machine/asm.h>
 
-// int  __pthread_clone(void* (*fn)(void*), void* tls, int flags, void* arg);
-ENTRY(__pthread_clone)
-        # Save tls.
-        movq    %rsi, %r11
-        # Enforce 16-byte alignment for child stack.
-        andq    $~15, %rsi
-
-        # Copy 'fn', 'arg', and 'tls' onto the child stack.
-        movq    %rdi, -32(%rsi)  # fn
-        movq    %rcx, -24(%rsi)  # arg
-        movq    %r11, -16(%rsi)  # tls
-        subq    $32, %rsi
-
-        movq    %rdx, %rdi
-        movl    $__NR_clone, %eax
-        syscall
-        testl   %eax, %eax
-        jns     1f
-
-        # An error occurred, set errno and return -1.
-        negl    %eax
-        movl    %eax, %edi
-        call    __set_errno
-        orl     $-1, %eax
-        jmp     2f
-1:
-        jnz     2f
-
-        # We're in the child now, so call __thread_entry
-        # with the arguments from the child stack moved into
-        # the appropriate registers. We avoid pop here to keep
-        # the required 16-byte stack alignment.
-        movq    (%rsp), %rdi    # fn
-        movq    8(%rsp), %rsi   # arg
-        movq    16(%rsp), %rdx  # tls
-        call    __thread_entry
-        hlt
-2:
-        ret
-
-// int __bionic_clone(unsigned long clone_flags,
-//                    void* new_sp,
-//                    int* parent_tid_ptr,
-//                    void* new_tls,
-//                    int* child_tid_ptr,
-//                    int (*fn)(void*),
-//                    void* arg);
+// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 ENTRY(__bionic_clone)
         # Enforce 16-byte alignment for child stack.
         andq    $~15, %rsi
diff --git a/libc/arch-x86_64/syscalls.mk b/libc/arch-x86_64/syscalls.mk
index c874b61..50d9ab3 100644
--- a/libc/arch-x86_64/syscalls.mk
+++ b/libc/arch-x86_64/syscalls.mk
@@ -20,6 +20,7 @@
 syscall_src += arch-x86_64/syscalls/__rt_sigsuspend.S
 syscall_src += arch-x86_64/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-x86_64/syscalls/__sched_getaffinity.S
+syscall_src += arch-x86_64/syscalls/__set_tid_address.S
 syscall_src += arch-x86_64/syscalls/__syslog.S
 syscall_src += arch-x86_64/syscalls/__timer_create.S
 syscall_src += arch-x86_64/syscalls/__timer_delete.S
diff --git a/libc/arch-x86_64/syscalls/__set_tid_address.S b/libc/arch-x86_64/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..fe7260f
--- /dev/null
+++ b/libc/arch-x86_64/syscalls/__set_tid_address.S
@@ -0,0 +1,17 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <private/bionic_asm.h>
+
+ENTRY(__set_tid_address)
+    movl    $__NR_set_tid_address, %eax
+    syscall
+    cmpq    $-MAX_ERRNO, %rax
+    jb      1f
+    negl    %eax
+    movl    %eax, %edi
+    call    __set_errno
+    orq     $-1, %rax
+1:
+    ret
+END(__set_tid_address)
+.hidden _C_LABEL(__set_tid_address)
diff --git a/libc/bionic/__thread_entry.cpp b/libc/bionic/__thread_entry.cpp
deleted file mode 100644
index 8300a64..0000000
--- a/libc/bionic/__thread_entry.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <pthread.h>
-
-#include "pthread_internal.h"
-
-#include "private/bionic_tls.h"
-
-// This trampoline is called from the assembly _pthread_clone function.
-// Our 'tls' and __pthread_clone's 'child_stack' are one and the same, just growing in
-// opposite directions.
-extern "C" void __thread_entry(void* (*func)(void*), void* arg, void** tls) {
-  // Wait for our creating thread to release us. This lets it have time to
-  // notify gdb about this thread before we start doing anything.
-  // This also provides the memory barrier needed to ensure that all memory
-  // accesses previously made by the creating thread are visible to us.
-  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &tls[TLS_SLOT_SELF];
-  pthread_mutex_lock(start_mutex);
-  pthread_mutex_destroy(start_mutex);
-
-  pthread_internal_t* thread = (pthread_internal_t*) tls[TLS_SLOT_THREAD_ID];
-  thread->tls = tls;
-  __init_tls(thread);
-
-  if ((thread->internal_flags & PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED) != 0) {
-    pthread_exit(NULL);
-  }
-
-  void* result = func(arg);
-  pthread_exit(result);
-}
diff --git a/libc/bionic/bionic_clone.c b/libc/bionic/bionic_clone.c
index 8a17e13..518d996 100644
--- a/libc/bionic/bionic_clone.c
+++ b/libc/bionic/bionic_clone.c
@@ -31,14 +31,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 
-extern int  __bionic_clone(unsigned long   clone_flags,
-                           void*           newsp,
-                           int            *parent_tidptr,
-                           void           *new_tls,
-                           int            *child_tidptr,
-                           int            (*fn)(void *),
-                           void          *arg);
-
+extern pid_t __bionic_clone(uint32_t flags, void* child_stack, int* parent_tid, void* tls, int* child_tid, int (*fn)(void*), void* arg);
 extern void __exit(int status);
 
 /* this function is called from the __bionic_clone
diff --git a/libc/bionic/fork.cpp b/libc/bionic/fork.cpp
index 339a0e8..f7d1c11 100644
--- a/libc/bionic/fork.cpp
+++ b/libc/bionic/fork.cpp
@@ -41,7 +41,12 @@
   __timer_table_start_stop(1);
   __bionic_atfork_run_prepare();
 
-  int result = __clone(SIGCHLD, NULL, NULL, NULL, NULL);
+  pthread_internal_t* self = __get_thread();
+#if defined(__x86_64__)  // On x86_64 the child_tid pointer is the 4th __clone argument, elsewhere the 5th.
+  int result = __clone(CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, NULL, NULL, &(self->tid), NULL);
+#else
+  int result = __clone(CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, NULL, NULL, NULL, &(self->tid));
+#endif
   if (result != 0) {  // Not a child process.
     __timer_table_start_stop(0);
     __bionic_atfork_run_parent();
diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp
index f88a26d..130c287 100644
--- a/libc/bionic/libc_init_common.cpp
+++ b/libc/bionic/libc_init_common.cpp
@@ -49,6 +49,8 @@
 extern "C" abort_msg_t** __abort_message_ptr;
 extern "C" uintptr_t __get_sp(void);
 extern "C" int __system_properties_init(void);
+extern "C" int __set_tls(void* ptr);
+extern "C" int __set_tid_address(int* tid_address);
 
 // Not public, but well-known in the BSDs.
 const char* __progname;
@@ -89,14 +91,24 @@
   uintptr_t stack_bottom = stack_top - stack_size;
 
   static void* tls[BIONIC_TLS_SLOTS];
-  static pthread_internal_t thread;
-  thread.tid = gettid();
-  thread.tls = tls;
-  pthread_attr_init(&thread.attr);
-  pthread_attr_setstack(&thread.attr, (void*) stack_bottom, stack_size);
-  _init_thread(&thread, false);
-  __init_tls(&thread);
+  static pthread_internal_t main_thread;
+  main_thread.tls = tls;
+
+  // Tell the kernel to clear our tid field when we exit, so we're like any other pthread.
+  main_thread.tid = __set_tid_address(&main_thread.tid);
+
+  // We already have a stack, and we don't want to free it up on exit (because things like
+  // environment variables with global scope live on it).
+  pthread_attr_init(&main_thread.attr);
+  pthread_attr_setstack(&main_thread.attr, (void*) stack_bottom, stack_size);
+  main_thread.attr.flags = PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK;
+
+  _init_thread(&main_thread, false);
+  __init_tls(&main_thread);
+  __set_tls(main_thread.tls);
   tls[TLS_SLOT_BIONIC_PREINIT] = &args;
+
+  __init_alternate_signal_stack(&main_thread);
 }
 
 void __libc_init_common(KernelArgumentBlock& args) {
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 2153310..dde5ed7 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -40,7 +40,7 @@
 #include "private/ErrnoRestorer.h"
 #include "private/ScopedPthreadMutexLocker.h"
 
-extern "C" int __pthread_clone(void* (*fn)(void*), void* child_stack, int flags, void* arg);
+extern "C" pid_t __bionic_clone(uint32_t flags, void* child_stack, int* parent_tid, void* tls, int* child_tid, int (*fn)(void*), void* arg);
 
 #ifdef __i386__
 #define ATTRIBUTES __attribute__((noinline)) __attribute__((fastcall))
@@ -50,15 +50,14 @@
 
 extern "C" void ATTRIBUTES _thread_created_hook(pid_t thread_id);
 
-extern "C" int __set_tls(void* ptr);
-
 static pthread_mutex_t gPthreadStackCreationLock = PTHREAD_MUTEX_INITIALIZER;
 
 static pthread_mutex_t gDebuggerNotificationLock = PTHREAD_MUTEX_INITIALIZER;
 
+// This code is used both by each new pthread and the code that initializes the main thread.
 void  __init_tls(pthread_internal_t* thread) {
-  // Zero-initialize all the slots.
-  for (size_t i = 0; i < BIONIC_TLS_SLOTS; ++i) {
+  // Zero-initialize all the slots after TLS_SLOT_SELF and TLS_SLOT_THREAD_ID.
+  for (size_t i = TLS_SLOT_ERRNO; i < BIONIC_TLS_SLOTS; ++i) {
     thread->tls[i] = NULL;
   }
 
@@ -67,11 +66,10 @@
   thread->tls[TLS_SLOT_THREAD_ID] = thread;
   // GCC looks in the TLS for the stack guard on x86, so copy it there from our global.
   thread->tls[TLS_SLOT_STACK_GUARD] = (void*) __stack_chk_guard;
+}
 
-  __set_tls(thread->tls);
-
+void __init_alternate_signal_stack(pthread_internal_t* thread) {
   // Create and set an alternate signal stack.
-  // This must happen after __set_tls, in case a system call fails and tries to set errno.
   stack_t ss;
   ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
   if (ss.ss_sp != MAP_FAILED) {
@@ -99,7 +97,6 @@
     }
   }
 
-  pthread_cond_init(&thread->join_cond, NULL);
   thread->cleanup_stack = NULL;
 
   if (add_to_thread_list) {
@@ -136,6 +133,31 @@
   return stack;
 }
 
+static int __pthread_start(void* arg) {  // Child-side entry point passed to __bionic_clone.
+  pthread_internal_t* thread = reinterpret_cast<pthread_internal_t*>(arg);
+
+  // Wait for our creating thread to release us. This lets it have time to
+  // notify gdb about this thread before we start doing anything.
+  // This also provides the memory barrier needed to ensure that all memory
+  // accesses previously made by the creating thread are visible to us.
+  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
+  pthread_mutex_lock(start_mutex);
+  pthread_mutex_destroy(start_mutex);
+
+  __init_tls(thread);
+
+  __init_alternate_signal_stack(thread);
+  // pthread_create sets this flag when _init_thread fails after the clone succeeded.
+  if ((thread->internal_flags & PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED) != 0) {
+    pthread_exit(NULL);
+  }
+
+  void* result = thread->start_routine(thread->start_routine_arg);
+  pthread_exit(result);
+
+  return 0;  // pthread_exit is not expected to return; this satisfies the int (*)(void*) signature.
+}
+
 int pthread_create(pthread_t* thread_out, pthread_attr_t const* attr,
                    void* (*start_routine)(void*), void* arg) {
   ErrnoRestorer errno_restorer;
@@ -181,26 +203,33 @@
   // The child stack is the same address, just growing in the opposite direction.
   // At offsets >= 0, we have the TLS slots.
   // At offsets < 0, we have the child stack.
-  void** tls = (void**)((uint8_t*)(thread->attr.stack_base) + thread->attr.stack_size - BIONIC_TLS_SLOTS * sizeof(void*));
-  void* child_stack = tls;
+  thread->tls = (void**)((uint8_t*)(thread->attr.stack_base) + thread->attr.stack_size - BIONIC_TLS_SLOTS * sizeof(void*));
+  void* child_stack = thread->tls;
 
-  // Create a mutex for the thread in TLS_SLOT_SELF to wait on once it starts so we can keep
+  // Create a mutex for the thread in TLS to wait on once it starts so we can keep
   // it from doing anything until after we notify the debugger about it
   //
   // This also provides the memory barrier we need to ensure that all
   // memory accesses previously performed by this thread are visible to
   // the new thread.
-  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &tls[TLS_SLOT_SELF];
+  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
   pthread_mutex_init(start_mutex, NULL);
-  ScopedPthreadMutexLocker start_locker(start_mutex);
+  pthread_mutex_lock(start_mutex);
 
-  tls[TLS_SLOT_THREAD_ID] = thread;
+  thread->tls[TLS_SLOT_THREAD_ID] = thread;
 
-  int flags = CLONE_FILES | CLONE_FS | CLONE_VM | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM;
+  thread->start_routine = start_routine;
+  thread->start_routine_arg = arg;
 
-  int tid = __pthread_clone(start_routine, child_stack, flags, arg);
-  if (tid < 0) {
+  int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+      CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
+  int rc = __bionic_clone(flags, child_stack, &(thread->tid), thread->tls, &(thread->tid), __pthread_start, thread);
+  if (rc == -1) {
     int clone_errno = errno;
+    // We don't have to unlock the mutex at all because clone(2) failed so there's no child waiting to
+    // be unblocked, but we're about to unmap the memory the mutex is stored in, so this serves as a
+    // reminder that you can't rewrite this function to use a ScopedPthreadMutexLocker.
+    pthread_mutex_unlock(start_mutex);
     if ((thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) == 0) {
       munmap(thread->attr.stack_base, thread->attr.stack_size);
     }
@@ -209,12 +238,10 @@
     return clone_errno;
   }
 
-  thread->tid = tid;
-
   int init_errno = _init_thread(thread, true);
   if (init_errno != 0) {
-    // Mark the thread detached and let its __thread_entry run to
-    // completion. (It'll just exit immediately, cleaning up its resources.)
+    // Mark the thread detached and let its __pthread_start run to completion.
+    // It'll check this flag and exit immediately, cleaning up its resources.
     thread->internal_flags |= PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED;
     thread->attr.flags |= PTHREAD_ATTR_FLAG_DETACHED;
     return init_errno;
@@ -226,8 +253,9 @@
     _thread_created_hook(thread->tid);
   }
 
-  // Publish the pthread_t and let the thread run.
-  *thread_out = (pthread_t) thread;
+  // Publish the pthread_t and unlock the mutex to let the new thread start running.
+  *thread_out = reinterpret_cast<pthread_t>(thread);
+  pthread_mutex_unlock(start_mutex);
 
   return 0;
 }
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index cc86271..22c2c3c 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -57,8 +57,9 @@
   }
 }
 
-void pthread_exit(void* retval) {
+void pthread_exit(void* return_value) {
   pthread_internal_t* thread = __get_thread();
+  thread->return_value = return_value;
 
   // Call the cleanup handlers first.
   while (thread->cleanup_stack) {
@@ -90,10 +91,9 @@
   size_t stack_size = thread->attr.stack_size;
   bool user_allocated_stack = ((thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) != 0);
 
-  // If the thread is detached, destroy the pthread_internal_t,
-  // otherwise keep it in memory and signal any joiners.
   pthread_mutex_lock(&gThreadListLock);
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) {
+  if ((thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) != 0) {
+    // The thread is detached, so we can destroy the pthread_internal_t.
     _pthread_internal_remove_locked(thread);
   } else {
     // Make sure that the pthread_internal_t doesn't have stale pointers to a stack that
@@ -103,15 +103,8 @@
       thread->attr.stack_size = 0;
       thread->tls = NULL;
     }
-
-    // Indicate that the thread has exited for joining threads.
-    thread->attr.flags |= PTHREAD_ATTR_FLAG_ZOMBIE;
-    thread->return_value = retval;
-
-    // Signal the joining thread if present.
-    if (thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) {
-      pthread_cond_signal(&thread->join_cond);
-    }
+    // pthread_join is responsible for destroying the pthread_internal_t for non-detached threads.
+    // The kernel will futex_wake on the pthread_internal_t::tid field to wake pthread_join.
   }
   pthread_mutex_unlock(&gThreadListLock);
 
@@ -131,6 +124,6 @@
     _exit_with_stack_teardown(stack_base, stack_size, 0);
   }
 
-  /* NOTREACHED, but we told the compiler this function is noreturn, and it doesn't believe us. */
+  // NOTREACHED, but we told the compiler this function is noreturn, and it doesn't believe us.
   abort();
 }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 8cca83a..de1ef26 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -31,29 +31,36 @@
 #include <pthread.h>
 
 struct pthread_internal_t {
-    struct pthread_internal_t*  next;
-    struct pthread_internal_t*  prev;
-    pthread_attr_t              attr;
-    pid_t                       tid;
-    bool                        allocated_on_heap;
-    pthread_cond_t              join_cond;
-    void*                       return_value;
-    int                         internal_flags;
-    __pthread_cleanup_t*        cleanup_stack;
-    void**                      tls;         /* thread-local storage area */
+  struct pthread_internal_t* next;
+  struct pthread_internal_t* prev;
 
-    void* alternate_signal_stack;
+  pid_t tid;
 
-    /*
-     * The dynamic linker implements dlerror(3), which makes it hard for us to implement this
-     * per-thread buffer by simply using malloc(3) and free(3).
-     */
+  void** tls;
+
+  pthread_attr_t attr;
+  bool allocated_on_heap; /* TODO: move this into attr.flags? */
+  int internal_flags; /* TODO: move this into attr.flags? */
+
+  __pthread_cleanup_t* cleanup_stack;
+
+  void* (*start_routine)(void*);
+  void* start_routine_arg;
+  void* return_value;
+
+  void* alternate_signal_stack;
+
+  /*
+   * The dynamic linker implements dlerror(3), which makes it hard for us to implement this
+   * per-thread buffer by simply using malloc(3) and free(3).
+   */
 #define __BIONIC_DLERROR_BUFFER_SIZE 512
-    char dlerror_buffer[__BIONIC_DLERROR_BUFFER_SIZE];
+  char dlerror_buffer[__BIONIC_DLERROR_BUFFER_SIZE];
 };
 
 __LIBC_HIDDEN__ int _init_thread(pthread_internal_t* thread, bool add_to_thread_list);
 __LIBC_HIDDEN__ void __init_tls(pthread_internal_t* thread);
+__LIBC_HIDDEN__ void __init_alternate_signal_stack(pthread_internal_t*);
 __LIBC_HIDDEN__ void _pthread_internal_add(pthread_internal_t* thread);
 __LIBC_HIDDEN__ pthread_internal_t* __get_thread(void);
 
@@ -69,9 +76,6 @@
 /* Has the thread been joined by another thread? */
 #define PTHREAD_ATTR_FLAG_JOINED 0x00000004
 
-/* Has the thread already exited but not been joined? */
-#define PTHREAD_ATTR_FLAG_ZOMBIE 0x00000008
-
 #define PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED 1
 
 /*
diff --git a/libc/bionic/pthread_join.cpp b/libc/bionic/pthread_join.cpp
index 7e022c2..0cbed62 100644
--- a/libc/bionic/pthread_join.cpp
+++ b/libc/bionic/pthread_join.cpp
@@ -28,33 +28,50 @@
 
 #include <errno.h>
 
+#include "private/bionic_futex.h"
 #include "pthread_accessor.h"
 
-int pthread_join(pthread_t t, void** ret_val) {
+int pthread_join(pthread_t t, void** return_value) {
   if (t == pthread_self()) {
     return EDEADLK;
   }
 
-  pthread_accessor thread(t);
-  if (thread.get() == NULL) {
+  pid_t tid;
+  volatile int* tid_ptr;
+  {
+    pthread_accessor thread(t);
+    if (thread.get() == NULL) {
       return ESRCH;
+    }
+
+    if ((thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) != 0) {
+      return EINVAL;
+    }
+
+    if ((thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) != 0) {
+      return EINVAL;
+    }
+
+    // Okay, looks like we can signal our intention to join.
+    thread->attr.flags |= PTHREAD_ATTR_FLAG_JOINED;
+    tid = thread->tid;
+    tid_ptr = &thread->tid;
   }
 
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) {
-    return EINVAL;
+  // We set the PTHREAD_ATTR_FLAG_JOINED flag with the lock held,
+  // so no one is going to remove this thread except us.
+
+  // Wait for the thread to actually exit, if it hasn't already.
+  while (*tid_ptr != 0) {
+    __futex_wait(tid_ptr, tid, NULL);
   }
 
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) {
-    return EINVAL;
-  }
+  // Take the lock again so we can pull the thread's return value
+  // and remove the thread from the list.
+  pthread_accessor thread(t);
 
-  // Signal our intention to join, and wait for the thread to exit.
-  thread->attr.flags |= PTHREAD_ATTR_FLAG_JOINED;
-  while ((thread->attr.flags & PTHREAD_ATTR_FLAG_ZOMBIE) == 0) {
-    pthread_cond_wait(&thread->join_cond, &gThreadListLock);
-  }
-  if (ret_val) {
-    *ret_val = thread->return_value;
+  if (return_value) {
+    *return_value = thread->return_value;
   }
 
   _pthread_internal_remove_locked(thread.get());
diff --git a/libc/bionic/pthread_key.cpp b/libc/bionic/pthread_key.cpp
index 440a715..6cc68af 100644
--- a/libc/bionic/pthread_key.cpp
+++ b/libc/bionic/pthread_key.cpp
@@ -215,10 +215,10 @@
     // Skip zombie threads. They don't have a valid TLS area any more.
     // Similarly, it is possible to have t->tls == NULL for threads that
     // were just recently created through pthread_create() but whose
-    // startup trampoline (__thread_entry) hasn't been run yet by the
+    // startup trampoline (__pthread_start) hasn't been run yet by the
     // scheduler. t->tls will also be NULL after a thread's stack has been
     // unmapped but before the ongoing pthread_join() is finished.
-    if ((t->attr.flags & PTHREAD_ATTR_FLAG_ZOMBIE) || t->tls == NULL) {
+    if (t->tid == 0 || t->tls == NULL) {
       continue;
     }
 
diff --git a/libc/kernel/uapi/linux/version.h b/libc/kernel/uapi/linux/version.h
new file mode 100644
index 0000000..71e5234
--- /dev/null
+++ b/libc/kernel/uapi/linux/version.h
@@ -0,0 +1,20 @@
+/****************************************************************************
+ ****************************************************************************
+ ***
+ ***   This header was automatically generated from a Linux kernel header
+ ***   of the same name, to make information necessary for userspace to
+ ***   call into the kernel available to libc.  It contains only constants,
+ ***   structures, and macros generated from the original header, and thus,
+ ***   contains no copyrightable information.
+ ***
+ ***   To edit the content of this header, modify the corresponding
+ ***   source file (e.g. under external/kernel-headers/original/) then
+ ***   run bionic/libc/kernel/tools/update_all.py
+ ***
+ ***   Any manual change here will be lost the next time this script will
+ ***   be run. You've been warned!
+ ***
+ ****************************************************************************
+ ****************************************************************************/
+#define LINUX_VERSION_CODE 199428
+#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
diff --git a/libc/private/bionic_futex.h b/libc/private/bionic_futex.h
index bfc3520..5602af7 100644
--- a/libc/private/bionic_futex.h
+++ b/libc/private/bionic_futex.h
@@ -28,6 +28,7 @@
 #ifndef _BIONIC_FUTEX_H
 #define _BIONIC_FUTEX_H
 
+#include <linux/compiler.h> /* needed for __user in non-uapi futex.h */
 #include <linux/futex.h>
 #include <sys/cdefs.h>
 
diff --git a/libc/private/bionic_tls.h b/libc/private/bionic_tls.h
index a14bd3c..ff13fdb 100644
--- a/libc/private/bionic_tls.h
+++ b/libc/private/bionic_tls.h
@@ -51,6 +51,11 @@
   TLS_SLOT_THREAD_ID,
   TLS_SLOT_ERRNO,
 
+  /* This slot in the child's TLS is used to synchronize the parent and child
+   * during thread initialization. The child finishes with this mutex before
+   * running any code that can set errno, so we can reuse the errno slot. */
+  TLS_SLOT_START_MUTEX = TLS_SLOT_ERRNO,
+
   /* These two aren't used by bionic itself, but allow the graphics code to
    * access TLS directly rather than using the pthread API. */
   TLS_SLOT_OPENGL_API = 3,
diff --git a/tests/Android.mk b/tests/Android.mk
index 3344687..19b5447 100644
--- a/tests/Android.mk
+++ b/tests/Android.mk
@@ -58,6 +58,7 @@
     -fno-builtin \
 
 test_src_files = \
+    buffer_tests.cpp \
     dirent_test.cpp \
     eventfd_test.cpp \
     fcntl_test.cpp \
diff --git a/tests/buffer_tests.cpp b/tests/buffer_tests.cpp
new file mode 100644
index 0000000..9e6318b
--- /dev/null
+++ b/tests/buffer_tests.cpp
@@ -0,0 +1,341 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include <gtest/gtest.h>
+#include "buffer_tests.h"
+
+#define FENCEPOST_LENGTH 8
+
+static int g_single_aligns[][2] = {
+  // Exactly aligned cases: each row is { alignment, or_mask }, or_mask 0 here.
+  { 1, 0 },
+  { 2, 0 },
+  { 4, 0 },
+  { 8, 0 },
+  { 16, 0 },
+  { 32, 0 },
+  { 64, 0 },
+  { 128, 0 },
+
+  // General unaligned cases.
+  { 4, 1 },
+  { 4, 2 },
+  { 4, 3 },
+
+  { 8, 1 },
+  { 8, 2 },
+  { 8, 3 },
+  { 8, 4 },
+  { 8, 5 },
+  { 8, 6 },
+  { 8, 7 },
+
+  { 128, 1 },
+  { 128, 4 },
+  { 128, 8 },
+  { 128, 12 },
+  { 128, 16 },
+};
+
+static const size_t g_single_aligns_len = sizeof(g_single_aligns)/sizeof(int[2]);
+
+// Alignment combinations for two-buffer string/memory testing routines. Each
+// row is { src alignment, src or_mask, dst alignment, dst or_mask }.
+static int g_double_aligns[][4] = {
+  // Both buffers at same alignment.
+  { 1, 0, 1, 0 },
+  { 2, 0, 2, 0 },
+  { 4, 0, 4, 0 },
+  { 8, 0, 8, 0 },
+  { 16, 0, 16, 0 },
+  { 32, 0, 32, 0 },
+  { 64, 0, 64, 0 },
+  { 128, 0, 128, 0 },
+
+  // Different word alignments between buffers.
+  { 8, 0, 4, 0 },
+  { 4, 0, 8, 0 },
+  { 16, 0, 4, 0 },
+  { 4, 0, 16, 0 },
+
+  // General unaligned cases.
+  { 4, 0, 4, 1 },
+  { 4, 0, 4, 2 },
+  { 4, 0, 4, 3 },
+
+  { 4, 1, 4, 0 },
+  { 4, 1, 4, 1 },
+  { 4, 1, 4, 2 },
+  { 4, 1, 4, 3 },
+
+  { 4, 2, 4, 0 },
+  { 4, 2, 4, 1 },
+  { 4, 2, 4, 2 },
+  { 4, 2, 4, 3 },
+
+  { 4, 3, 4, 0 },
+  { 4, 3, 4, 1 },
+  { 4, 3, 4, 2 },
+  { 4, 3, 4, 3 },
+
+  { 8, 0, 8, 1 },
+  { 8, 0, 8, 2 },
+  { 8, 0, 8, 3 },
+  { 8, 0, 8, 4 },
+  { 8, 0, 8, 5 },
+  { 8, 0, 8, 6 },
+  { 8, 0, 8, 7 },
+
+  { 8, 1, 8, 0 },
+  { 8, 1, 8, 1 },
+  { 8, 1, 8, 2 },
+  { 8, 1, 8, 3 },
+  { 8, 1, 8, 4 },
+  { 8, 1, 8, 5 },
+  { 8, 1, 8, 6 },
+  { 8, 1, 8, 7 },
+
+  { 8, 2, 8, 0 },
+  { 8, 2, 8, 1 },
+  { 8, 2, 8, 2 },
+  { 8, 2, 8, 3 },
+  { 8, 2, 8, 4 },
+  { 8, 2, 8, 5 },
+  { 8, 2, 8, 6 },
+  { 8, 2, 8, 7 },
+
+  { 8, 3, 8, 0 },
+  { 8, 3, 8, 1 },
+  { 8, 3, 8, 2 },
+  { 8, 3, 8, 3 },
+  { 8, 3, 8, 4 },
+  { 8, 3, 8, 5 },
+  { 8, 3, 8, 6 },
+  { 8, 3, 8, 7 },
+
+  { 8, 4, 8, 0 },
+  { 8, 4, 8, 1 },
+  { 8, 4, 8, 2 },
+  { 8, 4, 8, 3 },
+  { 8, 4, 8, 4 },
+  { 8, 4, 8, 5 },
+  { 8, 4, 8, 6 },
+  { 8, 4, 8, 7 },
+
+  { 8, 5, 8, 0 },
+  { 8, 5, 8, 1 },
+  { 8, 5, 8, 2 },
+  { 8, 5, 8, 3 },
+  { 8, 5, 8, 4 },
+  { 8, 5, 8, 5 },
+  { 8, 5, 8, 6 },
+  { 8, 5, 8, 7 },
+
+  { 8, 6, 8, 0 },
+  { 8, 6, 8, 1 },
+  { 8, 6, 8, 2 },
+  { 8, 6, 8, 3 },
+  { 8, 6, 8, 4 },
+  { 8, 6, 8, 5 },
+  { 8, 6, 8, 6 },
+  { 8, 6, 8, 7 },
+
+  { 8, 7, 8, 0 },
+  { 8, 7, 8, 1 },
+  { 8, 7, 8, 2 },
+  { 8, 7, 8, 3 },
+  { 8, 7, 8, 4 },
+  { 8, 7, 8, 5 },
+  { 8, 7, 8, 6 },
+  { 8, 7, 8, 7 },
+
+  { 128, 1, 128, 4 },
+  { 128, 1, 128, 8 },
+  { 128, 1, 128, 12 },
+  { 128, 1, 128, 16 },
+  { 128, 4, 128, 1 },
+  { 128, 8, 128, 1 },
+  { 128, 12, 128, 1 },
+  { 128, 16, 128, 1 },
+};
+
+static const size_t g_double_aligns_len = sizeof(g_double_aligns)/sizeof(int[4]);
+
+static size_t SetIncrement(size_t len) {
+  // Default length step: 1 below 1KB, 256 below 4KB, and 1024 from 4KB up.
+  if (len < 1024) {
+    return 1;
+  }
+  const size_t coarse_step = (len < 4096) ? 256 : 1024;
+  return coarse_step;
+}
+
+// Return a pointer into the current buffer with the specified alignment.
+static void *GetAlignedPtr(void *orig_ptr, int alignment, int or_mask) {
+  // uintptr_t has the width of a pointer, so the round trip is exact on 32-bit too.
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(orig_ptr);
+  if (alignment > 0) {
+    // Advance to the next multiple of alignment, then force the alignment bit
+    // on so the result is never aligned to anything larger than requested.
+    // Any or_mask bits are set as well, offsetting the pointer further.
+    ptr += alignment - (ptr & (alignment - 1));
+    ptr |= alignment | or_mask;
+  }
+  return reinterpret_cast<void*>(ptr);
+}
+
+static void SetFencepost(uint8_t *buffer) {
+  // Fill the fencepost with the repeating byte pair 0xde, 0xad.
+  for (int i = 0; i < FENCEPOST_LENGTH; i++) {
+    buffer[i] = (i % 2 == 0) ? 0xde : 0xad;
+  }
+}
+
+static void VerifyFencepost(uint8_t *buffer) {
+  // Walk the fencepost one 0xde/0xad pair at a time. The first byte found to
+  // differ from the pattern fails a fatal assertion, which ends the check.
+  const uint8_t first_byte = 0xde;
+  const uint8_t second_byte = 0xad;
+  for (int pair = 0; pair < FENCEPOST_LENGTH; pair += 2) {
+    if (buffer[pair] != first_byte) {
+      ASSERT_EQ(first_byte, buffer[pair]);
+    }
+    if (buffer[pair+1] != second_byte) {
+      ASSERT_EQ(second_byte, buffer[pair+1]);
+    }
+  }
+}
+
+void RunSingleBufferAlignTest(
+    size_t max_test_size, void (*test_func)(uint8_t*, size_t),
+    size_t (*set_incr)(size_t)) {
+  // Calls test_func at every g_single_aligns alignment for lengths from 0 to
+  // max_test_size, stepping by set_incr (SetIncrement when NULL), and checks
+  // that the fenceposts on both sides of the buffer are left intact.
+  if (!set_incr) {
+    set_incr = SetIncrement;
+  }
+  // 3x the test size leaves slack for alignment and fenceposts (assumes a large max_test_size).
+  uint8_t *buf = new uint8_t[3*max_test_size];
+  uint8_t *buf_align;
+  for (size_t i = 0; i < g_single_aligns_len; i++) {
+    size_t incr = 1;
+    for (size_t len = 0; len <= max_test_size; len += incr) {
+      incr = set_incr(len);
+
+      buf_align = reinterpret_cast<uint8_t*>(GetAlignedPtr(
+          buf+FENCEPOST_LENGTH, g_single_aligns[i][0], g_single_aligns[i][1]));
+
+      SetFencepost(&buf_align[-FENCEPOST_LENGTH]);
+      SetFencepost(&buf_align[len]);
+
+      test_func(buf_align, len);
+
+      if (buf_align != buf) {
+        VerifyFencepost(&buf_align[-FENCEPOST_LENGTH]);
+      }
+      VerifyFencepost(&buf_align[len]);
+    }
+  }
+  delete[] buf;  // Allocated with new[]; scalar delete would be undefined behavior.
+}
+
+void RunSrcDstBufferAlignTest(
+    size_t max_test_size, void (*test_func)(uint8_t*, uint8_t*, size_t),
+    size_t (*set_incr)(size_t)) {
+  // Calls test_func at every g_double_aligns src/dst alignment pair for lengths
+  // 0..max_test_size, stepping by set_incr (SetIncrement when NULL); fences dst only.
+  if (!set_incr) {
+    set_incr = SetIncrement;
+  }
+  // Allocate two large buffers for all of the testing.
+  uint8_t* src = new uint8_t[3*max_test_size];
+  uint8_t* dst = new uint8_t[3*max_test_size];
+  uint8_t* src_align;
+  uint8_t* dst_align;
+  for (size_t i = 0; i < g_double_aligns_len; i++) {
+    size_t incr = 1;
+    for (size_t len = 0; len <= max_test_size; len += incr) {
+      incr = set_incr(len);
+
+      src_align =
+          reinterpret_cast<uint8_t*>(GetAlignedPtr(
+              src+FENCEPOST_LENGTH, g_double_aligns[i][0], g_double_aligns[i][1]));
+      dst_align =
+          reinterpret_cast<uint8_t*>(GetAlignedPtr(
+              dst+FENCEPOST_LENGTH, g_double_aligns[i][2], g_double_aligns[i][3]));
+      SetFencepost(&dst_align[-FENCEPOST_LENGTH]);
+      SetFencepost(&dst_align[len]);
+
+      test_func(src_align, dst_align, len);
+
+      if (dst_align != dst) {
+        VerifyFencepost(&dst_align[-FENCEPOST_LENGTH]);
+      }
+      VerifyFencepost(&dst_align[len]);
+    }
+  }
+  delete[] src;  // Both buffers come from new[], so they need the array form.
+  delete[] dst;
+}
+
+void RunSingleBufferOverreadTest(void (*test_func)(uint8_t*, size_t)) {
+  // Calls test_func on buffers of every length below one page, each ending
+  // exactly where an inaccessible page begins, so that any overread faults.
+  const size_t page_bytes = static_cast<size_t>(sysconf(_SC_PAGE_SIZE));
+  uint8_t* region;
+  ASSERT_TRUE(posix_memalign(reinterpret_cast<void**>(&region), page_bytes,
+                             2*page_bytes) == 0);
+  memset(region, 0x23, 2*page_bytes);
+
+  // Revoke all access to the second page.
+  uint8_t* const guard_page = region + page_bytes;
+  ASSERT_TRUE(mprotect(guard_page, page_bytes, PROT_NONE) == 0);
+
+  for (size_t len = 0; len < page_bytes; len++) {
+    test_func(guard_page - len, len);
+  }
+  // Restore access before handing the region back to the allocator.
+  ASSERT_TRUE(mprotect(guard_page, page_bytes, PROT_READ | PROT_WRITE) == 0);
+  free(region);
+}
+
+void RunSrcDstBufferOverreadTest(void (*test_func)(uint8_t*, uint8_t*, size_t)) {
+  // Calls test_func with a src of every length below one page, each src ending
+  // exactly at an unreadable page boundary, so reading past src faults.
+  size_t pagesize = static_cast<size_t>(sysconf(_SC_PAGE_SIZE));
+  uint8_t* memory;
+  ASSERT_TRUE(posix_memalign(reinterpret_cast<void**>(&memory), pagesize,
+                             2*pagesize) == 0);
+  memset(memory, 0x23, 2*pagesize);
+
+  // Make the second page unreadable and unwritable.
+  ASSERT_TRUE(mprotect(&memory[pagesize], pagesize, PROT_NONE) == 0);
+
+  uint8_t* dst = new uint8_t[pagesize];
+  for (size_t i = 0; i < pagesize; i++) {
+    uint8_t* src = &memory[pagesize-i];
+
+    test_func(src, dst, i);
+  }
+  ASSERT_TRUE(mprotect(&memory[pagesize], pagesize, PROT_READ | PROT_WRITE) == 0);
+  free(memory);
+  delete[] dst;  // Allocated with new[]; scalar delete would be undefined behavior.
+}
diff --git a/tests/buffer_tests.h b/tests/buffer_tests.h
new file mode 100644
index 0000000..f8685a2
--- /dev/null
+++ b/tests/buffer_tests.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _BIONIC_TESTS_BUFFER_TESTS_H
+#define _BIONIC_TESTS_BUFFER_TESTS_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+void RunSingleBufferAlignTest(
+    size_t max_test_size, void (*test_func)(uint8_t*, size_t),
+    size_t (*set_incr)(size_t) = NULL);
+
+void RunSrcDstBufferAlignTest(
+    size_t max_test_size, void (*test_func)(uint8_t*, uint8_t*, size_t),
+    size_t (*set_incr)(size_t) = NULL);
+
+void RunSingleBufferOverreadTest(void (*test_func)(uint8_t*, size_t));
+
+void RunSrcDstBufferOverreadTest(void (*test_func)(uint8_t*, uint8_t*, size_t));
+
+#endif // _BIONIC_TESTS_BUFFER_TESTS_H
diff --git a/tests/pthread_test.cpp b/tests/pthread_test.cpp
index 42bd2b9..480e455 100644
--- a/tests/pthread_test.cpp
+++ b/tests/pthread_test.cpp
@@ -20,6 +20,7 @@
 #include <inttypes.h>
 #include <limits.h>
 #include <pthread.h>
+#include <sys/mman.h>
 #include <unistd.h>
 
 TEST(pthread, pthread_key_create) {
@@ -149,22 +150,48 @@
   ASSERT_EQ(EDEADLK, pthread_join(pthread_self(), &result));
 }
 
-#if __BIONIC__ // For some reason, gtest on bionic can cope with this but gtest on glibc can't.
+struct TestBug37410 {
+  pthread_t main_thread;
+  pthread_mutex_t mutex;
 
-static void TestBug37410() {
-  pthread_t t1;
-  ASSERT_EQ(0, pthread_create(&t1, NULL, JoinFn, reinterpret_cast<void*>(pthread_self())));
-  pthread_exit(NULL);
-}
+  static void main() {
+    TestBug37410 data;
+    data.main_thread = pthread_self();
+    ASSERT_EQ(0, pthread_mutex_init(&data.mutex, NULL));
+    ASSERT_EQ(0, pthread_mutex_lock(&data.mutex));
+
+    pthread_t t;
+    ASSERT_EQ(0, pthread_create(&t, NULL, TestBug37410::thread_fn, reinterpret_cast<void*>(&data)));
+
+    // Wait for the thread to be running...
+    ASSERT_EQ(0, pthread_mutex_lock(&data.mutex));
+    ASSERT_EQ(0, pthread_mutex_unlock(&data.mutex));
+
+    // ...and exit.
+    pthread_exit(NULL);
+  }
+
+ private:
+  static void* thread_fn(void* arg) {
+    TestBug37410* data = reinterpret_cast<TestBug37410*>(arg);
+
+    // Let the main thread know we're running.
+    pthread_mutex_unlock(&data->mutex);
+
+    // And wait for the main thread to exit.
+    pthread_join(data->main_thread, NULL);
+
+    return NULL;
+  }
+};
 
 // Even though this isn't really a death test, we have to say "DeathTest" here so gtest knows to
 // run this test (which exits normally) in its own process.
 TEST(pthread_DeathTest, pthread_bug_37410) {
   // http://code.google.com/p/android/issues/detail?id=37410
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
-  ASSERT_EXIT(TestBug37410(), ::testing::ExitedWithCode(0), "");
+  ASSERT_EXIT(TestBug37410::main(), ::testing::ExitedWithCode(0), "");
 }
-#endif
 
 static void* SignalHandlerFn(void* arg) {
   sigset_t wait_set;
@@ -214,11 +241,12 @@
 }
 
 #if __BIONIC__
-extern "C" int  __pthread_clone(void* (*fn)(void*), void* child_stack, int flags, void* arg);
-TEST(pthread, __pthread_clone) {
+extern "C" pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
+TEST(pthread, __bionic_clone) {
+  // Check that our hand-written clone assembler sets errno correctly on failure.
   uintptr_t fake_child_stack[16];
   errno = 0;
-  ASSERT_EQ(-1, __pthread_clone(NULL, &fake_child_stack[0], CLONE_THREAD, NULL));
+  ASSERT_EQ(-1, __bionic_clone(CLONE_THREAD, &fake_child_stack[0], NULL, NULL, NULL, NULL, NULL));
   ASSERT_EQ(EINVAL, errno);
 }
 #endif
@@ -373,6 +401,24 @@
   ASSERT_EQ(0U, reinterpret_cast<uintptr_t>(join_result));
 }
 
+TEST(pthread, pthread_join__race) {
+  // http://b/11693195 --- pthread_join could return before the thread had actually exited.
+  // If the joiner unmapped the thread's stack, that could lead to SIGSEGV in the thread.
+  for (size_t i = 0; i < 1024; ++i) {
+    size_t stack_size = 64*1024;
+    void* stack = mmap(NULL, stack_size, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, stack);  // Fail here rather than hand a bad stack to the thread.
+    pthread_attr_t a;
+    ASSERT_EQ(0, pthread_attr_init(&a));
+    ASSERT_EQ(0, pthread_attr_setstack(&a, stack, stack_size));
+    pthread_t t;
+    ASSERT_EQ(0, pthread_create(&t, &a, IdFn, NULL));
+    ASSERT_EQ(0, pthread_attr_destroy(&a));  // The attr is not needed once the thread exists.
+    ASSERT_EQ(0, pthread_join(t, NULL));
+    ASSERT_EQ(0, munmap(stack, stack_size));
+  }
+}
+
 static void* GetActualGuardSizeFn(void* arg) {
   pthread_attr_t attributes;
   pthread_getattr_np(pthread_self(), &attributes);
diff --git a/tests/stdio_test.cpp b/tests/stdio_test.cpp
index 11bd17f..2002928 100644
--- a/tests/stdio_test.cpp
+++ b/tests/stdio_test.cpp
@@ -103,7 +103,10 @@
   fclose(fp);
   errno = 0;
   ASSERT_EQ(getdelim(&buffer, &buffer_length, ' ', fp), -1);
+  // glibc sometimes doesn't set errno in this particular case.
+#if defined(__BIONIC__)
   ASSERT_EQ(EBADF, errno);
+#endif
 }
 
 TEST(stdio, getline) {
@@ -168,7 +171,10 @@
   fclose(fp);
   errno = 0;
   ASSERT_EQ(getline(&buffer, &buffer_length, fp), -1);
+  // glibc sometimes doesn't set errno in this particular case.
+#if defined(__BIONIC__)
   ASSERT_EQ(EBADF, errno);
+#endif
 }
 
 TEST(stdio, printf_ssize_t) {
diff --git a/tests/stdlib_test.cpp b/tests/stdlib_test.cpp
index e5d7812..fa59c41 100644
--- a/tests/stdlib_test.cpp
+++ b/tests/stdlib_test.cpp
@@ -19,6 +19,7 @@
 #include <errno.h>
 #include <libgen.h>
 #include <limits.h>
+#include <pthread.h>
 #include <stdint.h>
 #include <stdlib.h>
 
@@ -132,3 +133,27 @@
   ASSERT_STREQ("bravo", entries[1].name);
   ASSERT_STREQ("charlie", entries[2].name);
 }
+
+static void* TestBug57421_child(void* arg) {
+  // Block until the main thread (whose pthread_t arrives via arg) has exited.
+  pthread_join(reinterpret_cast<pthread_t>(arg), NULL);
+  // Then read the environment and, if the variable is unset, write it.
+  if (getenv("ENVIRONMENT_VARIABLE") == NULL) {
+    setenv("ENVIRONMENT_VARIABLE", "value", 1);
+  }
+  return NULL;
+}
+
+static void TestBug57421_main() {
+  pthread_t t;
+  ASSERT_EQ(0, pthread_create(&t, NULL, TestBug57421_child, reinterpret_cast<void*>(pthread_self())));
+  pthread_exit(NULL);  // Ends only the calling thread; the child joins it and then uses the environment.
+}
+
+// Even though this isn't really a death test, we have to say "DeathTest" here so gtest knows to
+// run this test (which exits normally) in its own process.
+TEST(stdlib_DeathTest, getenv_after_main_thread_exits) {
+  // https://code.google.com/p/android/issues/detail?id=57421
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+  ASSERT_EXIT(TestBug57421_main(), ::testing::ExitedWithCode(0), "");
+}
diff --git a/tests/string_test.cpp b/tests/string_test.cpp
index ef43f5d..be46dc9 100644
--- a/tests/string_test.cpp
+++ b/tests/string_test.cpp
@@ -20,8 +20,11 @@
 #include <math.h>
 #include <string.h>
 
+#include "buffer_tests.h"
+
 #define KB 1024
 #define SMALL 1*KB
+#define MEDIUM 4*KB
 #define LARGE 64*KB
 
 static int signum(int i) {
@@ -885,3 +888,122 @@
     ASSERT_EQ(0, memcmp(state.ptr1, state.ptr2, state.MAX_LEN));
   }
 }
+
+static void DoMemcpyTest(uint8_t* src, uint8_t* dst, size_t len) {
+  const int fill = (len % 255) + 1;  // Non-zero byte that varies with len.
+  memset(src, fill, len);
+  memset(dst, 0, len);
+  ASSERT_EQ(dst, memcpy(dst, src, len));
+  ASSERT_TRUE(memcmp(src, dst, len) == 0);
+}
+
+TEST(string, memcpy_align) {
+  RunSrcDstBufferAlignTest(LARGE, DoMemcpyTest);
+}
+
+TEST(string, memcpy_overread) {
+  RunSrcDstBufferOverreadTest(DoMemcpyTest);
+}
+
+static void DoMemsetTest(uint8_t* buf, size_t len) {
+  for (uint8_t* p = buf; p != buf + len; ++p) {  // Clear by hand, not with memset.
+    *p = 0;
+  }
+  const int fill = (len % 255) + 1;
+  ASSERT_EQ(buf, memset(buf, fill, len));
+  for (size_t idx = 0; idx < len; idx++) {
+    ASSERT_EQ(fill, buf[idx]);
+  }
+}
+
+TEST(string, memset_align) {
+  RunSingleBufferAlignTest(LARGE, DoMemsetTest);
+}
+
+static void DoStrlenTest(uint8_t* buf, size_t len) {
+  if (len == 0) return;  // No room for even the terminator, so nothing to test.
+  const size_t text_len = len - 1;
+  memset(buf, (32 + (len % 96)), text_len);
+  buf[text_len] = '\0';
+  ASSERT_EQ(text_len, strlen(reinterpret_cast<char*>(buf)));
+}
+
+TEST(string, strlen_align) {
+  RunSingleBufferAlignTest(LARGE, DoStrlenTest);
+}
+
+TEST(string, strlen_overread) {
+  RunSingleBufferOverreadTest(DoStrlenTest);
+}
+
+static void DoStrcpyTest(uint8_t* src, uint8_t* dst, size_t len) {
+  if (len == 0) return;  // A C string needs at least the terminator byte.
+  // Fill src with a non-zero byte in 32..127 chosen from len, then terminate it.
+  memset(src, (32 + (len % 96)), len - 1);
+  src[len-1] = '\0';
+  memset(dst, 0, len);
+  char* dst_str = reinterpret_cast<char*>(dst);
+  ASSERT_EQ(dst, reinterpret_cast<uint8_t*>(strcpy(dst_str, reinterpret_cast<char*>(src))));
+  ASSERT_TRUE(memcmp(src, dst, len) == 0);
+}
+
+TEST(string, strcpy_align) {
+  RunSrcDstBufferAlignTest(LARGE, DoStrcpyTest);
+}
+
+TEST(string, strcpy_overread) {
+  RunSrcDstBufferOverreadTest(DoStrcpyTest);
+}
+
+// Use our own incrementer to cut down on the total number of calls.
+static size_t StrcatSetIncrement(size_t len) {
+  // The step is the largest of 256, 1024 and 4096 that len has reached, else 1.
+  if (len < 256) {
+    return 1;
+  }
+  if (len < 1024) {
+    return 256;
+  }
+  return (len < 4096) ? 1024 : 4096;
+}
+
+#define STRCAT_DST_LEN  128
+
+static void DoStrcatTest(uint8_t* src, uint8_t* dst, size_t len) {
+  if (len >= 1) {
+    int value = 32 + (len % 96);
+    memset(src, value, len - 1);
+    src[len-1] = '\0';
+
+    if (len >= STRCAT_DST_LEN) {
+      // Reference copy of the dst prefix, for a quick compare after each strcat.
+      uint8_t cmp_buf[STRCAT_DST_LEN];
+      // Make sure dst string contains a different value than the src string.
+      int value2 = 32 + (value + 2) % 96;
+      memset(cmp_buf, value2, sizeof(cmp_buf));
+
+      for (size_t i = 1; i <= STRCAT_DST_LEN; i++) {
+        memset(dst, value2, i-1);
+        memset(dst+i-1, 0, len-i);
+        src[len-i] = '\0';  // Shorten src so that dst prefix + src + NUL is exactly len bytes.
+        ASSERT_EQ(dst, reinterpret_cast<uint8_t*>(strcat(reinterpret_cast<char*>(dst),
+                                                         reinterpret_cast<char*>(src))));
+        ASSERT_TRUE(memcmp(dst, cmp_buf, i-1) == 0);
+        ASSERT_TRUE(memcmp(src, dst+i-1, len-i+1) == 0);
+      }
+    } else {
+      dst[0] = '\0';  // Empty dst: the result must equal src, terminator included.
+      ASSERT_EQ(dst, reinterpret_cast<uint8_t*>(strcat(reinterpret_cast<char*>(dst),
+                                                       reinterpret_cast<char*>(src))));
+      ASSERT_TRUE(memcmp(src, dst, len) == 0);
+    }
+  }
+}
+
+TEST(string, strcat_align) {
+  RunSrcDstBufferAlignTest(MEDIUM, DoStrcatTest, StrcatSetIncrement);
+}
+
+TEST(string, strcat_overread) {
+  RunSrcDstBufferOverreadTest(DoStrcatTest);
+}