diff --git a/libc/bionic/fork.cpp b/libc/bionic/fork.cpp
index 339a0e8..f7d1c11 100644
--- a/libc/bionic/fork.cpp
+++ b/libc/bionic/fork.cpp
@@ -41,7 +41,12 @@
   __timer_table_start_stop(1);
   __bionic_atfork_run_prepare();
 
-  int result = __clone(SIGCHLD, NULL, NULL, NULL, NULL);
+  pthread_internal_t* self = __get_thread();
+#if defined(__x86_64__)
+  int result = __clone(CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, NULL, NULL, &(self->tid), NULL);
+#else
+  int result = __clone(CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, NULL, NULL, NULL, &(self->tid));
+#endif
   if (result != 0) {  // Not a child process.
     __timer_table_start_stop(0);
     __bionic_atfork_run_parent();
diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp
index 3e092ae..130c287 100644
--- a/libc/bionic/libc_init_common.cpp
+++ b/libc/bionic/libc_init_common.cpp
@@ -50,6 +50,7 @@
 extern "C" uintptr_t __get_sp(void);
 extern "C" int __system_properties_init(void);
 extern "C" int __set_tls(void* ptr);
+extern "C" int __set_tid_address(int* tid_address);
 
 // Not public, but well-known in the BSDs.
 const char* __progname;
@@ -90,17 +91,24 @@
   uintptr_t stack_bottom = stack_top - stack_size;
 
   static void* tls[BIONIC_TLS_SLOTS];
-  static pthread_internal_t thread;
-  thread.tid = gettid();
-  thread.tls = tls;
-  pthread_attr_init(&thread.attr);
-  pthread_attr_setstack(&thread.attr, (void*) stack_bottom, stack_size);
-  _init_thread(&thread, false);
-  __init_tls(&thread);
-  __set_tls(thread.tls);
+  static pthread_internal_t main_thread;
+  main_thread.tls = tls;
+
+  // Tell the kernel to clear our tid field when we exit, so we're like any other pthread.
+  main_thread.tid = __set_tid_address(&main_thread.tid);
+
+  // We already have a stack, and we don't want to free it up on exit (because things like
+  // environment variables with global scope live on it).
+  pthread_attr_init(&main_thread.attr);
+  pthread_attr_setstack(&main_thread.attr, (void*) stack_bottom, stack_size);
+  main_thread.attr.flags = PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK;
+
+  _init_thread(&main_thread, false);
+  __init_tls(&main_thread);
+  __set_tls(main_thread.tls);
   tls[TLS_SLOT_BIONIC_PREINIT] = &args;
 
-  __init_alternate_signal_stack(&thread);
+  __init_alternate_signal_stack(&main_thread);
 }
 
 void __libc_init_common(KernelArgumentBlock& args) {
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 6ed01ff..dde5ed7 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -97,7 +97,6 @@
     }
   }
 
-  pthread_cond_init(&thread->join_cond, NULL);
   thread->cleanup_stack = NULL;
 
   if (add_to_thread_list) {
@@ -215,17 +214,22 @@
   // the new thread.
   pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
   pthread_mutex_init(start_mutex, NULL);
-  ScopedPthreadMutexLocker start_locker(start_mutex);
+  pthread_mutex_lock(start_mutex);
 
   thread->tls[TLS_SLOT_THREAD_ID] = thread;
 
   thread->start_routine = start_routine;
   thread->start_routine_arg = arg;
 
-  int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | CLONE_SETTLS;
-  int tid = __bionic_clone(flags, child_stack, NULL, thread->tls, NULL, __pthread_start, thread);
-  if (tid < 0) {
+  int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+      CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
+  int rc = __bionic_clone(flags, child_stack, &(thread->tid), thread->tls, &(thread->tid), __pthread_start, thread);
+  if (rc == -1) {
     int clone_errno = errno;
+    // We don't have to unlock the mutex at all because clone(2) failed so there's no child waiting to
+    // be unblocked, but we're about to unmap the memory the mutex is stored in, so this serves as a
+    // reminder that you can't rewrite this function to use a ScopedPthreadMutexLocker.
+    pthread_mutex_unlock(start_mutex);
     if ((thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) == 0) {
       munmap(thread->attr.stack_base, thread->attr.stack_size);
     }
@@ -234,12 +238,10 @@
     return clone_errno;
   }
 
-  thread->tid = tid;
-
   int init_errno = _init_thread(thread, true);
   if (init_errno != 0) {
-    // Mark the thread detached and let its __pthread_start run to
-    // completion. (It'll just exit immediately, cleaning up its resources.)
+    // Mark the thread detached and let its __pthread_start run to completion.
+    // It'll check this flag and exit immediately, cleaning up its resources.
     thread->internal_flags |= PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED;
     thread->attr.flags |= PTHREAD_ATTR_FLAG_DETACHED;
     return init_errno;
@@ -251,8 +253,9 @@
     _thread_created_hook(thread->tid);
   }
 
-  // Publish the pthread_t and let the thread run.
-  *thread_out = (pthread_t) thread;
+  // Publish the pthread_t and unlock the mutex to let the new thread start running.
+  *thread_out = reinterpret_cast<pthread_t>(thread);
+  pthread_mutex_unlock(start_mutex);
 
   return 0;
 }
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index cc86271..22c2c3c 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -57,8 +57,9 @@
   }
 }
 
-void pthread_exit(void* retval) {
+void pthread_exit(void* return_value) {
   pthread_internal_t* thread = __get_thread();
+  thread->return_value = return_value;
 
   // Call the cleanup handlers first.
   while (thread->cleanup_stack) {
@@ -90,10 +91,9 @@
   size_t stack_size = thread->attr.stack_size;
   bool user_allocated_stack = ((thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) != 0);
 
-  // If the thread is detached, destroy the pthread_internal_t,
-  // otherwise keep it in memory and signal any joiners.
   pthread_mutex_lock(&gThreadListLock);
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) {
+  if ((thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) != 0) {
+    // The thread is detached, so we can destroy the pthread_internal_t.
     _pthread_internal_remove_locked(thread);
   } else {
     // Make sure that the pthread_internal_t doesn't have stale pointers to a stack that
@@ -103,15 +103,8 @@
       thread->attr.stack_size = 0;
       thread->tls = NULL;
     }
-
-    // Indicate that the thread has exited for joining threads.
-    thread->attr.flags |= PTHREAD_ATTR_FLAG_ZOMBIE;
-    thread->return_value = retval;
-
-    // Signal the joining thread if present.
-    if (thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) {
-      pthread_cond_signal(&thread->join_cond);
-    }
+    // pthread_join is responsible for destroying the pthread_internal_t for non-detached threads.
+    // The kernel will futex_wake on the pthread_internal_t::tid field to wake pthread_join.
   }
   pthread_mutex_unlock(&gThreadListLock);
 
@@ -131,6 +124,6 @@
     _exit_with_stack_teardown(stack_base, stack_size, 0);
   }
 
-  /* NOTREACHED, but we told the compiler this function is noreturn, and it doesn't believe us. */
+  // NOTREACHED, but we told the compiler this function is noreturn, and it doesn't believe us.
   abort();
 }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index d8ad544..de1ef26 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -31,28 +31,31 @@
 #include <pthread.h>
 
 struct pthread_internal_t {
-    struct pthread_internal_t*  next;
-    struct pthread_internal_t*  prev;
-    pthread_attr_t              attr;
-    pid_t                       tid;
-    bool                        allocated_on_heap;
-    pthread_cond_t              join_cond;
-    void*                       return_value;
-    int                         internal_flags;
-    __pthread_cleanup_t*        cleanup_stack;
-    void**                      tls;         /* thread-local storage area */
+  struct pthread_internal_t* next; /* List links; presumably the thread list guarded by gThreadListLock -- confirm. */
+  struct pthread_internal_t* prev;
 
-    void* (*start_routine)(void*);
-    void* start_routine_arg;
+  pid_t tid; /* Kernel thread id. Registered as the clear-on-exit tid address, so pthread_join can futex-wait for it to become 0. */
 
-    void* alternate_signal_stack;
+  void** tls; /* Thread-local storage area. */
 
-    /*
-     * The dynamic linker implements dlerror(3), which makes it hard for us to implement this
-     * per-thread buffer by simply using malloc(3) and free(3).
-     */
+  pthread_attr_t attr; /* attr.flags carries the PTHREAD_ATTR_FLAG_* bits (detached, joined, user-allocated stack). */
+  bool allocated_on_heap; /* TODO: move this into attr.flags? */
+  int internal_flags; /* PTHREAD_INTERNAL_FLAG_* bits. TODO: move this into attr.flags? */
+
+  __pthread_cleanup_t* cleanup_stack; /* Cleanup handlers; pthread_exit runs these before tearing the thread down. */
+
+  void* (*start_routine)(void*); /* Entry point handed to pthread_create. */
+  void* start_routine_arg; /* Argument handed to pthread_create for start_routine. */
+  void* return_value; /* Stored by pthread_exit; copied out to the caller by pthread_join. */
+
+  void* alternate_signal_stack;
+
+  /*
+   * The dynamic linker implements dlerror(3), which makes it hard for us to implement this
+   * per-thread buffer by simply using malloc(3) and free(3).
+   */
 #define __BIONIC_DLERROR_BUFFER_SIZE 512
-    char dlerror_buffer[__BIONIC_DLERROR_BUFFER_SIZE];
+  char dlerror_buffer[__BIONIC_DLERROR_BUFFER_SIZE];
 };
 
 __LIBC_HIDDEN__ int _init_thread(pthread_internal_t* thread, bool add_to_thread_list);
@@ -73,9 +76,6 @@
 /* Has the thread been joined by another thread? */
 #define PTHREAD_ATTR_FLAG_JOINED 0x00000004
 
-/* Has the thread already exited but not been joined? */
-#define PTHREAD_ATTR_FLAG_ZOMBIE 0x00000008
-
 #define PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED 1
 
 /*
diff --git a/libc/bionic/pthread_join.cpp b/libc/bionic/pthread_join.cpp
index 7e022c2..0cbed62 100644
--- a/libc/bionic/pthread_join.cpp
+++ b/libc/bionic/pthread_join.cpp
@@ -28,33 +28,50 @@
 
 #include <errno.h>
 
+#include "private/bionic_futex.h"
 #include "pthread_accessor.h"
 
-int pthread_join(pthread_t t, void** ret_val) {
+int pthread_join(pthread_t t, void** return_value) {
   if (t == pthread_self()) {
     return EDEADLK;
   }
 
-  pthread_accessor thread(t);
-  if (thread.get() == NULL) {
+  pid_t tid;
+  volatile int* tid_ptr;
+  {
+    pthread_accessor thread(t);
+    if (thread.get() == NULL) {
       return ESRCH;
+    }
+
+    if ((thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) != 0) {
+      return EINVAL;
+    }
+
+    if ((thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) != 0) {
+      return EINVAL;
+    }
+
+    // Okay, looks like we can signal our intention to join.
+    thread->attr.flags |= PTHREAD_ATTR_FLAG_JOINED;
+    tid = thread->tid;
+    tid_ptr = &thread->tid;
   }
 
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) {
-    return EINVAL;
+  // We set the PTHREAD_ATTR_FLAG_JOINED flag with the lock held,
+  // so no one is going to remove this thread except us.
+
+  // Wait for the thread to actually exit, if it hasn't already.
+  while (*tid_ptr != 0) {
+    __futex_wait(tid_ptr, tid, NULL);
   }
 
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) {
-    return EINVAL;
-  }
+  // Take the lock again so we can pull the thread's return value
+  // and remove the thread from the list.
+  pthread_accessor thread(t);
 
-  // Signal our intention to join, and wait for the thread to exit.
-  thread->attr.flags |= PTHREAD_ATTR_FLAG_JOINED;
-  while ((thread->attr.flags & PTHREAD_ATTR_FLAG_ZOMBIE) == 0) {
-    pthread_cond_wait(&thread->join_cond, &gThreadListLock);
-  }
-  if (ret_val) {
-    *ret_val = thread->return_value;
+  if (return_value) {
+    *return_value = thread->return_value;
   }
 
   _pthread_internal_remove_locked(thread.get());
diff --git a/libc/bionic/pthread_key.cpp b/libc/bionic/pthread_key.cpp
index f2f4d20..6cc68af 100644
--- a/libc/bionic/pthread_key.cpp
+++ b/libc/bionic/pthread_key.cpp
@@ -218,7 +218,7 @@
     // startup trampoline (__pthread_start) hasn't been run yet by the
     // scheduler. t->tls will also be NULL after a thread's stack has been
     // unmapped but before the ongoing pthread_join() is finished.
-    if ((t->attr.flags & PTHREAD_ATTR_FLAG_ZOMBIE) || t->tls == NULL) {
+    if (t->tid == 0 || t->tls == NULL) {
       continue;
     }
 
