diff --git a/libc/Android.bp b/libc/Android.bp
index 2e3cedb..74fd22b 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -831,8 +831,9 @@
 cc_library_static {
     defaults: ["libc_defaults"],
     srcs: [
-        // The fork implementation depends on pthread data, so we can't include
-        // it in libc_ndk.a.
+        // The following implementations depend on pthread data, so we can't
+        // include them in libc_ndk.a.
+        "bionic/__cxa_thread_atexit_impl.cpp",
         "bionic/fork.cpp",
 
         // The data that backs getauxval is initialized in the libc init
@@ -1454,22 +1455,6 @@
     name: "libc_bionic_ndk",
 }
 
-cc_library_static {
-    name: "libc_thread_atexit_impl",
-    defaults: ["libc_defaults"],
-    srcs: ["bionic/__cxa_thread_atexit_impl.cpp"],
-    cflags: ["-Wframe-larger-than=2048"],
-    cppflags: ["-Wold-style-cast"],
-    include_dirs: ["bionic/libstdc++/include"],
-
-    arch: {
-        arm64: {
-            // b/25662915, clang compiled __cxa_thread_atexit_impl.cpp still failed.
-            clang: false,
-        },
-    },
-}
-
 // ========================================================
 // libc_pthread.a - pthreads parts that previously lived in
 // libc_bionic.a. Relocated to their own library because
@@ -1665,7 +1650,6 @@
         "libc_pthread",
         "libc_stack_protector",
         "libc_syscalls",
-        "libc_thread_atexit_impl",
         "libc_tzcode",
     ],
 
@@ -1946,30 +1930,24 @@
     arch: {
         arm: {
             local_include_dirs: ["arch-arm/include"],
-            cflags: ["-mthumb-interwork"],
         },
         arm64: {
             local_include_dirs: ["arch-arm64/include"],
         },
         mips: {
             local_include_dirs: ["arch-mips/include"],
-            ldflags: ["-melf32ltsmip"],
         },
         mips64: {
             local_include_dirs: ["arch-mips64/include"],
-            ldflags: ["-melf64ltsmip"],
         },
         x86: {
-            cflags: ["-m32"],
-            ldflags: ["-melf_i386"],
             local_include_dirs: ["arch-x86/include"],
         },
         x86_64: {
-            cflags: ["-m64"],
-            ldflags: ["-melf_x86_64"],
             local_include_dirs: ["arch-x86_64/include"],
         },
     },
+    clang: false,
 }
 
 cc_defaults {
diff --git a/libc/Android.mk b/libc/Android.mk
index 2d97f35..4182505 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -243,9 +243,11 @@
 
 libc_bionic_src_files :=
 
-# The fork implementation depends on pthread data, so we can't include it in
-# libc_ndk.a.
-libc_bionic_src_files += bionic/fork.cpp
+# The following implementations depend on pthread data, so we can't include
+# them in libc_ndk.a.
+libc_bionic_src_files += \
+    bionic/__cxa_thread_atexit_impl.cpp \
+    bionic/fork.cpp \
 
 # The data that backs getauxval is initialized in the libc init functions which
 # are invoked by the linker. If this file is included in libc_ndk.a, only one of
@@ -586,9 +588,6 @@
     bionic/pthread_sigmask.cpp \
     bionic/pthread_spinlock.cpp \
 
-libc_thread_atexit_impl_src_files := \
-    bionic/__cxa_thread_atexit_impl.cpp \
-
 libc_arch_static_src_files := \
     bionic/dl_iterate_phdr_static.cpp \
 
@@ -1045,24 +1044,6 @@
 $(eval $(call patch-up-arch-specific-flags,LOCAL_SRC_FILES,libc_bionic_ndk_src_files))
 include $(BUILD_STATIC_LIBRARY)
 
-include $(CLEAR_VARS)
-LOCAL_SRC_FILES := $(libc_thread_atexit_impl_src_files)
-LOCAL_CFLAGS := $(libc_common_cflags) -Wframe-larger-than=2048
-
-LOCAL_CONLYFLAGS := $(libc_common_conlyflags)
-LOCAL_CPPFLAGS := $(libc_common_cppflags) -Wold-style-cast
-LOCAL_C_INCLUDES := $(libc_common_c_includes)
-LOCAL_MODULE := libc_thread_atexit_impl
-LOCAL_ADDITIONAL_DEPENDENCIES := $(libc_common_additional_dependencies)
-LOCAL_CXX_STL := none
-LOCAL_SYSTEM_SHARED_LIBRARIES :=
-LOCAL_SANITIZE := never
-LOCAL_NATIVE_COVERAGE := $(bionic_coverage)
-
-# b/25662915, clang compiled __cxa_thread_atexit_impl.cpp still failed.
-LOCAL_CLANG_arm64 := false
-
-include $(BUILD_STATIC_LIBRARY)
 
 # ========================================================
 # libc_pthread.a - pthreads parts that previously lived in
@@ -1260,7 +1241,6 @@
     libc_pthread \
     libc_stack_protector \
     libc_syscalls \
-    libc_thread_atexit_impl \
     libc_tzcode \
 
 LOCAL_WHOLE_STATIC_LIBRARIES_arm := libc_aeabi
diff --git a/libc/bionic/__cxa_thread_atexit_impl.cpp b/libc/bionic/__cxa_thread_atexit_impl.cpp
index 0e427d3..0e903b9 100644
--- a/libc/bionic/__cxa_thread_atexit_impl.cpp
+++ b/libc/bionic/__cxa_thread_atexit_impl.cpp
@@ -15,6 +15,8 @@
  */
 #include <sys/cdefs.h>
 
+#include "pthread_internal.h"
+
 struct thread_local_dtor {
   void (*func) (void *);
   void *arg;
@@ -22,25 +24,24 @@
   thread_local_dtor* next;
 };
 
-static __thread thread_local_dtor* thread_local_dtors = nullptr;
-
 extern "C" int __cxa_thread_atexit_impl(void (*func) (void *), void *arg, void *dso_handle) {
   thread_local_dtor* dtor = new thread_local_dtor();
 
   dtor->func = func;
   dtor->arg = arg;
   dtor->dso_handle = dso_handle;
-  dtor->next = thread_local_dtors;
 
-  thread_local_dtors = dtor;
-
+  pthread_internal_t* thread = __get_thread();
+  dtor->next = thread->thread_local_dtors;
+  thread->thread_local_dtors = dtor;
   return 0;
 }
 
 extern "C" __LIBC_HIDDEN__ void __cxa_thread_finalize() {
-  while (thread_local_dtors != nullptr) {
-    thread_local_dtor* current = thread_local_dtors;
-    thread_local_dtors = current->next;
+  pthread_internal_t* thread = __get_thread();
+  while (thread->thread_local_dtors != nullptr) {
+    thread_local_dtor* current = thread->thread_local_dtors;
+    thread->thread_local_dtors = current->next;
 
     current->func(current->arg);
     delete current;
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index d5d62a7..f96e9d2 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -52,6 +52,8 @@
   THREAD_DETACHED
 };
 
+struct thread_local_dtor;
+
 struct pthread_internal_t {
   struct pthread_internal_t* next;
   struct pthread_internal_t* prev;
@@ -94,6 +96,8 @@
 
   size_t mmap_size;
 
+  thread_local_dtor* thread_local_dtors;
+
   void* tls[BIONIC_TLS_SLOTS];
 
   pthread_key_data_t key_data[BIONIC_PTHREAD_KEY_COUNT];
diff --git a/tests/pthread_test.cpp b/tests/pthread_test.cpp
index f6d3501..27d992b 100755
--- a/tests/pthread_test.cpp
+++ b/tests/pthread_test.cpp
@@ -159,7 +159,7 @@
   pthread_key_t key;
   ASSERT_EQ(0, pthread_key_create(&key, NULL));
 
-  size_t stack_size = 128 * 1024;
+  size_t stack_size = 640 * 1024;
   void* stack = mmap(NULL, stack_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
   ASSERT_NE(MAP_FAILED, stack);
   memset(stack, 0xff, stack_size);
@@ -217,13 +217,13 @@
     while (spin_flag_) {}
     return NULL;
   }
-  static volatile bool spin_flag_;
+  static std::atomic<bool> spin_flag_;
 };
 
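-// It doesn't matter if spin_flag_ is used in several tests,
-// because it is always set to false after each test. Each thread
-// loops on spin_flag_ can find it becomes false at some time.
+// It doesn't matter that spin_flag_ is shared by several tests,
+// because it is always set to false at the end of each test, so every
+// thread spinning on spin_flag_ will eventually see it become false.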
-volatile bool SpinFunctionHelper::spin_flag_ = false;
+std::atomic<bool> SpinFunctionHelper::spin_flag_(false);
 
 static void* JoinFn(void* arg) {
   return reinterpret_cast<void*>(pthread_join(reinterpret_cast<pthread_t>(arg), NULL));
@@ -416,6 +416,8 @@
   pthread_t t1;
   ASSERT_EQ(0, pthread_create(&t1, NULL, spinhelper.GetFunction(), NULL));
   ASSERT_EQ(0, pthread_setname_np(t1, "short 2"));
+  spinhelper.UnSpin();
+  ASSERT_EQ(0, pthread_join(t1, nullptr));
 }
 
 TEST(pthread, pthread_setname_np__no_such_thread) {
@@ -466,6 +468,8 @@
   ASSERT_EQ(0, pthread_getcpuclockid(t, &c));
   timespec ts;
   ASSERT_EQ(0, clock_gettime(c, &ts));
+  spinhelper.UnSpin();
+  ASSERT_EQ(0, pthread_join(t, nullptr));
 }
 
 TEST(pthread, pthread_getcpuclockid__no_such_thread) {
@@ -534,7 +538,7 @@
   // http://b/11693195 --- pthread_join could return before the thread had actually exited.
   // If the joiner unmapped the thread's stack, that could lead to SIGSEGV in the thread.
   for (size_t i = 0; i < 1024; ++i) {
-    size_t stack_size = 64*1024;
+    size_t stack_size = 640*1024;
     void* stack = mmap(NULL, stack_size, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
 
     pthread_attr_t a;
