Don't use __thread in __cxa_thread_finalize().

Currently we use __thread variable to store thread_local_dtors,
which makes tsan test fork_atexit.cc hang. The problem is as below:
The main thread creates a worker thread, the worker thread calls
pthread_exit() -> __cxa_thread_finalize() -> __emutls_get_address()
-> pthread_once(emutls_init) -> emutls_init().
Then the main thread calls fork(), the child process cals
exit() -> __cxa_thread_finalize() -> __emutls_get_address()
-> pthread_once(emutls_init).
So the child process is waiting for pthread_once(emutls_init)
to finish which will never occur.

It might be the test's fault because POSIX standard says if a
multi-threaded process calls fork(), the new process may only
execute async-signal-safe operations until exec functions are
called. And exit() is not async-signal-safe. But we can make
bionic more reliable by not using __thread in
__cxa_thread_finalize().

Bug: 25392375
Change-Id: Ife403dd7379dad8ddf1859c348c1c0adea07afb3
diff --git a/libc/bionic/__cxa_thread_atexit_impl.cpp b/libc/bionic/__cxa_thread_atexit_impl.cpp
index 0e427d3..0e903b9 100644
--- a/libc/bionic/__cxa_thread_atexit_impl.cpp
+++ b/libc/bionic/__cxa_thread_atexit_impl.cpp
@@ -15,6 +15,8 @@
  */
 #include <sys/cdefs.h>
 
+#include "pthread_internal.h"
+
 struct thread_local_dtor {
   void (*func) (void *);
   void *arg;
@@ -22,25 +24,24 @@
   thread_local_dtor* next;
 };
 
-static __thread thread_local_dtor* thread_local_dtors = nullptr;
-
 extern "C" int __cxa_thread_atexit_impl(void (*func) (void *), void *arg, void *dso_handle) {
   thread_local_dtor* dtor = new thread_local_dtor();
 
   dtor->func = func;
   dtor->arg = arg;
   dtor->dso_handle = dso_handle;
-  dtor->next = thread_local_dtors;
 
-  thread_local_dtors = dtor;
-
+  pthread_internal_t* thread = __get_thread();
+  dtor->next = thread->thread_local_dtors;
+  thread->thread_local_dtors = dtor;
   return 0;
 }
 
 extern "C" __LIBC_HIDDEN__ void __cxa_thread_finalize() {
-  while (thread_local_dtors != nullptr) {
-    thread_local_dtor* current = thread_local_dtors;
-    thread_local_dtors = current->next;
+  pthread_internal_t* thread = __get_thread();
+  while (thread->thread_local_dtors != nullptr) {
+    thread_local_dtor* current = thread->thread_local_dtors;
+    thread->thread_local_dtors = current->next;
 
     current->func(current->arg);
     delete current;