Reapply "[MTE] allocate ring buffer for stack history"

This reverts commit 45b29f13bbefd1edc1d16a15bb4c1f8e8d8068fc.

The observed slowdown was actually an artifact of stack alignment.

Change-Id: I3af8af33c624491124bfed07e869b4330adfacb3
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 5bd4f16..a8d09eb 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -65,6 +65,7 @@
 }
 
 void __init_bionic_tls_ptrs(bionic_tcb* tcb, bionic_tls* tls) {
+  tcb->thread()->bionic_tcb = tcb;
   tcb->thread()->bionic_tls = tls;
   tcb->tls_slot(TLS_SLOT_BIONIC_TLS) = tls;
 }
@@ -443,6 +444,14 @@
 
   ScopedReadLock locker(&g_thread_creation_lock);
 
+// This has to be done under g_thread_creation_lock or g_thread_list_lock to avoid racing with
+// __pthread_internal_remap_stack_with_mte.
+#ifdef __aarch64__
+  if (__libc_memtag_stack_abi) {
+    tcb->tls_slot(TLS_SLOT_STACK_MTE) = __allocate_stack_mte_ringbuffer(0, thread);
+  }
+#endif
+
   sigset64_t block_all_mask;
   sigfillset64(&block_all_mask);
   __rt_sigprocmask(SIG_SETMASK, &block_all_mask, &thread->start_mask, sizeof(thread->start_mask));