[MTE] fix missing thread id from stack_mte_ring mapping

We would use thread->tid to name the mapping before it was set, so it
would always be zero.

Bug: 378140560
Change-Id: Id30c22b7e9d0155c79579b097c81e27f662b5d6d
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index ba20c51..54bfa20 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -351,15 +351,20 @@
 
 extern "C" int __rt_sigprocmask(int, const sigset64_t*, sigset64_t*, size_t);
 
-__attribute__((no_sanitize("hwaddress")))
+__attribute__((no_sanitize("hwaddress", "memtag")))
 #if defined(__aarch64__)
 // This function doesn't return, but it does appear in stack traces. Avoid using return PAC in this
 // function because we may end up resetting IA, which may confuse unwinders due to mismatching keys.
 __attribute__((target("branch-protection=bti")))
 #endif
-static int __pthread_start(void* arg) {
+static int
+__pthread_start(void* arg) {
   pthread_internal_t* thread = reinterpret_cast<pthread_internal_t*>(arg);
-
+#if defined(__aarch64__)
+  if (thread->should_allocate_stack_mte_ringbuffer) {
+    thread->bionic_tcb->tls_slot(TLS_SLOT_STACK_MTE) = __allocate_stack_mte_ringbuffer(0, thread);
+  }
+#endif
   __hwasan_thread_enter();
 
   // Wait for our creating thread to release us. This lets it have time to
@@ -450,9 +455,9 @@
 // This has to be done under g_thread_creation_lock or g_thread_list_lock to avoid racing with
 // __pthread_internal_remap_stack_with_mte.
 #ifdef __aarch64__
-  if (__libc_memtag_stack_abi) {
-    tcb->tls_slot(TLS_SLOT_STACK_MTE) = __allocate_stack_mte_ringbuffer(0, thread);
-  }
+  thread->should_allocate_stack_mte_ringbuffer = __libc_memtag_stack_abi;
+#else
+  thread->should_allocate_stack_mte_ringbuffer = false;
 #endif
 
   sigset64_t block_all_mask;
diff --git a/libc/bionic/pthread_internal.cpp b/libc/bionic/pthread_internal.cpp
index bec9703..c6426ed 100644
--- a/libc/bionic/pthread_internal.cpp
+++ b/libc/bionic/pthread_internal.cpp
@@ -208,7 +208,10 @@
   __libc_memtag_stack_abi = true;
 
   for (pthread_internal_t* t = g_thread_list; t != nullptr; t = t->next) {
-    if (t->terminating) continue;
+    // should_allocate_stack_mte_ringbuffer indicates the thread is already
+    // aware that this process requires stack MTE, and will allocate the
+    // ring buffer in __pthread_start.
+    if (t->terminating || t->should_allocate_stack_mte_ringbuffer) continue;
     t->bionic_tcb->tls_slot(TLS_SLOT_STACK_MTE) =
         __allocate_stack_mte_ringbuffer(0, t->is_main() ? nullptr : t);
   }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 5db42ab..cbaa9a6 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -181,6 +181,7 @@
 
   bionic_tcb* bionic_tcb;
   char stack_mte_ringbuffer_vma_name_buffer[32];
+  bool should_allocate_stack_mte_ringbuffer;
 
   bool is_main() { return start_routine == nullptr; }
 };