Implement dynamic TLS accesses and allocation

Initialize each thread's DTV to a statically-allocated, zeroed DTV. Allocate
the DTV and each ELF module's TLS segment on demand in __tls_get_addr. Use a
generation counter, incremented in the linker, to signal when threads should
update/reallocate their DTV objects.

A generation count of 0 always indicates the constant zero DTV.
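
Roughly, the lookup splits into a generation-keyed fast path and a locked
slow path. The following is a condensed, non-compilable paraphrase of the
__tls_get_addr added in bionic_elf_tls.cpp below (names as in the patch):

  void* tls_get_addr(const TlsIndex* ti) {
    TlsDtv* dtv = __get_tcb_dtv(__get_bionic_tcb());
    // Fast path: the DTV is current and this module's block already exists.
    if (atomic_load(&__libc_tls_generation_copy) == dtv->generation) {
      void* mod_ptr = dtv->modules[__tls_module_id_to_idx(ti->module_id)];
      if (mod_ptr != nullptr) return static_cast<char*>(mod_ptr) + ti->offset;
    }
    // Slow path: block signals, take the write lock, update/grow the DTV,
    // and allocate this module's TLS block on first use.
    return tls_get_addr_slow_path(ti);
  }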

Once a DTV is allocated, it isn't freed until the thread exits, because a
signal handler could interrupt the fast path of __tls_get_addr between loading
the DTV slot and reading a field of the DTV. Instead, Bionic keeps a linked
list of a thread's DTV objects and frees them all at thread exit.
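
Condensed from the hunks below: update_tls_dtv chains the old DTV onto the
new one via TlsDtv::next, and __free_dynamic_tls walks that chain at thread
exit:

  // In update_tls_dtv: grow by chaining rather than freeing.
  new_dtv->next = old_dtv;
  __set_tcb_dtv(tcb, new_dtv);

  // In __free_dynamic_tls: walk the chain, stopping at the static zero DTV
  // (whose generation is kTlsGenerationNone).
  while (dtv->generation != kTlsGenerationNone) {
    TlsDtv* next = dtv->next;
    allocator.free(dtv);
    dtv = next;
  }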

Dynamic TLS memory is allocated using a BionicAllocator instance in
libc_shared_globals. For async-signal safety, access to the shared
linker/libc state is protected by first blocking signals and then acquiring
the reader-writer lock TlsModules::rwlock. A write lock is needed to allocate
or free memory.
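
The slow path therefore follows this locking pattern (excerpted from
tls_get_addr_slow_path below):

  ScopedSignalBlocker ssb;                  // block signals first
  ScopedWriteLock locker(&modules.rwlock);  // write lock: the allocator may be used
  update_tls_dtv(tcb);                      // refresh/reallocate the DTV if stale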

In pthread_exit, unconditionally block signals before freeing dynamic
TLS memory or freeing the shadow call stack.
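
Sketch of the new pthread_exit ordering, so the signal blocker covers all of
the teardown (see the pthread_exit.cpp hunk below):

  ScopedSignalBlocker ssb;                        // now taken unconditionally
  munmap(thread->shadow_call_stack_guard_region,  // aarch64 only
         SCS_GUARD_REGION_SIZE);
  __free_dynamic_tls(__get_bionic_tcb());         // DTV chain + dynamic TLS blocks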

ndk_cruft.cpp: Avoid including pthread_internal.h inside an extern "C".
(The header now includes a C++ template that doesn't compile inside
extern "C".)

Bug: http://b/78026329
Bug: http://b/123094171
Test: bionic unit tests
Change-Id: I3c9b12921c9e68b33dcc1d1dd276bff364eff5d7
diff --git a/libc/bionic/__libc_init_main_thread.cpp b/libc/bionic/__libc_init_main_thread.cpp
index 6279e65..4984e38 100644
--- a/libc/bionic/__libc_init_main_thread.cpp
+++ b/libc/bionic/__libc_init_main_thread.cpp
@@ -74,6 +74,7 @@
   __libc_init_sysinfo(); // uses AT_SYSINFO auxv entry
 #endif
   __init_tcb(temp_tcb, &main_thread);
+  __init_tcb_dtv(temp_tcb);
   __set_tls(&temp_tcb->tls_slot(0));
   main_thread.tid = __getpid();
   main_thread.set_cached_pid(main_thread.tid);
diff --git a/libc/bionic/bionic_elf_tls.cpp b/libc/bionic/bionic_elf_tls.cpp
index 4253b97..3fa5182 100644
--- a/libc/bionic/bionic_elf_tls.cpp
+++ b/libc/bionic/bionic_elf_tls.cpp
@@ -34,9 +34,22 @@
 #include <unistd.h>
 
 #include "private/ScopedRWLock.h"
+#include "private/ScopedSignalBlocker.h"
 #include "private/bionic_globals.h"
 #include "private/bionic_macros.h"
 #include "private/bionic_tls.h"
+#include "pthread_internal.h"
+
+// Every call to __tls_get_addr needs to check the generation counter, so
+// accesses to the counter need to be as fast as possible. Keep a copy of it in
+// a hidden variable, which can be accessed without using the GOT. The linker
+// will update this variable when it updates its counter.
+//
+// To allow the linker to update this variable, libc.so's constructor passes its
+// address to the linker. To accommodate a possible __tls_get_addr call before
+// libc.so's constructor, this local copy is initialized to SIZE_MAX, forcing
+// __tls_get_addr to initially use the slow path.
+__LIBC_HIDDEN__ _Atomic(size_t) __libc_tls_generation_copy = SIZE_MAX;
 
 // Search for a TLS segment in the given phdr table. Returns true if it has a
 // TLS segment and false otherwise.
@@ -168,6 +181,7 @@
   // moving the initial part. If this locking is too slow, we can duplicate the
   // static part of the table.
   TlsModules& modules = __libc_shared_globals()->tls_modules;
+  ScopedSignalBlocker ssb;
   ScopedReadLock locker(&modules.rwlock);
 
   for (size_t i = 0; i < modules.module_count; ++i) {
@@ -187,3 +201,166 @@
            module.segment.init_size);
   }
 }
+
+static inline size_t dtv_size_in_bytes(size_t module_count) {
+  return sizeof(TlsDtv) + module_count * sizeof(void*);
+}
+
+// Calculates the number of module slots to allocate in a new DTV. For small
+// objects (up to 1KiB), the TLS allocator allocates memory in power-of-2 sizes,
+// so for better space usage, ensure that the DTV size (header + slots) is a
+// power of 2.
+//
+// The lock on TlsModules must be held.
+static size_t calculate_new_dtv_count() {
+  size_t loaded_cnt = __libc_shared_globals()->tls_modules.module_count;
+  size_t bytes = dtv_size_in_bytes(MAX(1, loaded_cnt));
+  if (!powerof2(bytes)) {
+    bytes = BIONIC_ROUND_UP_POWER_OF_2(bytes);
+  }
+  return (bytes - sizeof(TlsDtv)) / sizeof(void*);
+}
+
+// This function must be called with signals blocked and a write lock on
+// TlsModules held.
+static void update_tls_dtv(bionic_tcb* tcb) {
+  const TlsModules& modules = __libc_shared_globals()->tls_modules;
+  BionicAllocator& allocator = __libc_shared_globals()->tls_allocator;
+
+  // Use the generation counter from the shared globals instead of the local
+  // copy, which won't be initialized yet if __tls_get_addr is called before
+  // libc.so's constructor.
+  if (__get_tcb_dtv(tcb)->generation == atomic_load(&modules.generation)) {
+    return;
+  }
+
+  const size_t old_cnt = __get_tcb_dtv(tcb)->count;
+
+  // If the DTV isn't large enough, allocate a larger one. Because a signal
+  // handler could interrupt the fast path of __tls_get_addr, we don't free the
+  // old DTV. Instead, we add the old DTV to a list, then free all of a thread's
+  // DTVs at thread-exit. Each time the DTV is reallocated, its size at least
+  // doubles.
+  if (modules.module_count > old_cnt) {
+    size_t new_cnt = calculate_new_dtv_count();
+    TlsDtv* const old_dtv = __get_tcb_dtv(tcb);
+    TlsDtv* const new_dtv = static_cast<TlsDtv*>(allocator.alloc(dtv_size_in_bytes(new_cnt)));
+    memcpy(new_dtv, old_dtv, dtv_size_in_bytes(old_cnt));
+    new_dtv->count = new_cnt;
+    new_dtv->next = old_dtv;
+    __set_tcb_dtv(tcb, new_dtv);
+  }
+
+  TlsDtv* const dtv = __get_tcb_dtv(tcb);
+
+  const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout;
+  char* static_tls = reinterpret_cast<char*>(tcb) - layout.offset_bionic_tcb();
+
+  // Initialize static TLS modules and free unloaded modules.
+  for (size_t i = 0; i < dtv->count; ++i) {
+    if (i < modules.module_count) {
+      const TlsModule& mod = modules.module_table[i];
+      if (mod.static_offset != SIZE_MAX) {
+        dtv->modules[i] = static_tls + mod.static_offset;
+        continue;
+      }
+      if (mod.first_generation != kTlsGenerationNone &&
+          mod.first_generation <= dtv->generation) {
+        continue;
+      }
+    }
+    allocator.free(dtv->modules[i]);
+    dtv->modules[i] = nullptr;
+  }
+
+  dtv->generation = atomic_load(&modules.generation);
+}
+
+__attribute__((noinline)) static void* tls_get_addr_slow_path(const TlsIndex* ti) {
+  TlsModules& modules = __libc_shared_globals()->tls_modules;
+  bionic_tcb* tcb = __get_bionic_tcb();
+
+  // Block signals and lock TlsModules. We may need the allocator, so take
+  // a write lock.
+  ScopedSignalBlocker ssb;
+  ScopedWriteLock locker(&modules.rwlock);
+
+  update_tls_dtv(tcb);
+
+  TlsDtv* dtv = __get_tcb_dtv(tcb);
+  const size_t module_idx = __tls_module_id_to_idx(ti->module_id);
+  void* mod_ptr = dtv->modules[module_idx];
+  if (mod_ptr == nullptr) {
+    const TlsSegment& segment = modules.module_table[module_idx].segment;
+    mod_ptr = __libc_shared_globals()->tls_allocator.memalign(segment.alignment, segment.size);
+    if (segment.init_size > 0) {
+      memcpy(mod_ptr, segment.init_ptr, segment.init_size);
+    }
+    dtv->modules[module_idx] = mod_ptr;
+  }
+
+  return static_cast<char*>(mod_ptr) + ti->offset;
+}
+
+// Returns the address of a thread's TLS memory given a module ID and an offset
+// into that module's TLS segment. This function is called on every access to a
+// dynamic TLS variable on targets that don't use TLSDESC. arm64 uses TLSDESC,
+// so it only calls this function on a thread's first access to a module's TLS
+// segment.
+//
+// On most targets, this accessor function is __tls_get_addr and
+// TLS_GET_ADDR_CCONV is unset. 32-bit x86 uses ___tls_get_addr instead and a
+// regparm() calling convention.
+extern "C" void* TLS_GET_ADDR(const TlsIndex* ti) TLS_GET_ADDR_CCONV {
+  TlsDtv* dtv = __get_tcb_dtv(__get_bionic_tcb());
+
+  // TODO: See if we can use a relaxed memory ordering here instead.
+  size_t generation = atomic_load(&__libc_tls_generation_copy);
+  if (__predict_true(generation == dtv->generation)) {
+    void* mod_ptr = dtv->modules[__tls_module_id_to_idx(ti->module_id)];
+    if (__predict_true(mod_ptr != nullptr)) {
+      return static_cast<char*>(mod_ptr) + ti->offset;
+    }
+  }
+
+  return tls_get_addr_slow_path(ti);
+}
+
+// This function frees:
+//  - TLS modules referenced by the current DTV.
+//  - The list of DTV objects associated with the current thread.
+//
+// The caller must have already blocked signals.
+void __free_dynamic_tls(bionic_tcb* tcb) {
+  TlsModules& modules = __libc_shared_globals()->tls_modules;
+  BionicAllocator& allocator = __libc_shared_globals()->tls_allocator;
+
+  // If we didn't allocate any dynamic memory, skip out early without taking
+  // the lock.
+  TlsDtv* dtv = __get_tcb_dtv(tcb);
+  if (dtv->generation == kTlsGenerationNone) {
+    return;
+  }
+
+  // We need the write lock to use the allocator.
+  ScopedWriteLock locker(&modules.rwlock);
+
+  // First free everything in the current DTV.
+  for (size_t i = 0; i < dtv->count; ++i) {
+    if (i < modules.module_count && modules.module_table[i].static_offset != SIZE_MAX) {
+      // This module's TLS memory is allocated statically, so don't free it here.
+      continue;
+    }
+    allocator.free(dtv->modules[i]);
+  }
+
+  // Now free the thread's list of DTVs.
+  while (dtv->generation != kTlsGenerationNone) {
+    TlsDtv* next = dtv->next;
+    allocator.free(dtv);
+    dtv = next;
+  }
+
+  // Clear the DTV slot. The DTV must not be used again with this thread.
+  tcb->tls_slot(TLS_SLOT_DTV) = nullptr;
+}
diff --git a/libc/bionic/libc_init_dynamic.cpp b/libc/bionic/libc_init_dynamic.cpp
index af1b847..7140776 100644
--- a/libc/bionic/libc_init_dynamic.cpp
+++ b/libc/bionic/libc_init_dynamic.cpp
@@ -51,6 +51,7 @@
 #include <elf.h>
 #include "libc_init_common.h"
 
+#include "private/bionic_elf_tls.h"
 #include "private/bionic_globals.h"
 #include "private/bionic_macros.h"
 #include "private/bionic_ssp.h"
@@ -82,6 +83,12 @@
   __libc_init_sysinfo();
 #endif
 
+  // Register libc.so's copy of the TLS generation variable so the linker can
+  // update it when it loads or unloads a shared object.
+  TlsModules& tls_modules = __libc_shared_globals()->tls_modules;
+  tls_modules.generation_libc_so = &__libc_tls_generation_copy;
+  __libc_tls_generation_copy = tls_modules.generation;
+
   __libc_init_globals();
   __libc_init_common();
 
diff --git a/libc/bionic/libc_init_static.cpp b/libc/bionic/libc_init_static.cpp
index 8fbc20e..514423d 100644
--- a/libc/bionic/libc_init_static.cpp
+++ b/libc/bionic/libc_init_static.cpp
@@ -92,19 +92,22 @@
   size_t phdr_ct = getauxval(AT_PHNUM);
 
   static TlsModule mod;
+  TlsModules& modules = __libc_shared_globals()->tls_modules;
   if (__bionic_get_tls_segment(phdr_start, phdr_ct, 0, &mod.segment)) {
     if (!__bionic_check_tls_alignment(&mod.segment.alignment)) {
       async_safe_fatal("error: TLS segment alignment in \"%s\" is not a power of 2: %zu\n",
                        progname, mod.segment.alignment);
     }
     mod.static_offset = layout.reserve_exe_segment_and_tcb(&mod.segment, progname);
-    mod.first_generation = 1;
-    __libc_shared_globals()->tls_modules.generation = 1;
-    __libc_shared_globals()->tls_modules.module_count = 1;
-    __libc_shared_globals()->tls_modules.module_table = &mod;
+    mod.first_generation = kTlsGenerationFirst;
+
+    modules.module_count = 1;
+    modules.module_table = &mod;
   } else {
     layout.reserve_exe_segment_and_tcb(nullptr, progname);
   }
+  // Enable the fast path in __tls_get_addr.
+  __libc_tls_generation_copy = modules.generation;
 
   layout.finish_layout();
 }
diff --git a/libc/bionic/ndk_cruft.cpp b/libc/bionic/ndk_cruft.cpp
index dbacf18..2c3299f 100644
--- a/libc/bionic/ndk_cruft.cpp
+++ b/libc/bionic/ndk_cruft.cpp
@@ -355,9 +355,14 @@
   return malloc(size);
 }
 
+} // extern "C"
+
 #define __get_thread __real_get_thread
 #include "pthread_internal.h"
 #undef __get_thread
+
+extern "C" {
+
 // Various third-party apps contain a backport of our pthread_rwlock implementation that uses this.
 pthread_internal_t* __get_thread() {
   return __real_get_thread();
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 31e0378..d80906c 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -70,6 +70,14 @@
   tcb->tls_slot(TLS_SLOT_STACK_GUARD) = reinterpret_cast<void*>(__stack_chk_guard);
 }
 
+__attribute__((no_stack_protector))
+void __init_tcb_dtv(bionic_tcb* tcb) {
+  // Initialize the DTV slot to a statically-allocated empty DTV. The first
+  // access to a dynamic TLS variable allocates a new DTV.
+  static const TlsDtv zero_dtv = {};
+  __set_tcb_dtv(tcb, const_cast<TlsDtv*>(&zero_dtv));
+}
+
 void __init_bionic_tls_ptrs(bionic_tcb* tcb, bionic_tls* tls) {
   tcb->thread()->bionic_tls = tls;
   tcb->tls_slot(TLS_SLOT_BIONIC_TLS) = tls;
@@ -291,6 +299,7 @@
   // Initialize TLS memory.
   __init_static_tls(mapping.static_tls);
   __init_tcb(tcb, thread);
+  __init_tcb_dtv(tcb);
   __init_tcb_stack_guard(tcb);
   __init_bionic_tls_ptrs(tcb, tls);
 
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index 84ea2e6..3b873b3 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -98,15 +98,22 @@
     thread->alternate_signal_stack = nullptr;
   }
 
+  ThreadJoinState old_state = THREAD_NOT_JOINED;
+  while (old_state == THREAD_NOT_JOINED &&
+         !atomic_compare_exchange_weak(&thread->join_state, &old_state, THREAD_EXITED_NOT_JOINED)) {
+  }
+
+  // We don't want to take a signal after unmapping the stack, the shadow call
+  // stack, or dynamic TLS memory.
+  ScopedSignalBlocker ssb;
+
 #ifdef __aarch64__
   // Free the shadow call stack and guard pages.
   munmap(thread->shadow_call_stack_guard_region, SCS_GUARD_REGION_SIZE);
 #endif
 
-  ThreadJoinState old_state = THREAD_NOT_JOINED;
-  while (old_state == THREAD_NOT_JOINED &&
-         !atomic_compare_exchange_weak(&thread->join_state, &old_state, THREAD_EXITED_NOT_JOINED)) {
-  }
+  // Free the ELF TLS DTV and all dynamically-allocated ELF TLS memory.
+  __free_dynamic_tls(__get_bionic_tcb());
 
   if (old_state == THREAD_DETACHED) {
     // The thread is detached, no one will use pthread_internal_t after pthread_exit.
@@ -121,10 +128,6 @@
     if (thread->mmap_size != 0) {
       // We need to free mapped space for detached threads when they exit.
       // That's not something we can do in C.
-
-      // We don't want to take a signal after we've unmapped the stack.
-      // That's one last thing we can do before dropping to assembler.
-      ScopedSignalBlocker ssb;
       __hwasan_thread_exit();
       _exit_with_stack_teardown(thread->mmap_base, thread->mmap_size);
     }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 27ab3df..cbcdadf 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -38,6 +38,7 @@
 #define __hwasan_thread_exit()
 #endif
 
+#include "private/bionic_elf_tls.h"
 #include "private/bionic_lock.h"
 #include "private/bionic_tls.h"
 
@@ -154,6 +155,7 @@
 
 __LIBC_HIDDEN__ void __init_tcb(bionic_tcb* tcb, pthread_internal_t* thread);
 __LIBC_HIDDEN__ void __init_tcb_stack_guard(bionic_tcb* tcb);
+__LIBC_HIDDEN__ void __init_tcb_dtv(bionic_tcb* tcb);
 __LIBC_HIDDEN__ void __init_bionic_tls_ptrs(bionic_tcb* tcb, bionic_tls* tls);
 __LIBC_HIDDEN__ bionic_tls* __allocate_temp_bionic_tls();
 __LIBC_HIDDEN__ void __free_temp_bionic_tls(bionic_tls* tls);
@@ -179,6 +181,15 @@
   return *static_cast<bionic_tls*>(__get_tls()[TLS_SLOT_BIONIC_TLS]);
 }
 
+static inline __always_inline TlsDtv* __get_tcb_dtv(bionic_tcb* tcb) {
+  uintptr_t dtv_slot = reinterpret_cast<uintptr_t>(tcb->tls_slot(TLS_SLOT_DTV));
+  return reinterpret_cast<TlsDtv*>(dtv_slot - offsetof(TlsDtv, generation));
+}
+
+static inline void __set_tcb_dtv(bionic_tcb* tcb, TlsDtv* val) {
+  tcb->tls_slot(TLS_SLOT_DTV) = &val->generation;
+}
+
 extern "C" __LIBC_HIDDEN__ int __set_tls(void* ptr);
 
 __LIBC_HIDDEN__ void pthread_key_clean_all(void);
diff --git a/libc/libc.map.txt b/libc/libc.map.txt
index 8d67b9e..6a6ea7d 100644
--- a/libc/libc.map.txt
+++ b/libc/libc.map.txt
@@ -1446,8 +1446,10 @@
 
 LIBC_Q { # introduced=Q
   global:
+    ___tls_get_addr; # x86
     __aeabi_read_tp; # arm
     __res_randomid;
+    __tls_get_addr; # arm x86_64
     android_fdsan_close_with_tag;
     android_fdsan_create_owner_tag;
     android_fdsan_exchange_owner_tag;
diff --git a/libc/private/bionic_elf_tls.h b/libc/private/bionic_elf_tls.h
index 09e1958..fa1af76 100644
--- a/libc/private/bionic_elf_tls.h
+++ b/libc/private/bionic_elf_tls.h
@@ -34,6 +34,8 @@
 #include <stdint.h>
 #include <sys/cdefs.h>
 
+__LIBC_HIDDEN__ extern _Atomic(size_t) __libc_tls_generation_copy;
+
 struct TlsSegment {
   size_t size = 0;
   size_t alignment = 1;
@@ -84,6 +86,16 @@
   size_t round_up_with_overflow_check(size_t value, size_t alignment);
 };
 
+static constexpr size_t kTlsGenerationNone = 0;
+static constexpr size_t kTlsGenerationFirst = 1;
+
+// The first ELF TLS module has ID 1. Zero is reserved for the first word of
+// the DTV, a generation count. Unresolved weak symbols also use module ID 0.
+static constexpr size_t kTlsUninitializedModuleId = 0;
+
+static inline size_t __tls_module_id_to_idx(size_t id) { return id - 1; }
+static inline size_t __tls_module_idx_to_id(size_t idx) { return idx + 1; }
+
 // A descriptor for a single ELF TLS module.
 struct TlsModule {
   TlsSegment segment;
@@ -93,7 +105,7 @@
 
   // The generation in which this module was loaded. Dynamic TLS lookups use
   // this field to detect when a module has been unloaded.
-  size_t first_generation = 0;
+  size_t first_generation = kTlsGenerationNone;
 
   // Used by the dynamic linker to track the associated soinfo* object.
   void* soinfo_ptr = nullptr;
@@ -105,9 +117,10 @@
 struct TlsModules {
   constexpr TlsModules() {}
 
-  // A generation counter. The value is incremented each time an solib is loaded
-  // or unloaded.
-  _Atomic(size_t) generation = 0;
+  // A pointer to the TLS generation counter in libc.so. The counter is
+  // incremented each time an solib is loaded or unloaded.
+  _Atomic(size_t) generation = kTlsGenerationFirst;
+  _Atomic(size_t) *generation_libc_so = nullptr;
 
   // Access to the TlsModule[] table requires taking this lock.
   pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
@@ -119,3 +132,46 @@
 };
 
 void __init_static_tls(void* static_tls);
+
+// Dynamic Thread Vector. Each thread has a different DTV. For each module
+// (executable or solib), the DTV has a pointer to that module's TLS memory. The
+// DTV is initially empty and is allocated on-demand. It grows as more modules
+// are dlopen'ed. See https://www.akkadia.org/drepper/tls.pdf.
+//
+// The layout of the DTV is specified in various documents, but it is not part
+// of Bionic's public ABI. A compiler can't generate code to access it directly,
+// because it can't access libc's global generation counter.
+struct TlsDtv {
+  // Number of elements in this object's modules field.
+  size_t count;
+
+  // A pointer to an older TlsDtv object that should be freed when the thread
+  // exits. The objects aren't immediately freed because a DTV could be
+  // reallocated by a signal handler that interrupted __tls_get_addr's fast
+  // path.
+  TlsDtv* next;
+
+  // The DTV slot points at this field, which allows omitting an add instruction
+  // on the fast path for a TLS lookup. The arm64 tlsdesc_resolver.S depends on
+  // the layout of fields past this point.
+  size_t generation;
+  void* modules[];
+};
+
+struct TlsIndex {
+  size_t module_id;
+  size_t offset;
+};
+
+#if defined(__i386__)
+#define TLS_GET_ADDR_CCONV __attribute__((regparm(1)))
+#define TLS_GET_ADDR ___tls_get_addr
+#else
+#define TLS_GET_ADDR_CCONV
+#define TLS_GET_ADDR __tls_get_addr
+#endif
+
+extern "C" void* TLS_GET_ADDR(const TlsIndex* ti) TLS_GET_ADDR_CCONV;
+
+struct bionic_tcb;
+void __free_dynamic_tls(bionic_tcb* tcb);
diff --git a/libc/private/bionic_globals.h b/libc/private/bionic_globals.h
index 4d40476..21a2a24 100644
--- a/libc/private/bionic_globals.h
+++ b/libc/private/bionic_globals.h
@@ -33,6 +33,7 @@
 #include <link.h>
 #include <pthread.h>
 
+#include "private/bionic_allocator.h"
 #include "private/bionic_elf_tls.h"
 #include "private/bionic_fdsan.h"
 #include "private/bionic_malloc_dispatch.h"
@@ -70,6 +71,7 @@
 
   StaticTlsLayout static_tls_layout;
   TlsModules tls_modules;
+  BionicAllocator tls_allocator;
 
   // Values passed from the linker to libc.so.
   const char* init_progname = nullptr;