Implement dynamic TLS accesses and allocation
Initialize a thread's DTV to an empty zeroed DTV. Allocate the DTV and
any ELF module's TLS segment on-demand in __tls_get_addr. Use a generation
counter, incremented in the linker, to signal when threads should
update/reallocate their DTV objects.
A generation count of 0 always indicates the constant zero DTV.
Once a DTV is allocated, it isn't freed until the thread exits, because
a signal handler could interrupt the fast path of __tls_get_addr between
accessing the DTV slot and reading a field of the DTV. Bionic keeps a
linked list of DTV objects so it can free them at thread-exit.
Dynamic TLS memory is allocated using a BionicAllocator instance in
libc_shared_globals. For async-signal safety, access to the
linker/libc-shared state is protected by first blocking signals, then by
acquiring the reader-writer lock, TlsModules::rwlock. A write lock is
needed to allocate or free memory.
In pthread_exit, unconditionally block signals before freeing dynamic
TLS memory or freeing the shadow call stack.
ndk_cruft.cpp: Avoid including pthread_internal.h inside an extern "C".
(The header now includes a C++ template that doesn't compile inside
extern "C".)
Bug: http://b/78026329
Bug: http://b/123094171
Test: bionic unit tests
Change-Id: I3c9b12921c9e68b33dcc1d1dd276bff364eff5d7
diff --git a/libc/bionic/__libc_init_main_thread.cpp b/libc/bionic/__libc_init_main_thread.cpp
index 6279e65..4984e38 100644
--- a/libc/bionic/__libc_init_main_thread.cpp
+++ b/libc/bionic/__libc_init_main_thread.cpp
@@ -74,6 +74,7 @@
__libc_init_sysinfo(); // uses AT_SYSINFO auxv entry
#endif
__init_tcb(temp_tcb, &main_thread);
+ __init_tcb_dtv(temp_tcb);
__set_tls(&temp_tcb->tls_slot(0));
main_thread.tid = __getpid();
main_thread.set_cached_pid(main_thread.tid);
diff --git a/libc/bionic/bionic_elf_tls.cpp b/libc/bionic/bionic_elf_tls.cpp
index 4253b97..3fa5182 100644
--- a/libc/bionic/bionic_elf_tls.cpp
+++ b/libc/bionic/bionic_elf_tls.cpp
@@ -34,9 +34,22 @@
#include <unistd.h>
#include "private/ScopedRWLock.h"
+#include "private/ScopedSignalBlocker.h"
#include "private/bionic_globals.h"
#include "private/bionic_macros.h"
#include "private/bionic_tls.h"
+#include "pthread_internal.h"
+
+// Every call to __tls_get_addr needs to check the generation counter, so
+// accesses to the counter need to be as fast as possible. Keep a copy of it in
+// a hidden variable, which can be accessed without using the GOT. The linker
+// will update this variable when it updates its counter.
+//
+// To allow the linker to update this variable, libc.so's constructor passes its
+// address to the linker. To accommodate a possible __tls_get_addr call before
+// libc.so's constructor, this local copy is initialized to SIZE_MAX, forcing
+// __tls_get_addr to initially use the slow path.
+__LIBC_HIDDEN__ _Atomic(size_t) __libc_tls_generation_copy = SIZE_MAX;
// Search for a TLS segment in the given phdr table. Returns true if it has a
// TLS segment and false otherwise.
@@ -168,6 +181,7 @@
// moving the initial part. If this locking is too slow, we can duplicate the
// static part of the table.
TlsModules& modules = __libc_shared_globals()->tls_modules;
+ ScopedSignalBlocker ssb;
ScopedReadLock locker(&modules.rwlock);
for (size_t i = 0; i < modules.module_count; ++i) {
@@ -187,3 +201,166 @@
module.segment.init_size);
}
}
+
+static inline size_t dtv_size_in_bytes(size_t module_count) {
+ return sizeof(TlsDtv) + module_count * sizeof(void*);
+}
+
+// Calculates the number of module slots to allocate in a new DTV. For small
+// objects (up to 1KiB), the TLS allocator allocates memory in power-of-2 sizes,
+// so for better space usage, ensure that the DTV size (header + slots) is a
+// power of 2.
+//
+// The lock on TlsModules must be held.
+static size_t calculate_new_dtv_count() {
+ size_t loaded_cnt = __libc_shared_globals()->tls_modules.module_count;
+ size_t bytes = dtv_size_in_bytes(MAX(1, loaded_cnt));
+ if (!powerof2(bytes)) {
+ bytes = BIONIC_ROUND_UP_POWER_OF_2(bytes);
+ }
+ return (bytes - sizeof(TlsDtv)) / sizeof(void*);
+}
+
+// This function must be called with signals blocked and a write lock on
+// TlsModules held.
+static void update_tls_dtv(bionic_tcb* tcb) {
+ const TlsModules& modules = __libc_shared_globals()->tls_modules;
+ BionicAllocator& allocator = __libc_shared_globals()->tls_allocator;
+
+ // Use the generation counter from the shared globals instead of the local
+ // copy, which won't be initialized yet if __tls_get_addr is called before
+ // libc.so's constructor.
+ if (__get_tcb_dtv(tcb)->generation == atomic_load(&modules.generation)) {
+ return;
+ }
+
+ const size_t old_cnt = __get_tcb_dtv(tcb)->count;
+
+ // If the DTV isn't large enough, allocate a larger one. Because a signal
+ // handler could interrupt the fast path of __tls_get_addr, we don't free the
+ // old DTV. Instead, we add the old DTV to a list, then free all of a thread's
+ // DTVs at thread-exit. Each time the DTV is reallocated, its size at least
+ // doubles.
+ if (modules.module_count > old_cnt) {
+ size_t new_cnt = calculate_new_dtv_count();
+ TlsDtv* const old_dtv = __get_tcb_dtv(tcb);
+ TlsDtv* const new_dtv = static_cast<TlsDtv*>(allocator.alloc(dtv_size_in_bytes(new_cnt)));
+ memcpy(new_dtv, old_dtv, dtv_size_in_bytes(old_cnt));
+ new_dtv->count = new_cnt;
+ new_dtv->next = old_dtv;
+ __set_tcb_dtv(tcb, new_dtv);
+ }
+
+ TlsDtv* const dtv = __get_tcb_dtv(tcb);
+
+ const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout;
+ char* static_tls = reinterpret_cast<char*>(tcb) - layout.offset_bionic_tcb();
+
+ // Initialize static TLS modules and free unloaded modules.
+ for (size_t i = 0; i < dtv->count; ++i) {
+ if (i < modules.module_count) {
+ const TlsModule& mod = modules.module_table[i];
+ if (mod.static_offset != SIZE_MAX) {
+ dtv->modules[i] = static_tls + mod.static_offset;
+ continue;
+ }
+ if (mod.first_generation != kTlsGenerationNone &&
+ mod.first_generation <= dtv->generation) {
+ continue;
+ }
+ }
+ allocator.free(dtv->modules[i]);
+ dtv->modules[i] = nullptr;
+ }
+
+ dtv->generation = atomic_load(&modules.generation);
+}
+
+__attribute__((noinline)) static void* tls_get_addr_slow_path(const TlsIndex* ti) {
+ TlsModules& modules = __libc_shared_globals()->tls_modules;
+ bionic_tcb* tcb = __get_bionic_tcb();
+
+ // Block signals and lock TlsModules. We may need the allocator, so take
+ // a write lock.
+ ScopedSignalBlocker ssb;
+ ScopedWriteLock locker(&modules.rwlock);
+
+ update_tls_dtv(tcb);
+
+ TlsDtv* dtv = __get_tcb_dtv(tcb);
+ const size_t module_idx = __tls_module_id_to_idx(ti->module_id);
+ void* mod_ptr = dtv->modules[module_idx];
+ if (mod_ptr == nullptr) {
+ const TlsSegment& segment = modules.module_table[module_idx].segment;
+ mod_ptr = __libc_shared_globals()->tls_allocator.memalign(segment.alignment, segment.size);
+ if (segment.init_size > 0) {
+ memcpy(mod_ptr, segment.init_ptr, segment.init_size);
+ }
+ dtv->modules[module_idx] = mod_ptr;
+ }
+
+ return static_cast<char*>(mod_ptr) + ti->offset;
+}
+
+// Returns the address of a thread's TLS memory given a module ID and an offset
+// into that module's TLS segment. This function is called on every access to a
+// dynamic TLS variable on targets that don't use TLSDESC. arm64 uses TLSDESC,
+// so it only calls this function on a thread's first access to a module's TLS
+// segment.
+//
+// On most targets, this accessor function is __tls_get_addr and
+// TLS_GET_ADDR_CCONV is unset. 32-bit x86 uses ___tls_get_addr instead and a
+// regparm() calling convention.
+extern "C" void* TLS_GET_ADDR(const TlsIndex* ti) TLS_GET_ADDR_CCONV {
+ TlsDtv* dtv = __get_tcb_dtv(__get_bionic_tcb());
+
+ // TODO: See if we can use a relaxed memory ordering here instead.
+ size_t generation = atomic_load(&__libc_tls_generation_copy);
+ if (__predict_true(generation == dtv->generation)) {
+ void* mod_ptr = dtv->modules[__tls_module_id_to_idx(ti->module_id)];
+ if (__predict_true(mod_ptr != nullptr)) {
+ return static_cast<char*>(mod_ptr) + ti->offset;
+ }
+ }
+
+ return tls_get_addr_slow_path(ti);
+}
+
+// This function frees:
+// - TLS modules referenced by the current DTV.
+// - The list of DTV objects associated with the current thread.
+//
+// The caller must have already blocked signals.
+void __free_dynamic_tls(bionic_tcb* tcb) {
+ TlsModules& modules = __libc_shared_globals()->tls_modules;
+ BionicAllocator& allocator = __libc_shared_globals()->tls_allocator;
+
+ // If we didn't allocate any dynamic memory, skip out early without taking
+ // the lock.
+ TlsDtv* dtv = __get_tcb_dtv(tcb);
+ if (dtv->generation == kTlsGenerationNone) {
+ return;
+ }
+
+ // We need the write lock to use the allocator.
+ ScopedWriteLock locker(&modules.rwlock);
+
+ // First free everything in the current DTV.
+ for (size_t i = 0; i < dtv->count; ++i) {
+ if (i < modules.module_count && modules.module_table[i].static_offset != SIZE_MAX) {
+ // This module's TLS memory is allocated statically, so don't free it here.
+ continue;
+ }
+ allocator.free(dtv->modules[i]);
+ }
+
+ // Now free the thread's list of DTVs.
+ while (dtv->generation != kTlsGenerationNone) {
+ TlsDtv* next = dtv->next;
+ allocator.free(dtv);
+ dtv = next;
+ }
+
+ // Clear the DTV slot. The DTV must not be used again with this thread.
+ tcb->tls_slot(TLS_SLOT_DTV) = nullptr;
+}
diff --git a/libc/bionic/libc_init_dynamic.cpp b/libc/bionic/libc_init_dynamic.cpp
index af1b847..7140776 100644
--- a/libc/bionic/libc_init_dynamic.cpp
+++ b/libc/bionic/libc_init_dynamic.cpp
@@ -51,6 +51,7 @@
#include <elf.h>
#include "libc_init_common.h"
+#include "private/bionic_elf_tls.h"
#include "private/bionic_globals.h"
#include "private/bionic_macros.h"
#include "private/bionic_ssp.h"
@@ -82,6 +83,12 @@
__libc_init_sysinfo();
#endif
+ // Register libc.so's copy of the TLS generation variable so the linker can
+ // update it when it loads or unloads a shared object.
+ TlsModules& tls_modules = __libc_shared_globals()->tls_modules;
+ tls_modules.generation_libc_so = &__libc_tls_generation_copy;
+ __libc_tls_generation_copy = tls_modules.generation;
+
__libc_init_globals();
__libc_init_common();
diff --git a/libc/bionic/libc_init_static.cpp b/libc/bionic/libc_init_static.cpp
index 8fbc20e..514423d 100644
--- a/libc/bionic/libc_init_static.cpp
+++ b/libc/bionic/libc_init_static.cpp
@@ -92,19 +92,22 @@
size_t phdr_ct = getauxval(AT_PHNUM);
static TlsModule mod;
+ TlsModules& modules = __libc_shared_globals()->tls_modules;
if (__bionic_get_tls_segment(phdr_start, phdr_ct, 0, &mod.segment)) {
if (!__bionic_check_tls_alignment(&mod.segment.alignment)) {
async_safe_fatal("error: TLS segment alignment in \"%s\" is not a power of 2: %zu\n",
progname, mod.segment.alignment);
}
mod.static_offset = layout.reserve_exe_segment_and_tcb(&mod.segment, progname);
- mod.first_generation = 1;
- __libc_shared_globals()->tls_modules.generation = 1;
- __libc_shared_globals()->tls_modules.module_count = 1;
- __libc_shared_globals()->tls_modules.module_table = &mod;
+ mod.first_generation = kTlsGenerationFirst;
+
+ modules.module_count = 1;
+ modules.module_table = &mod;
} else {
layout.reserve_exe_segment_and_tcb(nullptr, progname);
}
+ // Enable the fast path in __tls_get_addr.
+ __libc_tls_generation_copy = modules.generation;
layout.finish_layout();
}
diff --git a/libc/bionic/ndk_cruft.cpp b/libc/bionic/ndk_cruft.cpp
index dbacf18..2c3299f 100644
--- a/libc/bionic/ndk_cruft.cpp
+++ b/libc/bionic/ndk_cruft.cpp
@@ -355,9 +355,14 @@
return malloc(size);
}
+} // extern "C"
+
#define __get_thread __real_get_thread
#include "pthread_internal.h"
#undef __get_thread
+
+extern "C" {
+
// Various third-party apps contain a backport of our pthread_rwlock implementation that uses this.
pthread_internal_t* __get_thread() {
return __real_get_thread();
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 31e0378..d80906c 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -70,6 +70,14 @@
tcb->tls_slot(TLS_SLOT_STACK_GUARD) = reinterpret_cast<void*>(__stack_chk_guard);
}
+__attribute__((no_stack_protector))
+void __init_tcb_dtv(bionic_tcb* tcb) {
+ // Initialize the DTV slot to a statically-allocated empty DTV. The first
+ // access to a dynamic TLS variable allocates a new DTV.
+ static const TlsDtv zero_dtv = {};
+ __set_tcb_dtv(tcb, const_cast<TlsDtv*>(&zero_dtv));
+}
+
void __init_bionic_tls_ptrs(bionic_tcb* tcb, bionic_tls* tls) {
tcb->thread()->bionic_tls = tls;
tcb->tls_slot(TLS_SLOT_BIONIC_TLS) = tls;
@@ -291,6 +299,7 @@
// Initialize TLS memory.
__init_static_tls(mapping.static_tls);
__init_tcb(tcb, thread);
+ __init_tcb_dtv(tcb);
__init_tcb_stack_guard(tcb);
__init_bionic_tls_ptrs(tcb, tls);
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index 84ea2e6..3b873b3 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -98,15 +98,22 @@
thread->alternate_signal_stack = nullptr;
}
+ ThreadJoinState old_state = THREAD_NOT_JOINED;
+ while (old_state == THREAD_NOT_JOINED &&
+ !atomic_compare_exchange_weak(&thread->join_state, &old_state, THREAD_EXITED_NOT_JOINED)) {
+ }
+
+ // We don't want to take a signal after unmapping the stack, the shadow call
+ // stack, or dynamic TLS memory.
+ ScopedSignalBlocker ssb;
+
#ifdef __aarch64__
// Free the shadow call stack and guard pages.
munmap(thread->shadow_call_stack_guard_region, SCS_GUARD_REGION_SIZE);
#endif
- ThreadJoinState old_state = THREAD_NOT_JOINED;
- while (old_state == THREAD_NOT_JOINED &&
- !atomic_compare_exchange_weak(&thread->join_state, &old_state, THREAD_EXITED_NOT_JOINED)) {
- }
+ // Free the ELF TLS DTV and all dynamically-allocated ELF TLS memory.
+ __free_dynamic_tls(__get_bionic_tcb());
if (old_state == THREAD_DETACHED) {
// The thread is detached, no one will use pthread_internal_t after pthread_exit.
@@ -121,10 +128,6 @@
if (thread->mmap_size != 0) {
// We need to free mapped space for detached threads when they exit.
// That's not something we can do in C.
-
- // We don't want to take a signal after we've unmapped the stack.
- // That's one last thing we can do before dropping to assembler.
- ScopedSignalBlocker ssb;
__hwasan_thread_exit();
_exit_with_stack_teardown(thread->mmap_base, thread->mmap_size);
}
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 27ab3df..cbcdadf 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -38,6 +38,7 @@
#define __hwasan_thread_exit()
#endif
+#include "private/bionic_elf_tls.h"
#include "private/bionic_lock.h"
#include "private/bionic_tls.h"
@@ -154,6 +155,7 @@
__LIBC_HIDDEN__ void __init_tcb(bionic_tcb* tcb, pthread_internal_t* thread);
__LIBC_HIDDEN__ void __init_tcb_stack_guard(bionic_tcb* tcb);
+__LIBC_HIDDEN__ void __init_tcb_dtv(bionic_tcb* tcb);
__LIBC_HIDDEN__ void __init_bionic_tls_ptrs(bionic_tcb* tcb, bionic_tls* tls);
__LIBC_HIDDEN__ bionic_tls* __allocate_temp_bionic_tls();
__LIBC_HIDDEN__ void __free_temp_bionic_tls(bionic_tls* tls);
@@ -179,6 +181,15 @@
return *static_cast<bionic_tls*>(__get_tls()[TLS_SLOT_BIONIC_TLS]);
}
+static inline __always_inline TlsDtv* __get_tcb_dtv(bionic_tcb* tcb) {
+ uintptr_t dtv_slot = reinterpret_cast<uintptr_t>(tcb->tls_slot(TLS_SLOT_DTV));
+ return reinterpret_cast<TlsDtv*>(dtv_slot - offsetof(TlsDtv, generation));
+}
+
+static inline void __set_tcb_dtv(bionic_tcb* tcb, TlsDtv* val) {
+ tcb->tls_slot(TLS_SLOT_DTV) = &val->generation;
+}
+
extern "C" __LIBC_HIDDEN__ int __set_tls(void* ptr);
__LIBC_HIDDEN__ void pthread_key_clean_all(void);
diff --git a/libc/libc.map.txt b/libc/libc.map.txt
index 8d67b9e..6a6ea7d 100644
--- a/libc/libc.map.txt
+++ b/libc/libc.map.txt
@@ -1446,8 +1446,10 @@
LIBC_Q { # introduced=Q
global:
+ ___tls_get_addr; # x86
__aeabi_read_tp; # arm
__res_randomid;
+ __tls_get_addr; # arm x86_64
android_fdsan_close_with_tag;
android_fdsan_create_owner_tag;
android_fdsan_exchange_owner_tag;
diff --git a/libc/private/bionic_elf_tls.h b/libc/private/bionic_elf_tls.h
index 09e1958..fa1af76 100644
--- a/libc/private/bionic_elf_tls.h
+++ b/libc/private/bionic_elf_tls.h
@@ -34,6 +34,8 @@
#include <stdint.h>
#include <sys/cdefs.h>
+__LIBC_HIDDEN__ extern _Atomic(size_t) __libc_tls_generation_copy;
+
struct TlsSegment {
size_t size = 0;
size_t alignment = 1;
@@ -84,6 +86,16 @@
size_t round_up_with_overflow_check(size_t value, size_t alignment);
};
+static constexpr size_t kTlsGenerationNone = 0;
+static constexpr size_t kTlsGenerationFirst = 1;
+
+// The first ELF TLS module has ID 1. Zero is reserved for the first word of
+// the DTV, a generation count. Unresolved weak symbols also use module ID 0.
+static constexpr size_t kTlsUninitializedModuleId = 0;
+
+static inline size_t __tls_module_id_to_idx(size_t id) { return id - 1; }
+static inline size_t __tls_module_idx_to_id(size_t idx) { return idx + 1; }
+
// A descriptor for a single ELF TLS module.
struct TlsModule {
TlsSegment segment;
@@ -93,7 +105,7 @@
// The generation in which this module was loaded. Dynamic TLS lookups use
// this field to detect when a module has been unloaded.
- size_t first_generation = 0;
+ size_t first_generation = kTlsGenerationNone;
// Used by the dynamic linker to track the associated soinfo* object.
void* soinfo_ptr = nullptr;
@@ -105,9 +117,10 @@
struct TlsModules {
constexpr TlsModules() {}
- // A generation counter. The value is incremented each time an solib is loaded
- // or unloaded.
- _Atomic(size_t) generation = 0;
+ // A pointer to the TLS generation counter in libc.so. The counter is
+ // incremented each time an solib is loaded or unloaded.
+ _Atomic(size_t) generation = kTlsGenerationFirst;
+ _Atomic(size_t) *generation_libc_so = nullptr;
// Access to the TlsModule[] table requires taking this lock.
pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
@@ -119,3 +132,46 @@
};
void __init_static_tls(void* static_tls);
+
+// Dynamic Thread Vector. Each thread has a different DTV. For each module
+// (executable or solib), the DTV has a pointer to that module's TLS memory. The
+// DTV is initially empty and is allocated on-demand. It grows as more modules
+// are dlopen'ed. See https://www.akkadia.org/drepper/tls.pdf.
+//
+// The layout of the DTV is specified in various documents, but it is not part
+// of Bionic's public ABI. A compiler can't generate code to access it directly,
+// because it can't access libc's global generation counter.
+struct TlsDtv {
+ // Number of elements in this object's modules field.
+ size_t count;
+
+ // A pointer to an older TlsDtv object that should be freed when the thread
+ // exits. The objects aren't immediately freed because a DTV could be
+ // reallocated by a signal handler that interrupted __tls_get_addr's fast
+ // path.
+ TlsDtv* next;
+
+ // The DTV slot points at this field, which allows omitting an add instruction
+ // on the fast path for a TLS lookup. The arm64 tlsdesc_resolver.S depends on
+ // the layout of fields past this point.
+ size_t generation;
+ void* modules[];
+};
+
+struct TlsIndex {
+ size_t module_id;
+ size_t offset;
+};
+
+#if defined(__i386__)
+#define TLS_GET_ADDR_CCONV __attribute__((regparm(1)))
+#define TLS_GET_ADDR ___tls_get_addr
+#else
+#define TLS_GET_ADDR_CCONV
+#define TLS_GET_ADDR __tls_get_addr
+#endif
+
+extern "C" void* TLS_GET_ADDR(const TlsIndex* ti) TLS_GET_ADDR_CCONV;
+
+struct bionic_tcb;
+void __free_dynamic_tls(bionic_tcb* tcb);
diff --git a/libc/private/bionic_globals.h b/libc/private/bionic_globals.h
index 4d40476..21a2a24 100644
--- a/libc/private/bionic_globals.h
+++ b/libc/private/bionic_globals.h
@@ -33,6 +33,7 @@
#include <link.h>
#include <pthread.h>
+#include "private/bionic_allocator.h"
#include "private/bionic_elf_tls.h"
#include "private/bionic_fdsan.h"
#include "private/bionic_malloc_dispatch.h"
@@ -70,6 +71,7 @@
StaticTlsLayout static_tls_layout;
TlsModules tls_modules;
+ BionicAllocator tls_allocator;
// Values passed from the linker to libc.so.
const char* init_progname = nullptr;