| /* |
| * Copyright (C) 2019 The Android Open Source Project |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
| * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| */ |
| |
| #include "private/bionic_elf_tls.h" |
| |
| #include <async_safe/CHECK.h> |
| #include <async_safe/log.h> |
| #include <string.h> |
| #include <sys/param.h> |
| #include <unistd.h> |
| |
| #include "platform/bionic/macros.h" |
| #include "platform/bionic/page.h" |
| #include "private/ScopedRWLock.h" |
| #include "private/ScopedSignalBlocker.h" |
| #include "private/bionic_globals.h" |
| #include "private/bionic_tls.h" |
| #include "pthread_internal.h" |
| |
| // Every call to __tls_get_addr needs to check the generation counter, so |
| // accesses to the counter need to be as fast as possible. Keep a copy of it in |
| // a hidden variable, which can be accessed without using the GOT. The linker |
| // will update this variable when it updates its counter. |
| // |
| // To allow the linker to update this variable, libc.so's constructor passes its |
| // address to the linker. To accommodate a possible __tls_get_addr call before |
| // libc.so's constructor, this local copy is initialized to SIZE_MAX, forcing |
| // __tls_get_addr to initially use the slow path. |
| __LIBC_HIDDEN__ _Atomic(size_t) __libc_tls_generation_copy = SIZE_MAX; |
| |
| // Search for a TLS segment in the given phdr table. Returns true and fills |
| // in *out if the table has a TLS segment; returns false otherwise. |
| bool __bionic_get_tls_segment(const ElfW(Phdr)* phdr_table, size_t phdr_count, |
| ElfW(Addr) load_bias, TlsSegment* out) { |
| for (size_t i = 0; i < phdr_count; ++i) { |
| const ElfW(Phdr)& phdr = phdr_table[i]; |
| if (phdr.p_type == PT_TLS) { |
| *out = TlsSegment{ |
| .aligned_size = |
| TlsAlignedSize{ |
| .size = phdr.p_memsz, |
| .align = |
| TlsAlign{ |
| .value = phdr.p_align ?: 1, // 0 means "no alignment requirement" |
| .skew = phdr.p_vaddr % MAX(1, phdr.p_align), |
| }, |
| }, |
| .init_ptr = reinterpret_cast<void*>(load_bias + phdr.p_vaddr), |
| .init_size = phdr.p_filesz, |
| }; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Return true if the alignment of a TLS segment is a valid power-of-two. |
| bool __bionic_check_tls_align(size_t align) { |
| // Note: The size does not need to be a multiple of the alignment. With ld.bfd |
| // (or after using binutils' strip), the TLS segment's size isn't rounded up. |
| return powerof2(align); |
| } |
| |
| static void static_tls_layout_overflow() { |
| async_safe_fatal("error: TLS segments in static TLS overflowed"); |
| } |
| |
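| // Rounds `value` up to the next boundary satisfying (result % align) == skew, |
| // where the skew models a p_vaddr that is not a multiple of p_align. A worked |
| // example with illustrative numbers: value=21, align=16, skew=4 computes |
| // ((21 - 4 + 15) & ~15) + 4 == 32 + 4 == 36, and 36 % 16 == 4 as required. |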
| static size_t align_checked(size_t value, TlsAlign tls_align) { |
| const size_t align = tls_align.value; |
| const size_t skew = tls_align.skew; |
| CHECK(align != 0 && powerof2(align) && skew < align); |
| const size_t result = ((value - skew + align - 1) & ~(align - 1)) + skew; |
| if (result < value) static_tls_layout_overflow(); |
| return result; |
| } |
| |
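| // Returns the offset of the thread pointer from the start of static TLS: the |
| // TCB begins at slot MIN_TLS_SLOT, so the TP lies -MIN_TLS_SLOT words past |
| // the TCB's start. |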
| size_t StaticTlsLayout::offset_thread_pointer() const { |
| return offset_bionic_tcb_ + (-MIN_TLS_SLOT * sizeof(void*)); |
| } |
| |
| // Allocates the Bionic TCB and the executable's TLS segment in the static TLS |
| // layout, satisfying alignment requirements for both. |
| // |
| // For an executable's TLS accesses (using the LocalExec model), the static |
| // linker bakes TLS offsets directly into the .text section, so the loader must |
| // place the executable segment at the same offset relative to the TP. |
| // Similarly, the Bionic TLS slots (bionic_tcb) must also be allocated at the |
| // correct offset relative to the TP. |
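| // |
| // For example, on arm64 a LocalExec access compiles to roughly (sketch, not |
| // exact codegen): |
| //   mrs x0, tpidr_el0    // read the TP |
| //   add x0, x0, #tpoff   // tpoff is a link-time constant |
| // so the loader has no freedom to place the segment anywhere else. |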
| // |
| // Returns the offset of the executable's TLS segment. |
| // |
| // Note: This function has unit tests, but they are in bionic-unit-tests-static, |
| // not bionic-unit-tests. |
| size_t StaticTlsLayout::reserve_exe_segment_and_tcb(const TlsSegment* seg, |
| const char* progname __attribute__((unused))) { |
| // Special case: if the executable has no TLS segment, then just allocate a |
| // TCB and skip the minimum alignment check on ARM. |
| if (seg == nullptr) { |
| offset_bionic_tcb_ = reserve_type<bionic_tcb>(); |
| return 0; |
| } |
| |
| #if defined(__arm__) || defined(__aarch64__) |
| // ARM uses a "variant 1" TLS layout. The ABI specifies that the TP points at |
| // a 2-word TCB, followed by the executable's segment. In practice, libc |
| // implementations actually allocate a larger TCB at negative offsets from the |
| // TP. |
| // |
| // Historically, Bionic allocated an 8-word TCB starting at TP+0, so to keep |
| // the executable's TLS segment from overlapping the last 6 slots, Bionic |
| // requires that executables have an 8-word PT_TLS alignment to ensure that |
| // the TCB fits in the alignment padding, which it accomplishes using |
| // crtbegin.c. Bionic uses negative offsets for new TLS slots to avoid this |
| // problem. |
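| // |
| // As an illustration (hypothetical slot counts): with MIN_TLS_SLOT == -1 and |
| // MAX_TLS_SLOT == 7 on a 64-bit target, bionic_tcb occupies [TP-8, TP+64), |
| // the ABI's 2-word TCB occupies [TP, TP+16), and a 64-byte-aligned executable |
| // segment lands at TP+64, just past slot #7. |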
| |
| static_assert(MIN_TLS_SLOT <= 0 && MAX_TLS_SLOT >= 1); |
| static_assert(sizeof(bionic_tcb) == (MAX_TLS_SLOT - MIN_TLS_SLOT + 1) * sizeof(void*)); |
| static_assert(alignof(bionic_tcb) == sizeof(void*)); |
| const size_t max_align = MAX(alignof(bionic_tcb), seg->aligned_size.align.value); |
| |
| // Allocate the TCB first. Split it into negative and non-negative slots and |
| // ensure that TP (i.e. the first non-negative slot) is aligned to max_align. |
| const size_t tcb_size_pre = -MIN_TLS_SLOT * sizeof(void*); |
| const size_t tcb_size_post = (MAX_TLS_SLOT + 1) * sizeof(void*); |
| const auto pair = |
| reserve_tp_pair(TlsAlignedSize{.size = tcb_size_pre}, |
| TlsAlignedSize{.size = tcb_size_post, .align = TlsAlign{.value = max_align}}); |
| offset_bionic_tcb_ = pair.before; |
| const size_t offset_tp = pair.tp; |
| |
| // Allocate the segment. |
| offset_exe_ = reserve(seg->aligned_size); |
| |
| // Verify that the ABI and Bionic tpoff values are equal, which is equivalent |
| // to checking whether the segment is sufficiently aligned. |
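| // For example (illustrative, assuming tcb_size_post == 64 on a 64-bit |
| // target): a segment with align == 64 and skew == 0 yields |
| // abi_tpoff == actual_tpoff == 64, so the check passes, whereas align == 16 |
| // would yield abi_tpoff == 16 but actual_tpoff == 64 and abort below. |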
| const size_t abi_tpoff = align_checked(2 * sizeof(void*), seg->aligned_size.align); |
| const size_t actual_tpoff = align_checked(tcb_size_post, seg->aligned_size.align); |
| CHECK(actual_tpoff == offset_exe_ - offset_tp); |
| |
| if (abi_tpoff != actual_tpoff) { |
| async_safe_fatal( |
| "error: \"%s\": executable's TLS segment is underaligned: " |
| "alignment is %zu (skew %zu), needs to be at least %zu for %s Bionic", |
| progname, seg->aligned_size.align.value, seg->aligned_size.align.skew, tcb_size_post, |
| (sizeof(void*) == 4 ? "ARM" : "ARM64")); |
| } |
| |
| #elif defined(__i386__) || defined(__x86_64__) |
| |
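| // x86 uses a "variant 2" TLS layout: the executable's segment is placed |
| // immediately before the TP and addressed at negative offsets from it, so |
| // the segment is the `before` allocation and bionic_tcb starts at the TP. |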
| auto pair = reserve_tp_pair(seg->aligned_size, TlsAlignedSize::of_type<bionic_tcb>()); |
| offset_exe_ = pair.before; |
| offset_bionic_tcb_ = pair.after; |
| |
| #elif defined(__riscv) |
| static_assert(MAX_TLS_SLOT == -1, "Last slot of bionic_tcb must be slot #(-1) on riscv"); |
| |
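| // riscv gives the executable's segment a tp-offset of 0, i.e. it starts at |
| // the TP itself, which is why every bionic_tcb slot (through slot #(-1)) |
| // must sit below the TP. |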
| auto pair = reserve_tp_pair(TlsAlignedSize::of_type<bionic_tcb>(), seg->aligned_size); |
| offset_bionic_tcb_ = pair.before; |
| offset_exe_ = pair.after; |
| |
| #else |
| #error "Unrecognized architecture" |
| #endif |
| |
| return offset_exe_; |
| } |
| |
| size_t StaticTlsLayout::reserve_bionic_tls() { |
| offset_bionic_tls_ = reserve_type<bionic_tls>(); |
| return offset_bionic_tls_; |
| } |
| |
| void StaticTlsLayout::finish_layout() { |
| // Round the offset up to the alignment. |
| cursor_ = align_checked(cursor_, TlsAlign{.value = align_}); |
| } |
| |
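| // Advances the cursor to the given (possibly skewed) alignment and records |
| // the strongest alignment requirement seen so far. |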
| size_t StaticTlsLayout::align_cursor(TlsAlign align) { |
| cursor_ = align_checked(cursor_, align); |
| align_ = MAX(align_, align.value); |
| return cursor_; |
| } |
| |
| size_t StaticTlsLayout::align_cursor_unskewed(size_t align) { |
| return align_cursor(TlsAlign{.value = align}); |
| } |
| |
| // Reserve the requested number of bytes at the requested alignment. The |
| // requested size is not required to be a multiple of the alignment, nor is the |
| // cursor aligned after the allocation. |
| size_t StaticTlsLayout::reserve(TlsAlignedSize aligned_size) { |
| align_cursor(aligned_size.align); |
| const size_t result = cursor_; |
| if (__builtin_add_overflow(cursor_, aligned_size.size, &cursor_)) static_tls_layout_overflow(); |
| return result; |
| } |
| |
| // Calculate the TP offset and allocate something before it and something after |
| // it. The TP will be aligned to: |
| // |
| // MAX(before.align.value, after.align.value) |
| // |
| // The `before` and `after` allocations are each allocated as closely as |
| // possible to the TP. |
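| // |
| // A worked example with illustrative numbers: before = {size 8, align 8}, |
| // after = {size 16, align 32}, cursor initially 0. The tentative `before` |
| // lands at 0 with tentative_before_end == 8, the TP is aligned up to 32, and |
| // `after` lands at 32. Shifting `before` forward by 32 - 8 == 24 places it |
| // at [24, 32), flush against the TP, and the CHECK holds because 24 is a |
| // multiple of before's alignment. |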
| StaticTlsLayout::TpAllocations StaticTlsLayout::reserve_tp_pair(TlsAlignedSize before, |
| TlsAlignedSize after) { |
| // Tentative `before` allocation. |
| const size_t tentative_before = reserve(before); |
| const size_t tentative_before_end = align_cursor_unskewed(before.align.value); |
| |
| const size_t offset_tp = align_cursor_unskewed(MAX(before.align.value, after.align.value)); |
| |
| const size_t offset_after = reserve(after); |
| |
| // If the `after` allocation has higher alignment than `before`, then there |
| // may be alignment padding to remove between `before` and the TP. Shift |
| // `before` forward to remove this padding. |
| CHECK(((offset_tp - tentative_before_end) & (before.align.value - 1)) == 0); |
| const size_t offset_before = tentative_before + (offset_tp - tentative_before_end); |
| |
| return TpAllocations{offset_before, offset_tp, offset_after}; |
| } |
| |
| // Copy each TLS module's initialization image into a newly-allocated block of |
| // static TLS memory. To reduce dirty pages, this function only writes to pages |
| // within the static TLS that need initialization. The memory should already be |
| // zero-initialized on entry. |
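| // |
| // For example (illustrative), with a module table of [exe (static_offset 0), |
| // libfoo.so (static_offset 128), dlopen'ed libbar.so (SIZE_MAX)], the loop |
| // below copies the first two init images and stops at libbar.so. |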
| void __init_static_tls(void* static_tls) { |
| // The part of the table we care about (i.e. static TLS modules) never changes |
| // after startup, but we still need the mutex because the table could grow, |
| // moving the initial part. If this locking is too slow, we can duplicate the |
| // static part of the table. |
| TlsModules& modules = __libc_shared_globals()->tls_modules; |
| ScopedSignalBlocker ssb; |
| ScopedReadLock locker(&modules.rwlock); |
| |
| for (size_t i = 0; i < modules.module_count; ++i) { |
| TlsModule& module = modules.module_table[i]; |
| if (module.static_offset == SIZE_MAX) { |
| // All of the static modules come before all of the dynamic modules, so |
| // once we see the first dynamic module, we're done. |
| break; |
| } |
| if (module.segment.init_size == 0) { |
| // Skip the memcpy call for TLS segments with no initializer, which is |
| // common. |
| continue; |
| } |
| memcpy(static_cast<char*>(static_tls) + module.static_offset, |
| module.segment.init_ptr, |
| module.segment.init_size); |
| } |
| } |
| |
| static inline size_t dtv_size_in_bytes(size_t module_count) { |
| return sizeof(TlsDtv) + module_count * sizeof(void*); |
| } |
| |
| // Calculates the number of module slots to allocate in a new DTV. For small |
| // objects (up to 1KiB), the TLS allocator allocates memory in power-of-2 sizes, |
| // so for better space usage, ensure that the DTV size (header + slots) is a |
| // power of 2. |
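| // |
| // For example (illustrative, assuming a 64-bit target where the TlsDtv |
| // header is three words): six loaded modules need 24 + 6*8 == 72 bytes, |
| // which rounds up to 128, so the new DTV gets (128 - 24) / 8 == 13 slots. |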
| // |
| // The lock on TlsModules must be held. |
| static size_t calculate_new_dtv_count() { |
| size_t loaded_cnt = __libc_shared_globals()->tls_modules.module_count; |
| size_t bytes = dtv_size_in_bytes(MAX(1, loaded_cnt)); |
| if (!powerof2(bytes)) { |
| bytes = BIONIC_ROUND_UP_POWER_OF_2(bytes); |
| } |
| return (bytes - sizeof(TlsDtv)) / sizeof(void*); |
| } |
| |
| // This function must be called with signals blocked and a write lock on |
| // TlsModules held. |
| static void update_tls_dtv(bionic_tcb* tcb) { |
| const TlsModules& modules = __libc_shared_globals()->tls_modules; |
| BionicAllocator& allocator = __libc_shared_globals()->tls_allocator; |
| |
| // Use the generation counter from the shared globals instead of the local |
| // copy, which won't be initialized yet if __tls_get_addr is called before |
| // libc.so's constructor. |
| if (__get_tcb_dtv(tcb)->generation == atomic_load(&modules.generation)) { |
| return; |
| } |
| |
| const size_t old_cnt = __get_tcb_dtv(tcb)->count; |
| |
| // If the DTV isn't large enough, allocate a larger one. Because a signal |
| // handler could interrupt the fast path of __tls_get_addr, we don't free the |
| // old DTV. Instead, we add the old DTV to a list, then free all of a thread's |
| // DTVs at thread-exit. Each time the DTV is reallocated, its size at least |
| // doubles. |
| if (modules.module_count > old_cnt) { |
| size_t new_cnt = calculate_new_dtv_count(); |
| TlsDtv* const old_dtv = __get_tcb_dtv(tcb); |
| TlsDtv* const new_dtv = static_cast<TlsDtv*>(allocator.alloc(dtv_size_in_bytes(new_cnt))); |
| memcpy(new_dtv, old_dtv, dtv_size_in_bytes(old_cnt)); |
| new_dtv->count = new_cnt; |
| new_dtv->next = old_dtv; |
| __set_tcb_dtv(tcb, new_dtv); |
| } |
| |
| TlsDtv* const dtv = __get_tcb_dtv(tcb); |
| |
| const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout; |
| char* static_tls = reinterpret_cast<char*>(tcb) - layout.offset_bionic_tcb(); |
| |
| // Initialize static TLS modules and free unloaded modules. |
| for (size_t i = 0; i < dtv->count; ++i) { |
| if (i < modules.module_count) { |
| const TlsModule& mod = modules.module_table[i]; |
| if (mod.static_offset != SIZE_MAX) { |
| dtv->modules[i] = static_tls + mod.static_offset; |
| continue; |
| } |
| if (mod.first_generation != kTlsGenerationNone && |
| mod.first_generation <= dtv->generation) { |
| continue; |
| } |
| } |
| if (modules.on_destruction_cb != nullptr) { |
| void* dtls_begin = dtv->modules[i]; |
| void* dtls_end = |
| static_cast<void*>(static_cast<char*>(dtls_begin) + allocator.get_chunk_size(dtls_begin)); |
| modules.on_destruction_cb(dtls_begin, dtls_end); |
| } |
| allocator.free(dtv->modules[i]); |
| dtv->modules[i] = nullptr; |
| } |
| |
| dtv->generation = atomic_load(&modules.generation); |
| } |
| |
| __attribute__((noinline)) static void* tls_get_addr_slow_path(const TlsIndex* ti) { |
| TlsModules& modules = __libc_shared_globals()->tls_modules; |
| bionic_tcb* tcb = __get_bionic_tcb(); |
| |
| // Block signals and lock TlsModules. We may need the allocator, so take |
| // a write lock. |
| ScopedSignalBlocker ssb; |
| ScopedWriteLock locker(&modules.rwlock); |
| |
| update_tls_dtv(tcb); |
| |
| TlsDtv* dtv = __get_tcb_dtv(tcb); |
| const size_t module_idx = __tls_module_id_to_idx(ti->module_id); |
| void* mod_ptr = dtv->modules[module_idx]; |
| if (mod_ptr == nullptr) { |
| const TlsSegment& segment = modules.module_table[module_idx].segment; |
| // TODO: Currently the aligned_size.align.skew property is ignored. |
| // That is, for a dynamic TLS block at addr A, (A % p_align) will be 0, not |
| // (p_vaddr % p_align). |
| mod_ptr = __libc_shared_globals()->tls_allocator.memalign(segment.aligned_size.align.value, |
| segment.aligned_size.size); |
| if (segment.init_size > 0) { |
| memcpy(mod_ptr, segment.init_ptr, segment.init_size); |
| } |
| dtv->modules[module_idx] = mod_ptr; |
| |
| // Reports the allocation to the listener, if any. |
| if (modules.on_creation_cb != nullptr) { |
| modules.on_creation_cb( |
| mod_ptr, static_cast<void*>(static_cast<char*>(mod_ptr) + segment.aligned_size.size)); |
| } |
| } |
| |
| return static_cast<char*>(mod_ptr) + ti->offset + TLS_DTV_OFFSET; |
| } |
| |
| // Returns the address of a thread's TLS memory given a module ID and an offset |
| // into that module's TLS segment. This function is called on every access to a |
| // dynamic TLS variable on targets that don't use TLSDESC. arm64 uses TLSDESC, |
| // so it only calls this function on a thread's first access to a module's TLS |
| // segment. |
| // |
| // On most targets, this accessor function is __tls_get_addr and |
| // TLS_GET_ADDR_CALLING_CONVENTION is unset, but 32-bit x86 uses |
| // ___tls_get_addr (with three underscores) instead, and a regparm |
| // calling convention. |
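| // |
| // Sketch of a general-dynamic access (illustrative, not actual codegen): |
| //   TlsIndex ti = {module_id, offset};  // normally materialized in the GOT |
| //   void* addr = TLS_GET_ADDR(&ti);     // block base + offset + TLS_DTV_OFFSET |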
| extern "C" void* TLS_GET_ADDR(const TlsIndex* ti) TLS_GET_ADDR_CALLING_CONVENTION { |
| TlsDtv* dtv = __get_tcb_dtv(__get_bionic_tcb()); |
| |
| // TODO: See if we can use a relaxed memory ordering here instead. |
| size_t generation = atomic_load(&__libc_tls_generation_copy); |
| if (__predict_true(generation == dtv->generation)) { |
| void* mod_ptr = dtv->modules[__tls_module_id_to_idx(ti->module_id)]; |
| if (__predict_true(mod_ptr != nullptr)) { |
| return static_cast<char*>(mod_ptr) + ti->offset + TLS_DTV_OFFSET; |
| } |
| } |
| |
| return tls_get_addr_slow_path(ti); |
| } |
| |
| // This function frees: |
| // - TLS modules referenced by the current DTV. |
| // - The list of DTV objects associated with the current thread. |
| // |
| // The caller must have already blocked signals. |
| void __free_dynamic_tls(bionic_tcb* tcb) { |
| TlsModules& modules = __libc_shared_globals()->tls_modules; |
| BionicAllocator& allocator = __libc_shared_globals()->tls_allocator; |
| |
| // If we didn't allocate any dynamic memory, skip out early without taking |
| // the lock. |
| TlsDtv* dtv = __get_tcb_dtv(tcb); |
| if (dtv->generation == kTlsGenerationNone) { |
| return; |
| } |
| |
| // We need the write lock to use the allocator. |
| ScopedWriteLock locker(&modules.rwlock); |
| |
| // First free everything in the current DTV. |
| for (size_t i = 0; i < dtv->count; ++i) { |
| if (i < modules.module_count && modules.module_table[i].static_offset != SIZE_MAX) { |
| // This module's TLS memory is allocated statically, so don't free it here. |
| continue; |
| } |
| |
| if (modules.on_destruction_cb != nullptr) { |
| void* dtls_begin = dtv->modules[i]; |
| void* dtls_end = |
| static_cast<void*>(static_cast<char*>(dtls_begin) + allocator.get_chunk_size(dtls_begin)); |
| modules.on_destruction_cb(dtls_begin, dtls_end); |
| } |
| |
| allocator.free(dtv->modules[i]); |
| } |
| |
| // Now free the thread's list of DTVs. |
| while (dtv->generation != kTlsGenerationNone) { |
| TlsDtv* next = dtv->next; |
| allocator.free(dtv); |
| dtv = next; |
| } |
| |
| // Clear the DTV slot. The DTV must not be used again with this thread. |
| tcb->tls_slot(TLS_SLOT_DTV) = nullptr; |
| } |
| |
| // Invokes all the registered thread_exit callbacks, if any. |
| void __notify_thread_exit_callbacks() { |
| TlsModules& modules = __libc_shared_globals()->tls_modules; |
| if (modules.first_thread_exit_callback == nullptr) { |
| // If there is no first_thread_exit_callback, there shouldn't be a tail. |
| CHECK(modules.thread_exit_callback_tail_node == nullptr); |
| return; |
| } |
| |
| // Callbacks are supposed to be invoked in the reverse order |
| // in which they were registered. |
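| // For example, if cb1, cb2, and cb3 were registered in that order, cb1 is |
| // first_thread_exit_callback, the tail node holds cb3 with prev pointing at |
| // cb2, and the resulting invocation order is cb3, cb2, cb1. |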
| CallbackHolder* node = modules.thread_exit_callback_tail_node; |
| while (node != nullptr) { |
| node->cb(); |
| node = node->prev; |
| } |
| modules.first_thread_exit_callback(); |
| } |