Reorganize static TLS memory for ELF TLS

For ELF TLS "local-exec" accesses, the static linker assumes that an
executable's TLS segment is located at a statically-known offset from the
thread pointer. The layout depends on the architecture: ARM uses
"variant 1" (the segment follows the thread pointer), while x86 uses
"variant 2" (the segment precedes it). Because these layouts are
incompatible, Bionic generally needs to allocate its TLS slots differently
between different architectures.

To allow per-architecture TLS slots:
 - Replace the TLS_SLOT_xxx enumerators with macros. New ARM slots are
   generally negative, while new x86 slots are generally positive.
 - Define a bionic_tcb struct that provides two things:
    - a void* raw_slots_storage[BIONIC_TLS_SLOTS] field
    - an inline accessor function: void*& tls_slot(size_t tpindex);

For ELF TLS, it's necessary to allocate a temporary TCB (i.e. TLS slots),
because the runtime linker doesn't know how large the static TLS area is
until after it has loaded all of the initial shared libraries (solibs).

To accommodate Golang, it's necessary to allocate the pthread keys at a
fixed, small, positive offset from the thread pointer.

This CL moves the pthread keys into bionic_tls, then allocates a single
mapping per thread that looks like so:
 - stack guard
 - stack [omitted for main thread and with pthread_attr_setstack]
 - static TLS:
    - bionic_tcb [exec TLS will either precede or follow the TCB]
    - bionic_tls [prefixed by the pthread keys]
    - [solib TLS segments will be placed here]
 - guard page

As before, if the new mapping includes a stack, the pthread_internal_t
is allocated on it.

At startup, Bionic allocates a temporary bionic_tcb object on the stack,
then allocates a temporary bionic_tls object using mmap. This mmap is
delayed because the linker can't currently call async_safe_fatal() before
relocating itself.

Later, Bionic allocates a stack-less thread mapping for the main thread,
and copies slots from the temporary TCB to the new TCB.
(See the bionic_tcb::copy_from_bootstrap and
bionic_tls::copy_from_bootstrap methods.)

Bug: http://b/78026329
Test: bionic unit tests
Test: verify that a Golang app still works
Test: verify that a Golang app crashes if bionic_{tls,tcb} are swapped
Merged-In: I6543063752f4ec8ef6dc9c7f2a06ce2a18fc5af3
Change-Id: I6543063752f4ec8ef6dc9c7f2a06ce2a18fc5af3
(cherry picked from commit 1e660b70da625fcbf1e43dfae09b7b4817fa1660)
diff --git a/libc/bionic/__libc_init_main_thread.cpp b/libc/bionic/__libc_init_main_thread.cpp
index 5abdc07..2b90c90 100644
--- a/libc/bionic/__libc_init_main_thread.cpp
+++ b/libc/bionic/__libc_init_main_thread.cpp
@@ -28,9 +28,12 @@
 
 #include "libc_init_common.h"
 
+#include <async_safe/log.h>
+
 #include "private/KernelArgumentBlock.h"
 #include "private/bionic_arc4random.h"
 #include "private/bionic_defs.h"
+#include "private/bionic_elf_tls.h"
 #include "private/bionic_globals.h"
 #include "private/bionic_ssp.h"
 #include "pthread_internal.h"
@@ -43,11 +46,6 @@
 
 static pthread_internal_t main_thread;
 
-__attribute__((no_sanitize("hwaddress")))
-pthread_internal_t* __get_main_thread() {
-  return &main_thread;
-}
-
 // Setup for the main thread. For dynamic executables, this is called by the
 // linker _before_ libc is mapped in memory. This means that all writes to
 // globals from this function will apply to linker-private copies and will not
@@ -69,35 +67,28 @@
 // linker, the linker binary hasn't been relocated yet, so certain kinds of code
 // are hazardous, such as accessing non-hidden global variables.
 __BIONIC_WEAK_FOR_NATIVE_BRIDGE
-void __libc_init_main_thread_early(KernelArgumentBlock& args) {
+extern "C" void __libc_init_main_thread_early(const KernelArgumentBlock& args,
+                                              bionic_tcb* temp_tcb) {
   __libc_shared_globals()->auxv = args.auxv;
 #if defined(__i386__)
-  __libc_init_sysinfo();
+  __libc_init_sysinfo(); // uses AT_SYSINFO auxv entry
 #endif
-  __set_tls(main_thread.tls);
-  __init_tls(&main_thread);
+  __init_tcb(temp_tcb, &main_thread);
+  __set_tls(&temp_tcb->tls_slot(0));
   main_thread.tid = __getpid();
   main_thread.set_cached_pid(main_thread.tid);
 }
 
 // Finish initializing the main thread.
 __BIONIC_WEAK_FOR_NATIVE_BRIDGE
-void __libc_init_main_thread_late() {
-  main_thread.bionic_tls = __allocate_bionic_tls();
-  if (main_thread.bionic_tls == nullptr) {
-    // Avoid strerror because it might need bionic_tls.
-    async_safe_fatal("failed to allocate bionic_tls: error %d", errno);
-  }
+extern "C" void __libc_init_main_thread_late() {
+  __init_bionic_tls_ptrs(__get_bionic_tcb(), __allocate_temp_bionic_tls());
 
   // Tell the kernel to clear our tid field when we exit, so we're like any other pthread.
+  // For threads created by pthread_create, this setup happens during the clone syscall (i.e.
+  // CLONE_CHILD_CLEARTID).
   __set_tid_address(&main_thread.tid);
 
-  // We don't want to free the main thread's stack even when the main thread exits
-  // because things like environment variables with global scope live on it.
-  // We also can't free the pthread_internal_t itself, since it is a static variable.
-  // The main thread has no mmap allocated space for stack or pthread_internal_t.
-  main_thread.mmap_size = 0;
-
   pthread_attr_init(&main_thread.attr);
   // We don't want to explicitly set the main thread's scheduler attributes (http://b/68328561).
   pthread_attr_setinheritsched(&main_thread.attr, PTHREAD_INHERIT_SCHED);
@@ -110,9 +101,40 @@
   // before we initialize the TLS. Dynamic executables will initialize their copy of the global
   // stack protector from the one in the main thread's TLS.
   __libc_safe_arc4random_buf(&__stack_chk_guard, sizeof(__stack_chk_guard));
-  __init_tls_stack_guard(&main_thread);
+  __init_tcb_stack_guard(__get_bionic_tcb());
 
   __init_thread(&main_thread);
 
   __init_additional_stacks(&main_thread);
 }
+
+// Once all ELF modules are loaded, allocate the final copy of the main thread's
+// static TLS memory.
+__BIONIC_WEAK_FOR_NATIVE_BRIDGE
+extern "C" void __libc_init_main_thread_final() {
+  bionic_tcb* temp_tcb = __get_bionic_tcb();
+  bionic_tls* temp_tls = &__get_bionic_tls();
+
+  // Allocate the main thread's static TLS. (This mapping doesn't include a
+  // stack.)
+  ThreadMapping mapping = __allocate_thread_mapping(0, PTHREAD_GUARD_SIZE);
+  if (mapping.mmap_base == nullptr) {
+    async_safe_fatal("failed to mmap main thread static TLS: %s", strerror(errno));
+  }
+
+  const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout;
+  auto new_tcb = reinterpret_cast<bionic_tcb*>(mapping.static_tls + layout.offset_bionic_tcb());
+  auto new_tls = reinterpret_cast<bionic_tls*>(mapping.static_tls + layout.offset_bionic_tls());
+
+  new_tcb->copy_from_bootstrap(temp_tcb);
+  new_tls->copy_from_bootstrap(temp_tls);
+  __init_tcb(new_tcb, &main_thread);
+  __init_bionic_tls_ptrs(new_tcb, new_tls);
+
+  main_thread.mmap_base = mapping.mmap_base;
+  main_thread.mmap_size = mapping.mmap_size;
+
+  __set_tls(&new_tcb->tls_slot(0));
+
+  __free_temp_bionic_tls(temp_tls);
+}