Reorganize static TLS memory for ELF TLS

For ELF TLS "local-exec" accesses, the static linker assumes that an
executable's TLS segment is located at a statically-known offset from the
thread pointer (i.e. "variant 1" for ARM, where the segment follows the
thread pointer, and "variant 2" for x86, where it precedes it). Because
these layouts are incompatible, Bionic generally needs to allocate its
TLS slots differently across architectures.

To allow per-architecture TLS slots:
 - Replace the TLS_SLOT_xxx enumerators with macros. New ARM slots are
   generally negative, while new x86 slots are generally positive.
 - Define a bionic_tcb struct that provides two things:
    - a void* raw_slots_storage[BIONIC_TLS_SLOTS] field
    - an inline accessor function: void*& tls_slot(size_t tpindex);
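A sketch of that struct, where BIONIC_TLS_SLOTS and MIN_TLS_SLOT are bionic's
existing slot-count and minimum-slot-index macros and the slot-to-storage
translation is inferred from __get_bionic_tcb() in the header diff below:

  struct bionic_tcb {
    // Backing storage for every bionic slot. The thread pointer does not point
    // at index 0 of this array; it points at the entry for TP-relative slot 0.
    void* raw_slots_storage[BIONIC_TLS_SLOTS];

    // Map a TP-relative slot index (generally negative on ARM, positive on
    // x86) to its storage. MIN_TLS_SLOT is the lowest slot index in use.
    void*& tls_slot(size_t tpindex) {
      return raw_slots_storage[tpindex - MIN_TLS_SLOT];
    }
  };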

For ELF TLS, it's necessary to allocate a temporary TCB (i.e. TLS slots),
because the runtime linker doesn't know how large the static TLS area is
until after it has loaded all of the initial solibs.

To accommodate Golang, it's necessary to allocate the pthread keys at a
fixed, small, positive offset from the thread pointer.

This CL moves the pthread keys into bionic_tls, then allocates a single
mapping per thread that looks like so:
 - stack guard
 - stack [omitted for main thread and with pthread_attr_setstack]
 - static TLS:
    - bionic_tcb [exec TLS will either precede or succeed the TCB]
    - bionic_tls [prefixed by the pthread keys]
    - [solib TLS segments will be placed here]
 - guard page
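A sketch of how such a mapping might be carved up, using the ThreadMapping
struct declared in the header diff below. Alignment, error handling, and the
real static TLS size calculation are simplified, and static_tls_size is a
stand-in argument rather than a real bionic parameter:

  #include <stddef.h>
  #include <sys/mman.h>
  #include <unistd.h>

  struct ThreadMapping {  // mirrors the declaration added to pthread_internal.h
    char* mmap_base;
    size_t mmap_size;
    char* static_tls;
    char* stack_base;
    char* stack_top;
  };

  // Sketch only: reserve one PROT_NONE region, then open up everything between
  // the leading stack guard and the trailing guard page.
  static ThreadMapping allocate_thread_mapping_sketch(size_t stack_size,
                                                      size_t stack_guard_size,
                                                      size_t static_tls_size) {
    ThreadMapping m = {};
    m.mmap_size = stack_guard_size + stack_size + static_tls_size + getpagesize();
    m.mmap_base = static_cast<char*>(mmap(nullptr, m.mmap_size, PROT_NONE,
                                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
    mprotect(m.mmap_base + stack_guard_size, stack_size + static_tls_size,
             PROT_READ | PROT_WRITE);

    m.stack_base = m.mmap_base;                              // guard + stack
    m.static_tls = m.mmap_base + stack_guard_size + stack_size;
    m.stack_top  = m.static_tls;                             // stack grows down
    return m;
  }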

As before, if the new mapping includes a stack, the pthread_internal_t
is allocated on it.

At startup, Bionic allocates a temporary bionic_tcb object on the stack,
then allocates a temporary bionic_tls object using mmap. This mmap is
delayed because the linker can't currently call async_safe_fatal() before
relocating itself.
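A sketch of the temporary allocation pair declared in the header diff below;
the real versions presumably round sizeof(bionic_tls) up to a page multiple
and call async_safe_fatal() on failure, which is exactly why this mmap has
to wait:

  #include <sys/mman.h>

  // Sketch: bionic_tls comes from bionic's private headers. MAP_ANONYMOUS
  // memory is already zero-filled, so no further initialization is done here.
  bionic_tls* __allocate_temp_bionic_tls() {
    void* ptr = mmap(nullptr, sizeof(bionic_tls), PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return static_cast<bionic_tls*>(ptr);
  }

  void __free_temp_bionic_tls(bionic_tls* tls) {
    munmap(tls, sizeof(bionic_tls));
  }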

Later, Bionic allocates a stack-less thread mapping for the main thread,
and copies slots from the temporary TCB to the new TCB.
(See *::copy_from_bootstrap methods.)
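For the TCB, that copy can be as simple as the following sketch (the body is
an assumption; the point is only that the slots survive the move, with
pointers such as TLS_SLOT_BIONIC_TLS re-pointed afterwards via
__init_bionic_tls_ptrs()):

  // Sketch: member of bionic_tcb. Copy every slot wholesale; slots that must
  // refer to the new mapping are fixed up by the caller afterwards.
  void copy_from_bootstrap(const bionic_tcb* boot) {
    *this = *boot;
  }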

Bug: http://b/78026329
Test: bionic unit tests
Test: verify that a Golang app still works
Test: verify that a Golang app crashes if bionic_{tls,tcb} are swapped
Merged-In: I6543063752f4ec8ef6dc9c7f2a06ce2a18fc5af3
Change-Id: I6543063752f4ec8ef6dc9c7f2a06ce2a18fc5af3
(cherry picked from commit 1e660b70da625fcbf1e43dfae09b7b4817fa1660)
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 4c13dcb..27ab3df 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -52,12 +52,6 @@
 #define PTHREAD_ATTR_FLAG_INHERIT 0x00000004
 #define PTHREAD_ATTR_FLAG_EXPLICIT 0x00000008
 
-class pthread_key_data_t {
- public:
-  uintptr_t seq; // Use uintptr_t just for alignment, as we use pointer below.
-  void* data;
-};
-
 enum ThreadJoinState {
   THREAD_NOT_JOINED,
   THREAD_EXITED_NOT_JOINED,
@@ -131,6 +125,7 @@
 
   Lock startup_handshake_lock;
 
+  void* mmap_base;
   size_t mmap_size;
 
   thread_local_dtor* thread_local_dtors;
@@ -146,42 +141,44 @@
   bionic_tls* bionic_tls;
 
   int errno_value;
-
-  // The thread pointer (__get_tls()) points at this field. This field must come last so that
-  // an executable's TLS segment can be allocated at a fixed offset after the thread pointer.
-  void* tls[BIONIC_TLS_SLOTS];
-
-  // The golang runtime currently expects this field to come after the slots.
-  pthread_key_data_t key_data[BIONIC_PTHREAD_KEY_COUNT];
 };
 
-__LIBC_HIDDEN__ void __init_tls(pthread_internal_t* thread);
-__LIBC_HIDDEN__ void __init_tls_stack_guard(pthread_internal_t* thread);
-__LIBC_HIDDEN__ bionic_tls* __allocate_bionic_tls();
+struct ThreadMapping {
+  char* mmap_base;
+  size_t mmap_size;
+
+  char* static_tls;
+  char* stack_base;
+  char* stack_top;
+};
+
+__LIBC_HIDDEN__ void __init_tcb(bionic_tcb* tcb, pthread_internal_t* thread);
+__LIBC_HIDDEN__ void __init_tcb_stack_guard(bionic_tcb* tcb);
+__LIBC_HIDDEN__ void __init_bionic_tls_ptrs(bionic_tcb* tcb, bionic_tls* tls);
+__LIBC_HIDDEN__ bionic_tls* __allocate_temp_bionic_tls();
+__LIBC_HIDDEN__ void __free_temp_bionic_tls(bionic_tls* tls);
 __LIBC_HIDDEN__ void __init_additional_stacks(pthread_internal_t*);
 __LIBC_HIDDEN__ int __init_thread(pthread_internal_t* thread);
+__LIBC_HIDDEN__ ThreadMapping __allocate_thread_mapping(size_t stack_size, size_t stack_guard_size);
 
 __LIBC_HIDDEN__ pthread_t           __pthread_internal_add(pthread_internal_t* thread);
 __LIBC_HIDDEN__ pthread_internal_t* __pthread_internal_find(pthread_t pthread_id);
 __LIBC_HIDDEN__ void                __pthread_internal_remove(pthread_internal_t* thread);
 __LIBC_HIDDEN__ void                __pthread_internal_remove_and_free(pthread_internal_t* thread);
 
+static inline __always_inline bionic_tcb* __get_bionic_tcb() {
+  return reinterpret_cast<bionic_tcb*>(&__get_tls()[MIN_TLS_SLOT]);
+}
+
 // Make __get_thread() inlined for performance reason. See http://b/19825434.
 static inline __always_inline pthread_internal_t* __get_thread() {
-  void** tls = __get_tls();
-  if (__predict_true(tls)) {
-    return reinterpret_cast<pthread_internal_t*>(tls[TLS_SLOT_THREAD_ID]);
-  }
-
-  // This happens when called during libc initialization before TLS has been initialized.
-  return nullptr;
+  return static_cast<pthread_internal_t*>(__get_tls()[TLS_SLOT_THREAD_ID]);
 }
 
 static inline __always_inline bionic_tls& __get_bionic_tls() {
-  return *__get_thread()->bionic_tls;
+  return *static_cast<bionic_tls*>(__get_tls()[TLS_SLOT_BIONIC_TLS]);
 }
 
-extern __LIBC_HIDDEN__ pthread_internal_t* __get_main_thread();
 extern "C" __LIBC_HIDDEN__ int __set_tls(void* ptr);
 
 __LIBC_HIDDEN__ void pthread_key_clean_all(void);