bionic: Allocate a shadow call stack for each thread.

Bug: 112907825
Change-Id: I7c1479a0cd68696739bf6aa5e0700ba4f2a137ec
Merged-In: I7c1479a0cd68696739bf6aa5e0700ba4f2a137ec
diff --git a/libc/bionic/__libc_init_main_thread.cpp b/libc/bionic/__libc_init_main_thread.cpp
index 758b295..be9d32e 100644
--- a/libc/bionic/__libc_init_main_thread.cpp
+++ b/libc/bionic/__libc_init_main_thread.cpp
@@ -101,5 +101,5 @@
 
   __init_thread(&main_thread);
 
-  __init_alternate_signal_stack(&main_thread);
+  __init_additional_stacks(&main_thread);
 }
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 543fdc5..3ba787b 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -32,6 +32,7 @@
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/prctl.h>
+#include <sys/random.h>
 #include <unistd.h>
 
 #include "pthread_internal.h"
@@ -86,7 +87,7 @@
   thread->tls[TLS_SLOT_STACK_GUARD] = reinterpret_cast<void*>(__stack_chk_guard);
 }
 
-void __init_alternate_signal_stack(pthread_internal_t* thread) {
+static void __init_alternate_signal_stack(pthread_internal_t* thread) {
   // Create and set an alternate signal stack.
   void* stack_base = mmap(nullptr, SIGNAL_STACK_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
   if (stack_base != MAP_FAILED) {
@@ -109,6 +110,32 @@
   }
 }
 
+static void __init_shadow_call_stack(pthread_internal_t* thread) {
+  (void)thread;  // Unused unless building for arm64.
+#ifdef __aarch64__
+  char* scs_guard_region = reinterpret_cast<char*>(
+      mmap(nullptr, SCS_GUARD_REGION_SIZE, 0, MAP_PRIVATE | MAP_ANON, -1, 0));
+  thread->shadow_call_stack_guard_region = scs_guard_region;
+
+  // Pick a page-aligned offset for the stack. The final slot is never chosen, so there is
+  // always inaccessible guard memory directly after the stack to catch overflows. We can't use
+  // arc4random_uniform in init because /dev/urandom might not have been created yet.
+  size_t scs_offset =
+      (getpid() == 1) ? 0 : (arc4random_uniform(SCS_GUARD_REGION_SIZE / SCS_SIZE - 1) * SCS_SIZE);
+
+  // Map the stack and store its address in register x18; this is deliberately the only place the
+  // address is stored. NOTE(review): neither mmap result is checked against MAP_FAILED.
+  __asm__ __volatile__(
+      "mov x18, %0" ::"r"(mmap(scs_guard_region + scs_offset, SCS_SIZE, PROT_READ | PROT_WRITE,
+                               MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0)));
+#endif
+}
+
+void __init_additional_stacks(pthread_internal_t* thread) {
+  __init_alternate_signal_stack(thread);  // Creates and sets the alternate signal stack.
+  __init_shadow_call_stack(thread);       // Sets up the shadow call stack (arm64 only).
+}
+
 int __init_thread(pthread_internal_t* thread) {
   thread->cleanup_stack = nullptr;
 
@@ -252,7 +279,7 @@
   // accesses previously made by the creating thread are visible to us.
   thread->startup_handshake_lock.lock();
 
-  __init_alternate_signal_stack(thread);
+  __init_additional_stacks(thread);
 
   void* result = thread->start_routine(thread->start_routine_arg);
   pthread_exit(result);
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index 220f7a0..010cc06 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -103,6 +103,11 @@
     thread->alternate_signal_stack = nullptr;
   }
 
+#ifdef __aarch64__
+  // Free the shadow call stack and guard pages.
+  munmap(thread->shadow_call_stack_guard_region, SCS_GUARD_REGION_SIZE);
+#endif
+
   ThreadJoinState old_state = THREAD_NOT_JOINED;
   while (old_state == THREAD_NOT_JOINED &&
          !atomic_compare_exchange_weak(&thread->join_state, &old_state, THREAD_EXITED_NOT_JOINED)) {
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 65ec5ff..b7173a3 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -106,6 +106,29 @@
 
   void* alternate_signal_stack;
 
+  // The start address of the shadow call stack's guard region (arm64 only).
+  // This address is only used to deallocate the shadow call stack on thread
+  // exit; the address of the stack itself is stored only in the x18 register.
+  // Because the protection offered by SCS relies on the secrecy of the stack
+  // address, storing the address here weakens the protection, but only
+  // slightly, because it is relatively easy for an attacker to discover the
+  // address of the guard region anyway (e.g. it can be discovered by reference
+  // to other allocations), but not the stack itself, which is <0.1% of the size
+  // of the guard region.
+  //
+  // There are at least two other options for discovering the start address of
+  // the guard region on thread exit, but they are not as simple as storing in
+  // TLS.
+  // 1) Derive it from the value of the x18 register. This is only possible in
+  //    processes that do not contain legacy code that might clobber x18,
+  //    therefore each process must declare early during process startup whether
+  //    it might load legacy code.
+  // 2) Mark the guard region as such using prctl(PR_SET_VMA_ANON_NAME) and
+  //    discover its address by reading /proc/self/maps. One issue with this is
+  //    that reading /proc/self/maps can race with allocations, so we may need
+  //    code to handle retries.
+  void* shadow_call_stack_guard_region;
+
   Lock startup_handshake_lock;
 
   size_t mmap_size;
@@ -129,7 +152,7 @@
 __LIBC_HIDDEN__ int __init_thread(pthread_internal_t* thread);
 __LIBC_HIDDEN__ bool __init_tls(pthread_internal_t* thread);
 __LIBC_HIDDEN__ void __init_thread_stack_guard(pthread_internal_t* thread);
-__LIBC_HIDDEN__ void __init_alternate_signal_stack(pthread_internal_t*);
+__LIBC_HIDDEN__ void __init_additional_stacks(pthread_internal_t*);
 
 __LIBC_HIDDEN__ pthread_t           __pthread_internal_add(pthread_internal_t* thread);
 __LIBC_HIDDEN__ pthread_internal_t* __pthread_internal_find(pthread_t pthread_id);
@@ -178,6 +201,13 @@
 // Leave room for a guard page in the internally created signal stacks.
 #define SIGNAL_STACK_SIZE (SIGNAL_STACK_SIZE_WITHOUT_GUARD + PTHREAD_GUARD_SIZE)
 
+// Size of the shadow call stack.
+#define SCS_SIZE (8 * 1024)
+
+// The shadow call stack is allocated at a random address within a guard region
+// of this size.
+#define SCS_GUARD_REGION_SIZE (16 * 1024 * 1024)
+
 // Needed by fork.
 __LIBC_HIDDEN__ extern void __bionic_atfork_run_prepare();
 __LIBC_HIDDEN__ extern void __bionic_atfork_run_child();