memtag_stack: vfork and longjmp support.

With memtag_stack, each function is responsible for cleaning up
allocation tags for its stack frame. Allocation tags for anything below
SP must match the address tag in SP.

Both vfork and longjmp implement non-local control transfer which
abandons part of the stack without proper cleanup. Update allocation
tags:
* For longjmp, we know both source and destination values of SP.
* For vfork, record the child's SP on entry to _exit() or exec*() - the
  only valid ways for the child to give control back to the parent
  according to POSIX - and, once the parent resumes, reset tags from
  that address up to the parent's SP.

This is not 100% solid and can be confused by a number of hopefully
uncommon conditions:
* Segmented stacks.
* Longjmp from sigaltstack into the main stack.
* Some kind of userspace thread implementation using longjmp (that's UB,
  longjmp can only return to the caller on the current stack).
* and other strange things.

This change adds a sanity limit (128 MiB) on the size of the tag
cleanup; a larger span is treated as a fatal error. Also, this logic is
only activated in binaries that carry the NT_MEMTAG_STACK note (set by
-fsanitize=memtag-stack), which is meant as a debugging configuration,
is not compatible with pre-armv9 CPUs, and should not be set on
production code.

Bug: b/174878242
Test: fvp_mini with ToT LLVM (more tests in a separate change)

Change-Id: Ibef8b2fc5a6ce85c8e562dead1019964d9f6b80b
diff --git a/libc/SYSCALLS.TXT b/libc/SYSCALLS.TXT
index 45b2a1b..9aeb07c 100644
--- a/libc/SYSCALLS.TXT
+++ b/libc/SYSCALLS.TXT
@@ -26,7 +26,7 @@
 # This file is processed by a python script named gensyscalls.py, run via
 # genrules in Android.bp.
 
-int     execve(const char*, char* const*, char* const*)  all
+int     __execve:execve(const char*, char* const*, char* const*)  all
 
 uid_t   getuid:getuid32()         lp32
 uid_t   getuid:getuid()           lp64
@@ -320,7 +320,7 @@
 
 int __eventfd:eventfd2(unsigned int, int)  all
 
-void _exit|_Exit:exit_group(int)  all
+void __exit_group:exit_group(int)  all
 void __exit:exit(int)  all
 
 int inotify_init1(int)  all
diff --git a/libc/arch-arm64/bionic/setjmp.S b/libc/arch-arm64/bionic/setjmp.S
index d2fafdb..8e00b56 100644
--- a/libc/arch-arm64/bionic/setjmp.S
+++ b/libc/arch-arm64/bionic/setjmp.S
@@ -194,7 +194,7 @@
   cmp x2, x12
   bne __bionic_setjmp_checksum_mismatch
 
-#if __has_feature(hwaddress_sanitizer)
+  // Update stack memory tags (MTE + hwasan).
   stp x0, x30, [sp, #-16]!
   .cfi_adjust_cfa_offset 16
   .cfi_rel_offset x0, 0
@@ -206,7 +206,7 @@
   bic x2, x2, #1
   ldr x0, [x0, #(_JB_X30_SP  * 8 + 8)]
   eor x0, x0, x2
-  bl __hwasan_handle_longjmp
+  bl memtag_handle_longjmp
 
   mov x1, x19 // Restore 'value'.
   // Restore original x0 and lr.
@@ -214,7 +214,6 @@
   .cfi_adjust_cfa_offset -16
   .cfi_restore x0
   .cfi_restore x30
-#endif
 
   // Do we need to restore the signal mask?
   ldr x2, [x0, #(_JB_SIGFLAG * 8)]
diff --git a/libc/arch-arm64/bionic/vfork.S b/libc/arch-arm64/bionic/vfork.S
index df7b063..9878e8d 100644
--- a/libc/arch-arm64/bionic/vfork.S
+++ b/libc/arch-arm64/bionic/vfork.S
@@ -45,6 +45,9 @@
     ldr     w10, [x9, #20]
     str     w0, [x9, #20]
 
+    // Clear vfork_child_stack_bottom (offset 776, see tests/struct_layout_test.cpp).
+    str     xzr, [x9, #776]
+
     mov     x0, #(CLONE_VM | CLONE_VFORK | SIGCHLD)
     mov     x1, xzr
     mov     x2, xzr
@@ -62,9 +65,6 @@
     cneg    x0, x0, hi
     b.hi    __set_errno_internal
 
-#if __has_feature(hwaddress_sanitizer)
-    cbz x0, .L_exit
-
     // Clean up stack shadow in the parent process.
     // https://github.com/google/sanitizers/issues/925
     paciasp
@@ -75,7 +75,7 @@
     .cfi_rel_offset x30, 8
 
     add x0, sp, #16
-    bl __hwasan_handle_vfork
+    bl memtag_handle_vfork
 
     ldp x0, x30, [sp], #16
     .cfi_adjust_cfa_offset -16
@@ -84,8 +84,6 @@
     autiasp
     .cfi_negate_ra_state
 
-#endif
-
 .L_exit:
     ret
 END(vfork)
diff --git a/libc/bionic/exec.cpp b/libc/bionic/exec.cpp
index fd2c401..40612e7 100644
--- a/libc/bionic/exec.cpp
+++ b/libc/bionic/exec.cpp
@@ -39,10 +39,12 @@
 #include <string.h>
 #include <unistd.h>
 
-#include "private/__bionic_get_shell_path.h"
 #include "private/FdPath.h"
+#include "private/__bionic_get_shell_path.h"
+#include "pthread_internal.h"
 
 extern "C" char** environ;
+extern "C" int __execve(const char* pathname, char* const* argv, char* const* envp);
 
 enum { ExecL, ExecLE, ExecLP };
 
@@ -181,3 +183,15 @@
   if (errno == ENOENT) errno = EBADF;
   return -1;
 }
+
+// execve() entry point, layered over the raw __execve syscall stub.
+// Before making the syscall it saves this function's frame address in the calling thread's
+// vfork_child_stack_bottom field; memtag_handle_vfork reads that field back as the child's SP.
+// Compiled with no_sanitize("memtag").
+__attribute__((no_sanitize("memtag"))) int execve(const char* pathname, char* const* argv,
+                                                  char* const* envp) {
+  void* frame = __builtin_frame_address(0);
+  auto* self = __get_thread();
+  self->vfork_child_stack_bottom = frame;
+  return __execve(pathname, argv, envp);
+}
diff --git a/libc/bionic/exit.cpp b/libc/bionic/exit.cpp
index a5aed78..52fd193 100644
--- a/libc/bionic/exit.cpp
+++ b/libc/bionic/exit.cpp
@@ -30,9 +30,24 @@
 #include <unistd.h>
 
 #include "private/bionic_defs.h"
+#include "pthread_internal.h"
 
 extern "C" void __cxa_finalize(void* dso_handle);
 extern "C" void __cxa_thread_finalize();
+extern "C" __noreturn void __exit_group(int status);
+
+// _exit() entry point, layered over the raw __exit_group syscall stub (which does not return).
+// Before making the syscall it saves this function's frame address in the calling thread's
+// vfork_child_stack_bottom field; memtag_handle_vfork reads that field back as the child's SP.
+// Compiled with no_sanitize("memtag").
+__attribute__((no_sanitize("memtag"))) void _exit(int status) {
+  void* frame = __builtin_frame_address(0);
+  auto* self = __get_thread();
+  self->vfork_child_stack_bottom = frame;
+  __exit_group(status);
+}
+
+__strong_alias(_Exit, _exit);
 
 __BIONIC_WEAK_FOR_NATIVE_BRIDGE
 void exit(int status) {
diff --git a/libc/bionic/heap_tagging.cpp b/libc/bionic/heap_tagging.cpp
index aac2048..78d21b0 100644
--- a/libc/bionic/heap_tagging.cpp
+++ b/libc/bionic/heap_tagging.cpp
@@ -32,6 +32,8 @@
 
 #include <bionic/pthread_internal.h>
 #include <platform/bionic/malloc.h>
+#include <sanitizer/hwasan_interface.h>
+#include <sys/auxv.h>
 
 extern "C" void scudo_malloc_disable_memory_tagging();
 extern "C" void scudo_malloc_set_track_allocation_stacks(int);
@@ -170,3 +172,88 @@
 
   return true;
 }
+
+#ifdef __aarch64__
+// Writes MTE allocation tags over [from, to) one 16-byte granule at a time. Every granule gets
+// the address tag carried in `from` (STG with post-increment, so the tag bits stay unchanged).
+// An empty or inverted range is a no-op.
+static inline __attribute__((no_sanitize("memtag"))) void untag_memory(void* from, void* to) {
+  // The asm loop is bottom-tested: without this guard it would always retag one granule, even
+  // when there is nothing to clean up.
+  if (reinterpret_cast<uintptr_t>(from) >= reinterpret_cast<uintptr_t>(to)) {
+    return;
+  }
+  __asm__ __volatile__(
+      ".arch_extension mte\n"
+      "1:\n"
+      "stg %[Ptr], [%[Ptr]], #16\n"
+      "cmp %[Ptr], %[End]\n"
+      // Addresses are unsigned, so use the unsigned "lower" condition rather than signed "lt".
+      "b.lo 1b\n"
+      : [Ptr] "+&r"(from)
+      : [End] "r"(to)
+      // `cmp` overwrites the condition flags, which must be declared as clobbered.
+      : "memory", "cc");
+}
+#endif
+
+#ifdef __aarch64__
+// 128 MiB of stack should be enough for anybody.
+static constexpr size_t kUntagLimit = 128 * 1024 * 1024;
+#endif  // __aarch64__
+
+// Stack tag cleanup hook for longjmp; `sp_dst` is the SP value being restored.
+// On arm64 with memtag_stack enabled, retags the memory from this function's frame address up to
+// `sp_dst`, or calls async_safe_fatal if that span is larger than kUntagLimit.
+// In HWASan builds, __hwasan_handle_longjmp(sp_dst) is called afterwards.
+extern "C" __LIBC_HIDDEN__ __attribute__((no_sanitize("memtag"))) void memtag_handle_longjmp(
+    void* sp_dst __unused) {
+#ifdef __aarch64__
+  if (__libc_globals->memtag_stack) {
+    void* sp = __builtin_frame_address(0);
+    size_t distance = reinterpret_cast<uintptr_t>(sp_dst) - reinterpret_cast<uintptr_t>(sp);
+    if (distance > kUntagLimit) {
+      async_safe_fatal(
+          "memtag_handle_longjmp: stack adjustment too large! %p -> %p, distance %zx > %zx\n", sp,
+          sp_dst, distance, kUntagLimit);
+    } else {
+      untag_memory(sp, sp_dst);
+    }
+  }
+#endif  // __aarch64__
+
+#if __has_feature(hwaddress_sanitizer)
+  __hwasan_handle_longjmp(sp_dst);
+#endif  // __has_feature(hwaddress_sanitizer)
+}
+
+// Stack tag cleanup hook for vfork; `sp` is the SP value passed in by the vfork implementation.
+// On arm64 with memtag_stack enabled, takes the thread's vfork_child_stack_bottom (resetting the
+// field to nullptr) and retags the memory from that address up to `sp`. Calls async_safe_fatal
+// if no child SP was recorded or if the span is larger than kUntagLimit.
+// In HWASan builds, __hwasan_handle_vfork(sp) is called afterwards.
+extern "C" __LIBC_HIDDEN__ __attribute__((no_sanitize("memtag"), no_sanitize("hwaddress"))) void
+memtag_handle_vfork(void* sp __unused) {
+#ifdef __aarch64__
+  if (__libc_globals->memtag_stack) {
+    void* child_sp = __get_thread()->vfork_child_stack_bottom;
+    __get_thread()->vfork_child_stack_bottom = nullptr;
+    if (child_sp) {
+      size_t distance = reinterpret_cast<uintptr_t>(sp) - reinterpret_cast<uintptr_t>(child_sp);
+      if (distance > kUntagLimit) {
+        async_safe_fatal(
+            "memtag_handle_vfork: stack adjustment too large! %p -> %p, distance %zx > %zx\n",
+            child_sp, sp, distance, kUntagLimit);
+      } else {
+        untag_memory(child_sp, sp);
+      }
+    } else {
+      async_safe_fatal("memtag_handle_vfork: child SP unknown\n");
+    }
+  }
+#endif  // __aarch64__
+
+#if __has_feature(hwaddress_sanitizer)
+  __hwasan_handle_vfork(sp);
+#endif  // __has_feature(hwaddress_sanitizer)
+}
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 071a5bc..7222b62 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -160,6 +160,13 @@
   bionic_tls* bionic_tls;
 
   int errno_value;
+
+  // The last observed value of SP in a vfork child process, recorded by execve() and _exit().
+  // The part of the stack between this address and the value of SP when the vfork parent process
+  // regains control may have stale MTE tags and needs cleanup. This field is only meaningful while
+  // the parent is waiting for the vfork child to return control by calling either exec*() or
+  // _exit().
+  void* vfork_child_stack_bottom;
 };
 
 struct ThreadMapping {
diff --git a/tests/struct_layout_test.cpp b/tests/struct_layout_test.cpp
index 0123ed9..10c100a 100644
--- a/tests/struct_layout_test.cpp
+++ b/tests/struct_layout_test.cpp
@@ -30,7 +30,7 @@
 #define CHECK_OFFSET(name, field, offset) \
     check_offset(#name, #field, offsetof(name, field), offset);
 #ifdef __LP64__
-  CHECK_SIZE(pthread_internal_t, 776);
+  CHECK_SIZE(pthread_internal_t, 784);
   CHECK_OFFSET(pthread_internal_t, next, 0);
   CHECK_OFFSET(pthread_internal_t, prev, 8);
   CHECK_OFFSET(pthread_internal_t, tid, 16);
@@ -55,6 +55,7 @@
   CHECK_OFFSET(pthread_internal_t, dlerror_buffer, 248);
   CHECK_OFFSET(pthread_internal_t, bionic_tls, 760);
   CHECK_OFFSET(pthread_internal_t, errno_value, 768);
+  CHECK_OFFSET(pthread_internal_t, vfork_child_stack_bottom, 776);
   CHECK_SIZE(bionic_tls, 12200);
   CHECK_OFFSET(bionic_tls, key_data, 0);
   CHECK_OFFSET(bionic_tls, locale, 2080);
@@ -72,7 +73,7 @@
   CHECK_OFFSET(bionic_tls, bionic_systrace_disabled, 12193);
   CHECK_OFFSET(bionic_tls, padding, 12194);
 #else
-  CHECK_SIZE(pthread_internal_t, 668);
+  CHECK_SIZE(pthread_internal_t, 672);
   CHECK_OFFSET(pthread_internal_t, next, 0);
   CHECK_OFFSET(pthread_internal_t, prev, 4);
   CHECK_OFFSET(pthread_internal_t, tid, 8);
@@ -97,6 +98,7 @@
   CHECK_OFFSET(pthread_internal_t, dlerror_buffer, 148);
   CHECK_OFFSET(pthread_internal_t, bionic_tls, 660);
   CHECK_OFFSET(pthread_internal_t, errno_value, 664);
+  CHECK_OFFSET(pthread_internal_t, vfork_child_stack_bottom, 668);
   CHECK_SIZE(bionic_tls, 11080);
   CHECK_OFFSET(bionic_tls, key_data, 0);
   CHECK_OFFSET(bionic_tls, locale, 1040);