[MTE] Implement permissive (recoverable) MTE for apps Extends the recoverable native crash handling support to also allow for MTE crashes to be recovered from in apps. Backs onto the existing GWP-ASan recoverable handling. At least for system/core, I've renamed the (now) generic "recoverable crash" variables to remove the notion of GWP-ASan. Permissive MTE should no longer crash an app, and crashes are still visible in the AppExitInfo API and tombstones. Test: atest CtsTaggingHostTestCases Bug: 328793166 Change-Id: I4c6ffa85af0e0d9b72d0ccd606bb6e1ca464cfff

commit: 31407917f8e6c36d8f930a3136fe8a3dd00e4fc9 [log] [tgz]
author: Mitch Phillips <mitchp@google.com> Thu Mar 14 14:18:47 2024 +0100
committer: Mitch Phillips <mitchp@google.com> Tue Mar 19 11:26:02 2024 +0100
tree: a731c026ca8a0c6efb6889a09e684b434f4fb5a9
parent: 851c857420fb6d999460d59784ee7d24061729a8 [diff]
diff --git a/debuggerd/crash_dump.cpp b/debuggerd/crash_dump.cpp
index a23a269..203b485 100644
--- a/debuggerd/crash_dump.cpp
+++ b/debuggerd/crash_dump.cpp

@@ -143,7 +143,7 @@
 }
 
 static bool activity_manager_notify(pid_t pid, int signal, const std::string& amfd_data,
-                                    bool recoverable_gwp_asan_crash) {
+                                    bool recoverable_crash) {
   ATRACE_CALL();
   android::base::unique_fd amfd(socket_local_client(
       "/data/system/ndebugsocket", ANDROID_SOCKET_NAMESPACE_FILESYSTEM, SOCK_STREAM));
@@ -169,7 +169,7 @@
   // Activity Manager protocol:
   //  - 32-bit network-byte-order: pid
   //  - 32-bit network-byte-order: signal number
-  //  - byte: recoverable_gwp_asan_crash
+  //  - byte: recoverable_crash
   //  - bytes: raw text of the dump
   //  - null terminator
 
@@ -185,10 +185,9 @@
     return false;
   }
 
-  uint8_t recoverable_gwp_asan_crash_byte = recoverable_gwp_asan_crash ? 1 : 0;
-  if (!android::base::WriteFully(amfd, &recoverable_gwp_asan_crash_byte,
-                                 sizeof(recoverable_gwp_asan_crash_byte))) {
-    PLOG(ERROR) << "AM recoverable_gwp_asan_crash_byte write failed";
+  uint8_t recoverable_crash_byte = recoverable_crash ? 1 : 0;
+  if (!android::base::WriteFully(amfd, &recoverable_crash_byte, sizeof(recoverable_crash_byte))) {
+    PLOG(ERROR) << "AM recoverable_crash_byte write failed";
     return false;
   }
 
@@ -280,11 +279,11 @@
 
 static void ReadCrashInfo(unique_fd& fd, siginfo_t* siginfo,
                           std::unique_ptr<unwindstack::Regs>* regs, ProcessInfo* process_info,
-                          bool* recoverable_gwp_asan_crash) {
+                          bool* recoverable_crash) {
   std::aligned_storage<sizeof(CrashInfo) + 1, alignof(CrashInfo)>::type buf;
   CrashInfo* crash_info = reinterpret_cast<CrashInfo*>(&buf);
   ssize_t rc = TEMP_FAILURE_RETRY(read(fd.get(), &buf, sizeof(buf)));
-  *recoverable_gwp_asan_crash = false;
+  *recoverable_crash = false;
   if (rc == -1) {
     PLOG(FATAL) << "failed to read target ucontext";
   } else {
@@ -321,7 +320,7 @@
       process_info->scudo_region_info = crash_info->data.d.scudo_region_info;
       process_info->scudo_ring_buffer = crash_info->data.d.scudo_ring_buffer;
       process_info->scudo_ring_buffer_size = crash_info->data.d.scudo_ring_buffer_size;
-      *recoverable_gwp_asan_crash = crash_info->data.d.recoverable_gwp_asan_crash;
+      *recoverable_crash = crash_info->data.d.recoverable_crash;
       process_info->crash_detail_page = crash_info->data.d.crash_detail_page;
       FALLTHROUGH_INTENDED;
     case 1:
@@ -487,7 +486,7 @@
   std::map<pid_t, ThreadInfo> thread_info;
   siginfo_t siginfo;
   std::string error;
-  bool recoverable_gwp_asan_crash = false;
+  bool recoverable_crash = false;
 
   {
     ATRACE_NAME("ptrace");
@@ -539,8 +538,7 @@
 
       if (thread == g_target_thread) {
         // Read the thread's registers along with the rest of the crash info out of the pipe.
-        ReadCrashInfo(input_pipe, &siginfo, &info.registers, &process_info,
-                      &recoverable_gwp_asan_crash);
+        ReadCrashInfo(input_pipe, &siginfo, &info.registers, &process_info, &recoverable_crash);
         info.siginfo = &siginfo;
         info.signo = info.siginfo->si_signo;
 
@@ -670,7 +668,7 @@
   if (fatal_signal) {
     // Don't try to notify ActivityManager if it just crashed, or we might hang until timeout.
     if (thread_info[target_process].thread_name != "system_server") {
-      activity_manager_notify(target_process, signo, amfd_data, recoverable_gwp_asan_crash);
+      activity_manager_notify(target_process, signo, amfd_data, recoverable_crash);
     }
   }
 

diff --git a/debuggerd/handler/debuggerd_handler.cpp b/debuggerd/handler/debuggerd_handler.cpp
index 141723b..ba5ad3f 100644
--- a/debuggerd/handler/debuggerd_handler.cpp
+++ b/debuggerd/handler/debuggerd_handler.cpp

@@ -396,7 +396,7 @@
     ASSERT_SAME_OFFSET(scudo_ring_buffer, scudo_ring_buffer);
     ASSERT_SAME_OFFSET(scudo_ring_buffer_size, scudo_ring_buffer_size);
     ASSERT_SAME_OFFSET(scudo_stack_depot_size, scudo_stack_depot_size);
-    ASSERT_SAME_OFFSET(recoverable_gwp_asan_crash, recoverable_gwp_asan_crash);
+    ASSERT_SAME_OFFSET(recoverable_crash, recoverable_crash);
     ASSERT_SAME_OFFSET(crash_detail_page, crash_detail_page);
 #undef ASSERT_SAME_OFFSET
 
@@ -573,6 +573,7 @@
   }
 
   gwp_asan_callbacks_t gwp_asan_callbacks = {};
+  bool recoverable_gwp_asan_crash = false;
   if (g_callbacks.get_gwp_asan_callbacks != nullptr) {
     // GWP-ASan catches use-after-free and heap-buffer-overflow by using PROT_NONE
     // guard pages, which lead to SEGV. Normally, debuggerd prints a bug report
@@ -587,10 +588,30 @@
         gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report &&
         gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery(info->si_addr)) {
       gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report(info->si_addr);
-      process_info.recoverable_gwp_asan_crash = true;
+      recoverable_gwp_asan_crash = true;
+      process_info.recoverable_crash = true;
     }
   }
 
+  if (info->si_signo == SIGSEGV &&
+      (info->si_code == SEGV_MTESERR || info->si_code == SEGV_MTEAERR) && is_permissive_mte()) {
+    process_info.recoverable_crash = true;
+    // If we are in permissive MTE mode, we do not crash, but instead disable MTE on this thread,
+    // and then let the failing instruction be retried. The second time should work (except
+    // if there is another non-MTE fault).
+    int tagged_addr_ctrl = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
+    if (tagged_addr_ctrl < 0) {
+      fatal_errno("failed to PR_GET_TAGGED_ADDR_CTRL");
+    }
+    tagged_addr_ctrl = (tagged_addr_ctrl & ~PR_MTE_TCF_MASK) | PR_MTE_TCF_NONE;
+    if (prctl(PR_SET_TAGGED_ADDR_CTRL, tagged_addr_ctrl, 0, 0, 0) < 0) {
+      fatal_errno("failed to PR_SET_TAGGED_ADDR_CTRL");
+    }
+    async_safe_format_log(ANDROID_LOG_ERROR, "libc",
+                          "MTE ERROR DETECTED BUT RUNNING IN PERMISSIVE MODE. CONTINUING.");
+    pthread_mutex_unlock(&crash_mutex);
+  }
+
   // If sival_int is ~0, it means that the fallback handler has been called
   // once before and this function is being called again to dump the stack
   // of a specific thread. It is possible that the prctl call might return 1,
@@ -602,7 +623,7 @@
     // you can only set NO_NEW_PRIVS to 1, and the effect should be at worst a single missing
     // ANR trace.
     debuggerd_fallback_handler(info, ucontext, process_info.abort_msg);
-    if (no_new_privs && process_info.recoverable_gwp_asan_crash) {
+    if (no_new_privs && recoverable_gwp_asan_crash) {
       gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
       return;
     }
@@ -679,29 +700,14 @@
     // If the signal is fatal, don't unlock the mutex to prevent other crashing threads from
     // starting to dump right before our death.
     pthread_mutex_unlock(&crash_mutex);
-  } else if (process_info.recoverable_gwp_asan_crash) {
-    gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
+  } else if (process_info.recoverable_crash) {
+    if (recoverable_gwp_asan_crash) {
+      gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
+    }
     pthread_mutex_unlock(&crash_mutex);
   }
 #ifdef __aarch64__
-  else if (info->si_signo == SIGSEGV &&
-           (info->si_code == SEGV_MTESERR || info->si_code == SEGV_MTEAERR) &&
-           is_permissive_mte()) {
-    // If we are in permissive MTE mode, we do not crash, but instead disable MTE on this thread,
-    // and then let the failing instruction be retried. The second time should work (except
-    // if there is another non-MTE fault).
-    int tagged_addr_ctrl = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
-    if (tagged_addr_ctrl < 0) {
-      fatal_errno("failed to PR_GET_TAGGED_ADDR_CTRL");
-    }
-    tagged_addr_ctrl = (tagged_addr_ctrl & ~PR_MTE_TCF_MASK) | PR_MTE_TCF_NONE;
-    if (prctl(PR_SET_TAGGED_ADDR_CTRL, tagged_addr_ctrl, 0, 0, 0) < 0) {
-      fatal_errno("failed to PR_SET_TAGGED_ADDR_CTRL");
-    }
-    async_safe_format_log(ANDROID_LOG_ERROR, "libc",
-                          "MTE ERROR DETECTED BUT RUNNING IN PERMISSIVE MODE. CONTINUING.");
-    pthread_mutex_unlock(&crash_mutex);
-  } else if (info->si_signo == SIGSEGV && info->si_code == SEGV_MTEAERR && getppid() == 1) {
+  else if (info->si_signo == SIGSEGV && info->si_code == SEGV_MTEAERR && getppid() == 1) {
     // Back channel to init (see system/core/init/service.cpp) to signal that
     // this process crashed due to an ASYNC MTE fault and should be considered
     // for upgrade to SYNC mode. We are re-using the ART profiler signal, which
@@ -761,18 +767,7 @@
   debuggerd_register_handlers(&action);
 }
 
-// When debuggerd's signal handler is the first handler called, it's great at
-// handling the recoverable GWP-ASan mode. For apps, sigchain (from libart) is
-// always the first signal handler, and so the following function is what
-// sigchain must call before processing the signal. This allows for processing
-// of a potentially recoverable GWP-ASan crash. If the signal requires GWP-ASan
-// recovery, then dump a report (via the regular debuggerd hanndler), and patch
-// up the allocator, and allow the process to continue (indicated by returning
-// 'true'). If the crash has nothing to do with GWP-ASan, or recovery isn't
-// possible, return 'false'.
-bool debuggerd_handle_signal(int signal_number, siginfo_t* info, void* context) {
-  if (signal_number != SIGSEGV || !signal_has_si_addr(info)) return false;
-
+bool debuggerd_handle_gwp_asan_signal(int signal_number, siginfo_t* info, void* context) {
   if (g_callbacks.get_gwp_asan_callbacks == nullptr) return false;
   gwp_asan_callbacks_t gwp_asan_callbacks = g_callbacks.get_gwp_asan_callbacks();
   if (gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery == nullptr ||
@@ -810,3 +805,33 @@
   pthread_mutex_unlock(&first_crash_mutex);
   return true;
 }
+
+// When debuggerd's signal handler is the first handler called, it's great at
+// handling the recoverable GWP-ASan and permissive MTE modes. For apps,
+// sigchain (from libart) is always the first signal handler, and so the
+// following function is what sigchain must call before processing the signal.
+// This allows for processing of a potentially recoverable GWP-ASan or MTE
+// crash. If the signal requires recovery, then dump a report (via the regular
+// debuggerd hanndler), and patch up the allocator (in the case of GWP-ASan) or
+// disable MTE on the thread, and allow the process to continue (indicated by
+// returning 'true'). If the crash has nothing to do with GWP-ASan/MTE, or
+// recovery isn't possible, return 'false'.
+bool debuggerd_handle_signal(int signal_number, siginfo_t* info, void* context) {
+  if (signal_number != SIGSEGV) return false;
+  if (info->si_code == SEGV_MTEAERR || info->si_code == SEGV_MTESERR) {
+    if (!is_permissive_mte()) return false;
+    // Because permissive MTE disables MTE for the entire thread, we're less
+    // worried about getting a whole bunch of crashes in a row. ActivityManager
+    // doesn't like multiple native crashes for an app in a short period of time
+    // (see the comment about recoverable GWP-ASan in
+    // `debuggerd_handle_gwp_asan_signal`), but that shouldn't happen if MTE is
+    // disabled for the entire thread. This might need to be changed if there's
+    // some low-hanging bug that happens across multiple threads in quick
+    // succession.
+    debuggerd_signal_handler(signal_number, info, context);
+    return true;
+  }
+
+  if (!signal_has_si_addr(info)) return false;
+  return debuggerd_handle_gwp_asan_signal(signal_number, info, context);
+}

diff --git a/debuggerd/include/debuggerd/handler.h b/debuggerd/include/debuggerd/handler.h
index c18cf6e..954f049 100644
--- a/debuggerd/include/debuggerd/handler.h
+++ b/debuggerd/include/debuggerd/handler.h

@@ -47,7 +47,7 @@
   const char* scudo_ring_buffer;
   size_t scudo_ring_buffer_size;
   size_t scudo_stack_depot_size;
-  bool recoverable_gwp_asan_crash;
+  bool recoverable_crash;
   struct crash_detail_page_t* crash_detail_page;
 };
 

diff --git a/debuggerd/protocol.h b/debuggerd/protocol.h
index d3fc15e..9af7377 100644
--- a/debuggerd/protocol.h
+++ b/debuggerd/protocol.h

@@ -100,7 +100,7 @@
   uintptr_t scudo_ring_buffer;
   size_t scudo_ring_buffer_size;
   size_t scudo_stack_depot_size;
-  bool recoverable_gwp_asan_crash;
+  bool recoverable_crash;
   uintptr_t crash_detail_page;
 };
commit	31407917f8e6c36d8f930a3136fe8a3dd00e4fc9	[log] [tgz]
author	Mitch Phillips <mitchp@google.com>	Thu Mar 14 14:18:47 2024 +0100
committer	Mitch Phillips <mitchp@google.com>	Tue Mar 19 11:26:02 2024 +0100
tree	a731c026ca8a0c6efb6889a09e684b434f4fb5a9
parent	851c857420fb6d999460d59784ee7d24061729a8 [diff]