Abort correct thread on TimeCheck timeout

The current TimeCheck implementation leads to incorrect
clustering/duping based on the abort stack trace being generated from
TimeCheck (running in the watchdog thread).

Appropriately signal the timed-out thread, so we receive more accurate
stack traces.

Test: atest timecheck_tests, manual verification of correct thread stack
trace.
Bug: 239252933
Bug: 227594853

Change-Id: I5d62581682ae85af269e7e10125343f096e26303
diff --git a/media/utils/TimeCheck.cpp b/media/utils/TimeCheck.cpp
index efa53f8..0cf5bd9 100644
--- a/media/utils/TimeCheck.cpp
+++ b/media/utils/TimeCheck.cpp
@@ -14,11 +14,14 @@
  * limitations under the License.
  */
 
+#include <csignal>
+#include "mediautils/TimerThread.h"
 #define LOG_TAG "TimeCheck"
 
 #include <optional>
 
 #include <android-base/logging.h>
+#include <android-base/strings.h>
 #include <audio_utils/clock.h>
 #include <mediautils/EventLog.h>
 #include <mediautils/FixedString.h>
@@ -27,20 +30,21 @@
 #include <mediautils/TidWrapper.h>
 #include <utils/Log.h>
 
-#if defined(__BIONIC__)
+#if defined(__ANDROID__)
 #include "debuggerd/handler.h"
 #endif
 
+
+namespace android::mediautils {
 // This function appropriately signals a pid to dump a backtrace if we are
-// running on device.
+// running on device (and the HAL exists). If we are not running on an Android
+// device, there is no HAL to signal (so we do nothing).
 static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
-#if defined(__BIONIC__)
+#if defined(__ANDROID__)
     sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
 #endif
 }
 
-namespace android::mediautils {
-
 /**
  * Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
  */
@@ -148,7 +152,7 @@
 std::string TimeCheck::toString() {
     // note pending and retired are individually locked for maximum concurrency,
     // snapshot is not instantaneous at a single time.
-    return getTimeCheckThread().toString();
+    return getTimeCheckThread().getSnapshotAnalysis().toString();
 }
 
 TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
@@ -253,7 +257,7 @@
 
     // Generate the TimerThread summary string early before sending signals to the
     // HAL processes which can affect thread behavior.
-    const std::string summary = getTimeCheckThread().toString(4 /* retiredCount */);
+    const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);
 
     // Generate audio HAL processes tombstones and allow time to complete
     // before forcing restart
@@ -281,7 +285,7 @@
             .append(analyzeTimeouts(requestedTimeoutMs + secondChanceMs,
                     elapsedSteadyMs, elapsedSystemMs)).append("\n")
             .append(halPids).append("\n")
-            .append(summary);
+            .append(snapshotAnalysis.toString());
 
     // Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
     // Log message text may be truncated to less than an
@@ -291,7 +295,20 @@
     // to avoid the size limitation. LOG(FATAL) does an abort whereas
     // LOG(FATAL_WITHOUT_ABORT) does not abort.
 
-    LOG(FATAL) << abortMessage;
+    static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
+    pid_t tidToAbort = invalidPid;
+    if (snapshotAnalysis.suspectTid != invalidPid) {
+        tidToAbort = snapshotAnalysis.suspectTid;
+    } else if (snapshotAnalysis.timeoutTid != invalidPid) {
+        tidToAbort = snapshotAnalysis.timeoutTid;
+    }
+
+    LOG(FATAL_WITHOUT_ABORT) << abortMessage;
+    const auto ret = abortTid(tidToAbort);
+    if (ret < 0) {
+        LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
+                       "errno: " << errno << base::ErrnoNumberAsString(errno);
+    }
 }
 
 // Automatically create a TimeCheck class for a class and method.
diff --git a/media/utils/TimerThread.cpp b/media/utils/TimerThread.cpp
index 51bd5eb..b760ee2 100644
--- a/media/utils/TimerThread.cpp
+++ b/media/utils/TimerThread.cpp
@@ -59,39 +59,29 @@
     return true;
 }
 
-std::string TimerThread::toString(size_t retiredCount) const {
+
+std::string TimerThread::SnapshotAnalysis::toString() const {
     // Note: These request queues are snapshot very close together but
     // not at "identical" times as we don't use a class-wide lock.
-
-    std::vector<std::shared_ptr<const Request>> timeoutRequests;
-    std::vector<std::shared_ptr<const Request>> retiredRequests;
-    mTimeoutQueue.copyRequests(timeoutRequests);
-    mRetiredQueue.copyRequests(retiredRequests, retiredCount);
-    std::vector<std::shared_ptr<const Request>> pendingRequests =
-        getPendingRequests();
-
-    struct Analysis analysis = analyzeTimeout(timeoutRequests, pendingRequests);
-    std::string analysisSummary;
-    if (!analysis.summary.empty()) {
-        analysisSummary = std::string("\nanalysis [ ").append(analysis.summary).append(" ]");
-    }
+    std::string analysisSummary = std::string("\nanalysis [ ").append(description).append(" ]");
     std::string timeoutStack;
-    if (analysis.timeoutTid != -1) {
-        timeoutStack = std::string("\ntimeout(")
-                .append(std::to_string(analysis.timeoutTid)).append(") callstack [\n")
-                .append(getCallStackStringForTid(analysis.timeoutTid)).append("]");
-    }
     std::string blockedStack;
-    if (analysis.HALBlockedTid != -1) {
+    if (timeoutTid != -1) {
+        timeoutStack = std::string(suspectTid == timeoutTid ? "\ntimeout/blocked(" : "\ntimeout(")
+                .append(std::to_string(timeoutTid)).append(") callstack [\n")
+                .append(getCallStackStringForTid(timeoutTid)).append("]");
+    }
+
+    if (suspectTid != -1 && suspectTid != timeoutTid) {
         blockedStack = std::string("\nblocked(")
-                .append(std::to_string(analysis.HALBlockedTid)).append(")  callstack [\n")
-                .append(getCallStackStringForTid(analysis.HALBlockedTid)).append("]");
+                .append(std::to_string(suspectTid)).append(")  callstack [\n")
+                .append(getCallStackStringForTid(suspectTid)).append("]");
     }
 
     return std::string("now ")
             .append(formatTime(std::chrono::system_clock::now()))
             .append("\nsecondChanceCount ")
-            .append(std::to_string(mMonitorThread.getSecondChanceCount()))
+            .append(std::to_string(secondChanceCount))
             .append(analysisSummary)
             .append("\ntimeout [ ")
             .append(requestsToString(timeoutRequests))
@@ -121,16 +111,23 @@
     return separatorPos != std::string::npos;
 }
 
-/* static */
-struct TimerThread::Analysis TimerThread::analyzeTimeout(
-    const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
-    const std::vector<std::shared_ptr<const Request>>& pendingRequests) {
-
-    if (timeoutRequests.empty() || pendingRequests.empty()) return {}; // nothing to say.
-
+struct TimerThread::SnapshotAnalysis TimerThread::getSnapshotAnalysis(size_t retiredCount) const {
+    struct SnapshotAnalysis analysis{};
+    // The following snapshot of the TimerThread state will be utilized for
+    // analysis. Note, there is no lock around these calls, so there could be
+    // a state update between them.
+    mTimeoutQueue.copyRequests(analysis.timeoutRequests);
+    mRetiredQueue.copyRequests(analysis.retiredRequests, retiredCount);
+    analysis.pendingRequests = getPendingRequests();
+    analysis.secondChanceCount = mMonitorThread.getSecondChanceCount();
+    // No call has timed out, so there is no analysis to be done.
+    if (analysis.timeoutRequests.empty())
+        return analysis;
     // for now look at last timeout (in our case, the only timeout)
-    const std::shared_ptr<const Request> timeout = timeoutRequests.back();
-
+    const std::shared_ptr<const Request> timeout = analysis.timeoutRequests.back();
+    analysis.timeoutTid = timeout->tid;
+    if (analysis.pendingRequests.empty())
+      return analysis;
     // pending Requests that are problematic.
     std::vector<std::shared_ptr<const Request>> pendingExact;
     std::vector<std::shared_ptr<const Request>> pendingPossible;
@@ -141,7 +138,7 @@
     // such as HAL write() and read().
     //
     constexpr Duration kPendingDuration = 1000ms;
-    for (const auto& pending : pendingRequests) {
+    for (const auto& pending : analysis.pendingRequests) {
         // If the pending tid is the same as timeout tid, problem identified.
         if (pending->tid == timeout->tid) {
             pendingExact.emplace_back(pending);
@@ -154,29 +151,27 @@
         }
     }
 
-    struct Analysis analysis{};
-
-    analysis.timeoutTid = timeout->tid;
-    std::string& summary = analysis.summary;
+    std::string& description = analysis.description;
     if (!pendingExact.empty()) {
         const auto& request = pendingExact.front();
         const bool hal = isRequestFromHal(request);
 
         if (hal) {
-            summary = std::string("Blocked directly due to HAL call: ")
+            description = std::string("Blocked directly due to HAL call: ")
                 .append(request->toString());
+            analysis.suspectTid= request->tid;
         }
     }
-    if (summary.empty() && !pendingPossible.empty()) {
+    if (description.empty() && !pendingPossible.empty()) {
         for (const auto& request : pendingPossible) {
             const bool hal = isRequestFromHal(request);
             if (hal) {
                 // The first blocked call is the most likely one.
                 // Recent calls might be temporarily blocked
                 // calls such as write() or read() depending on kDuration.
-                summary = std::string("Blocked possibly due to HAL call: ")
+                description = std::string("Blocked possibly due to HAL call: ")
                     .append(request->toString());
-                analysis.HALBlockedTid = request->tid;
+                analysis.suspectTid= request->tid;
             }
        }
     }
diff --git a/media/utils/include/mediautils/TidWrapper.h b/media/utils/include/mediautils/TidWrapper.h
index add2fa5..aeefa01 100644
--- a/media/utils/include/mediautils/TidWrapper.h
+++ b/media/utils/include/mediautils/TidWrapper.h
@@ -16,8 +16,11 @@
 
 #pragma once
 
+#if defined(__linux__)
+#include <signal.h>
 #include <sys/syscall.h>
 #include <unistd.h>
+#endif
 
 namespace android::mediautils {
 
@@ -31,4 +34,20 @@
 #endif
 }
 
+// Send an abort signal to a (linux) thread id.
+inline int abortTid(int tid) {
+#if defined(__linux__)
+    const pid_t pid = getpid();
+    siginfo_t siginfo = {
+        .si_code = SI_QUEUE,
+        .si_pid = pid,
+        .si_uid = getuid(),
+    };
+    return syscall(SYS_rt_tgsigqueueinfo, pid, tid, SIGABRT, &siginfo);
+#else
+  errno = ENODEV;
+  return -1;
+#endif
+}
+
 }
diff --git a/media/utils/include/mediautils/TimeCheck.h b/media/utils/include/mediautils/TimeCheck.h
index bdb5337..f9ea50c 100644
--- a/media/utils/include/mediautils/TimeCheck.h
+++ b/media/utils/include/mediautils/TimeCheck.h
@@ -123,7 +123,6 @@
         const Duration secondChanceDuration;
         const std::chrono::system_clock::time_point startSystemTime;
         const pid_t tid;
-
         void onCancel(TimerThread::Handle handle) const;
         void onTimeout(TimerThread::Handle handle) const;
     };
diff --git a/media/utils/include/mediautils/TimerThread.h b/media/utils/include/mediautils/TimerThread.h
index c76fa7d..d5be177 100644
--- a/media/utils/include/mediautils/TimerThread.h
+++ b/media/utils/include/mediautils/TimerThread.h
@@ -21,9 +21,11 @@
 #include <deque>
 #include <functional>
 #include <map>
+#include <memory>
 #include <mutex>
 #include <string>
 #include <thread>
+#include <vector>
 
 #include <android-base/thread_annotations.h>
 
@@ -151,7 +153,15 @@
      */
     bool cancelTask(Handle handle);
 
-    std::string toString(size_t retiredCount = SIZE_MAX) const;
+    struct SnapshotAnalysis;
+    /**
+     * Take a snapshot of the current state of the TimerThread and determine the
+     * potential cause of a deadlock.
+     * \param retiredCount The number of successfully retired calls to capture
+     *                      (may be many).
+     * \return See below for a description of a SnapShotAnalysis object
+     */
+    SnapshotAnalysis getSnapshotAnalysis(size_t retiredCount = SIZE_MAX) const;
 
     /**
      * Returns a string representation of the TimerThread queue.
@@ -202,7 +212,6 @@
         return s;
     }
 
-  private:
     // To minimize movement of data, we pass around shared_ptrs to Requests.
     // These are allocated and deallocated outside of the lock.
     // TODO(b/243839867) consider options to merge Request with the
@@ -232,6 +241,40 @@
         std::string toString() const;
     };
 
+
+    // SnapshotAnalysis contains info deduced by analysisTimeout().
+
+    struct SnapshotAnalysis {
+        // If we were unable to determine any applicable thread ids,
+        // we leave their value as INVALID_PID.
+        // Note, we use the linux thread id (not pthread), so its type is pid_t.
+        static constexpr pid_t INVALID_PID = -1;
+        // Description of likely issue and/or blocked method.
+        // Empty if no actionable info.
+        std::string description;
+        // Tid of the (latest) monitored thread which has timed out.
+        // This is the thread which the suspect is deduced with respect to.
+        // Most often, this is the thread which an abort is being triggered
+        // from.
+        pid_t timeoutTid = INVALID_PID;
+        // Tid of the (HAL) thread which has likely halted progress, selected
+        // from pendingRequests. May be the same as timeoutTid, if the timed-out
+        // thread directly called into the HAL.
+        pid_t suspectTid = INVALID_PID;
+        // Number of second chances given by the timer thread
+        size_t secondChanceCount;
+        // List of pending requests
+        std::vector<std::shared_ptr<const Request>> pendingRequests;
+        // List of timed-out requests
+        std::vector<std::shared_ptr<const Request>> timeoutRequests;
+        // List of retired requests
+        std::vector<std::shared_ptr<const Request>> retiredRequests;
+        // Dumps the information contained above as well as additional call
+        // stacks where applicable.
+        std::string toString() const;
+    };
+
+  private:
     // Deque of requests, in order of add().
     // This class is thread-safe.
     class RequestQueue {
@@ -326,36 +369,11 @@
         }
     };
 
-    // Analysis contains info deduced by analysisTimeout().
-    //
-    // Summary is the result string from checking timeoutRequests to see if
-    // any might be caused by blocked calls in pendingRequests.
-    //
-    // Summary string is empty if there is no automatic actionable info.
-    //
-    // timeoutTid is the tid selected from timeoutRequests (if any).
-    //
-    // HALBlockedTid is the tid that is blocked from pendingRequests believed
-    // to cause the timeout.
-    // HALBlockedTid may be INVALID_PID if no suspected tid is found,
-    // and if HALBlockedTid is valid, it will not be the same as timeoutTid.
-    //
-    static constexpr pid_t INVALID_PID = -1;
-    struct Analysis {
-        std::string summary;
-        pid_t timeoutTid = INVALID_PID;
-        pid_t HALBlockedTid = INVALID_PID;
-    };
 
     // A HAL method is where the substring "Hidl" is in the class name.
     // The tag should look like: ... Hidl ... :: ...
     static bool isRequestFromHal(const std::shared_ptr<const Request>& request);
 
-    // Returns analysis from the requests.
-    static Analysis analyzeTimeout(
-        const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
-        const std::vector<std::shared_ptr<const Request>>& pendingRequests);
-
     std::vector<std::shared_ptr<const Request>> getPendingRequests() const;
 
     static constexpr size_t kRetiredQueueMax = 16;
diff --git a/media/utils/tests/timecheck_tests.cpp b/media/utils/tests/timecheck_tests.cpp
index 8236174..bd91efa 100644
--- a/media/utils/tests/timecheck_tests.cpp
+++ b/media/utils/tests/timecheck_tests.cpp
@@ -26,7 +26,6 @@
 using namespace std::chrono_literals;
 
 namespace {
-
 TEST(timecheck_tests, success) {
     bool timeoutRegistered = false;
     float elapsedMsRegistered = 0.f;
@@ -69,4 +68,33 @@
 // Note: We do not test TimeCheck crash because TimeCheck is multithreaded and the
 // EXPECT_EXIT() signal catching is imperfect due to the gtest fork.
 
+// Note, the following test is to manually verify the correct thread is aborted.
+// Due to difficulties with gtest and EXPECT_EXIT, this is difficult to verify
+// automatically. TODO(b/246446561) Attempt to use EXPECT_EXIT
+
+#if 0
+void threadFunction() {
+    bool timeoutRegistered = false;
+    float elapsedMsRegistered = 0.f;
+    std::atomic_bool event = false;  // seq-cst implies acquire-release
+    {
+        TimeCheck timeCheck("timeout",
+                [&event, &timeoutRegistered, &elapsedMsRegistered]
+                        (bool timeout, float elapsedMs) {
+            timeoutRegistered = timeout;
+            elapsedMsRegistered = elapsedMs;
+            event = true; // store-release, must be last.
+        }, 1ms /* timeoutDuration */, {} /* secondChanceDuration */, true /* crash */);
+        std::this_thread::sleep_for(100ms);
+        ADD_FAILURE();
+    }
+}
+
+TEST(timecheck_tests, death) {
+  std::thread mthread{threadFunction};
+  mthread.join();
+}
+#endif
+
 } // namespace
+