Abort correct thread on TimeCheck timeout
The current TimeCheck implementation leads to incorrect
clustering/duping based on the abort stack trace being generated from
TimeCheck (running in the watchdog thread).
Appropriately signal the timed-out thread, so we receive more accurate
stack traces.
Test: atest timecheck_tests, manual verification of correct thread stack
trace.
Bug: 239252933
Bug: 227594853
Change-Id: I5d62581682ae85af269e7e10125343f096e26303
diff --git a/media/utils/include/mediautils/TidWrapper.h b/media/utils/include/mediautils/TidWrapper.h
index add2fa5..aeefa01 100644
--- a/media/utils/include/mediautils/TidWrapper.h
+++ b/media/utils/include/mediautils/TidWrapper.h
@@ -16,8 +16,11 @@
#pragma once
+#if defined(__linux__)
+#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
+#endif
namespace android::mediautils {
@@ -31,4 +34,20 @@
#endif
}
+// Send an abort signal to a (linux) thread id.
+inline int abortTid(int tid) {
+#if defined(__linux__)
+ const pid_t pid = getpid();
+ siginfo_t siginfo = {
+ .si_code = SI_QUEUE,
+ .si_pid = pid,
+ .si_uid = getuid(),
+ };
+ return syscall(SYS_rt_tgsigqueueinfo, pid, tid, SIGABRT, &siginfo);
+#else
+ errno = ENODEV;
+ return -1;
+#endif
+}
+
}
diff --git a/media/utils/include/mediautils/TimeCheck.h b/media/utils/include/mediautils/TimeCheck.h
index bdb5337..f9ea50c 100644
--- a/media/utils/include/mediautils/TimeCheck.h
+++ b/media/utils/include/mediautils/TimeCheck.h
@@ -123,7 +123,6 @@
const Duration secondChanceDuration;
const std::chrono::system_clock::time_point startSystemTime;
const pid_t tid;
-
void onCancel(TimerThread::Handle handle) const;
void onTimeout(TimerThread::Handle handle) const;
};
diff --git a/media/utils/include/mediautils/TimerThread.h b/media/utils/include/mediautils/TimerThread.h
index c76fa7d..d5be177 100644
--- a/media/utils/include/mediautils/TimerThread.h
+++ b/media/utils/include/mediautils/TimerThread.h
@@ -21,9 +21,11 @@
#include <deque>
#include <functional>
#include <map>
+#include <memory>
#include <mutex>
#include <string>
#include <thread>
+#include <vector>
#include <android-base/thread_annotations.h>
@@ -151,7 +153,15 @@
*/
bool cancelTask(Handle handle);
- std::string toString(size_t retiredCount = SIZE_MAX) const;
+ struct SnapshotAnalysis;
+ /**
+ * Take a snapshot of the current state of the TimerThread and determine the
+ * potential cause of a deadlock.
+ * \param retiredCount The number of successfully retired calls to capture
+ * (may be many).
+ * \return See below for a description of a SnapShotAnalysis object
+ */
+ SnapshotAnalysis getSnapshotAnalysis(size_t retiredCount = SIZE_MAX) const;
/**
* Returns a string representation of the TimerThread queue.
@@ -202,7 +212,6 @@
return s;
}
- private:
// To minimize movement of data, we pass around shared_ptrs to Requests.
// These are allocated and deallocated outside of the lock.
// TODO(b/243839867) consider options to merge Request with the
@@ -232,6 +241,40 @@
std::string toString() const;
};
+
+ // SnapshotAnalysis contains info deduced by analysisTimeout().
+
+ struct SnapshotAnalysis {
+ // If we were unable to determine any applicable thread ids,
+ // we leave their value as INVALID_PID.
+ // Note, we use the linux thread id (not pthread), so its type is pid_t.
+ static constexpr pid_t INVALID_PID = -1;
+ // Description of likely issue and/or blocked method.
+ // Empty if no actionable info.
+ std::string description;
+ // Tid of the (latest) monitored thread which has timed out.
+ // This is the thread which the suspect is deduced with respect to.
+ // Most often, this is the thread which an abort is being triggered
+ // from.
+ pid_t timeoutTid = INVALID_PID;
+ // Tid of the (HAL) thread which has likely halted progress, selected
+ // from pendingRequests. May be the same as timeoutTid, if the timed-out
+ // thread directly called into the HAL.
+ pid_t suspectTid = INVALID_PID;
+ // Number of second chances given by the timer thread
+ size_t secondChanceCount;
+ // List of pending requests
+ std::vector<std::shared_ptr<const Request>> pendingRequests;
+ // List of timed-out requests
+ std::vector<std::shared_ptr<const Request>> timeoutRequests;
+ // List of retired requests
+ std::vector<std::shared_ptr<const Request>> retiredRequests;
+ // Dumps the information contained above as well as additional call
+ // stacks where applicable.
+ std::string toString() const;
+ };
+
+ private:
// Deque of requests, in order of add().
// This class is thread-safe.
class RequestQueue {
@@ -326,36 +369,11 @@
}
};
- // Analysis contains info deduced by analysisTimeout().
- //
- // Summary is the result string from checking timeoutRequests to see if
- // any might be caused by blocked calls in pendingRequests.
- //
- // Summary string is empty if there is no automatic actionable info.
- //
- // timeoutTid is the tid selected from timeoutRequests (if any).
- //
- // HALBlockedTid is the tid that is blocked from pendingRequests believed
- // to cause the timeout.
- // HALBlockedTid may be INVALID_PID if no suspected tid is found,
- // and if HALBlockedTid is valid, it will not be the same as timeoutTid.
- //
- static constexpr pid_t INVALID_PID = -1;
- struct Analysis {
- std::string summary;
- pid_t timeoutTid = INVALID_PID;
- pid_t HALBlockedTid = INVALID_PID;
- };
// A HAL method is where the substring "Hidl" is in the class name.
// The tag should look like: ... Hidl ... :: ...
static bool isRequestFromHal(const std::shared_ptr<const Request>& request);
- // Returns analysis from the requests.
- static Analysis analyzeTimeout(
- const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
- const std::vector<std::shared_ptr<const Request>>& pendingRequests);
-
std::vector<std::shared_ptr<const Request>> getPendingRequests() const;
static constexpr size_t kRetiredQueueMax = 16;