Abort correct thread on TimeCheck timeout
The current TimeCheck implementation leads to incorrect
clustering/duping based on the abort stack trace being generated from
TimeCheck (running in the watchdog thread).
Appropriately signal the timed-out thread, so we receive more accurate
stack traces.
Test: atest timecheck_tests, manual verification of correct thread stack
trace.
Bug: 239252933
Bug: 227594853
Change-Id: I5d62581682ae85af269e7e10125343f096e26303
diff --git a/media/utils/TimeCheck.cpp b/media/utils/TimeCheck.cpp
index efa53f8..0cf5bd9 100644
--- a/media/utils/TimeCheck.cpp
+++ b/media/utils/TimeCheck.cpp
@@ -14,11 +14,14 @@
* limitations under the License.
*/
+#include <csignal>
+#include "mediautils/TimerThread.h"
#define LOG_TAG "TimeCheck"
#include <optional>
#include <android-base/logging.h>
+#include <android-base/strings.h>
#include <audio_utils/clock.h>
#include <mediautils/EventLog.h>
#include <mediautils/FixedString.h>
@@ -27,20 +30,21 @@
#include <mediautils/TidWrapper.h>
#include <utils/Log.h>
-#if defined(__BIONIC__)
+#if defined(__ANDROID__)
#include "debuggerd/handler.h"
#endif
+
+namespace android::mediautils {
// This function appropriately signals a pid to dump a backtrace if we are
-// running on device.
+// running on device (and the HAL exists). If we are not running on an Android
+// device, there is no HAL to signal (so we do nothing).
static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
-#if defined(__BIONIC__)
+#if defined(__ANDROID__)
sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
#endif
}
-namespace android::mediautils {
-
/**
* Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
*/
@@ -148,7 +152,7 @@
std::string TimeCheck::toString() {
// note pending and retired are individually locked for maximum concurrency,
// snapshot is not instantaneous at a single time.
- return getTimeCheckThread().toString();
+ return getTimeCheckThread().getSnapshotAnalysis().toString();
}
TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
@@ -253,7 +257,7 @@
// Generate the TimerThread summary string early before sending signals to the
// HAL processes which can affect thread behavior.
- const std::string summary = getTimeCheckThread().toString(4 /* retiredCount */);
+ const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);
// Generate audio HAL processes tombstones and allow time to complete
// before forcing restart
@@ -281,7 +285,7 @@
.append(analyzeTimeouts(requestedTimeoutMs + secondChanceMs,
elapsedSteadyMs, elapsedSystemMs)).append("\n")
.append(halPids).append("\n")
- .append(summary);
+ .append(snapshotAnalysis.toString());
// Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
// Log message text may be truncated to less than an
@@ -291,7 +295,20 @@
// to avoid the size limitation. LOG(FATAL) does an abort whereas
// LOG(FATAL_WITHOUT_ABORT) does not abort.
- LOG(FATAL) << abortMessage;
+ static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
+ pid_t tidToAbort = invalidPid;
+ if (snapshotAnalysis.suspectTid != invalidPid) {
+ tidToAbort = snapshotAnalysis.suspectTid;
+ } else if (snapshotAnalysis.timeoutTid != invalidPid) {
+ tidToAbort = snapshotAnalysis.timeoutTid;
+ }
+
+ LOG(FATAL_WITHOUT_ABORT) << abortMessage;
+ const auto ret = abortTid(tidToAbort);
+ if (ret < 0) {
+ LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
+ "errno: " << errno << base::ErrnoNumberAsString(errno);
+ }
}
// Automatically create a TimeCheck class for a class and method.
diff --git a/media/utils/TimerThread.cpp b/media/utils/TimerThread.cpp
index 51bd5eb..b760ee2 100644
--- a/media/utils/TimerThread.cpp
+++ b/media/utils/TimerThread.cpp
@@ -59,39 +59,29 @@
return true;
}
-std::string TimerThread::toString(size_t retiredCount) const {
+
+std::string TimerThread::SnapshotAnalysis::toString() const {
// Note: These request queues are snapshot very close together but
// not at "identical" times as we don't use a class-wide lock.
-
- std::vector<std::shared_ptr<const Request>> timeoutRequests;
- std::vector<std::shared_ptr<const Request>> retiredRequests;
- mTimeoutQueue.copyRequests(timeoutRequests);
- mRetiredQueue.copyRequests(retiredRequests, retiredCount);
- std::vector<std::shared_ptr<const Request>> pendingRequests =
- getPendingRequests();
-
- struct Analysis analysis = analyzeTimeout(timeoutRequests, pendingRequests);
- std::string analysisSummary;
- if (!analysis.summary.empty()) {
- analysisSummary = std::string("\nanalysis [ ").append(analysis.summary).append(" ]");
- }
+ std::string analysisSummary = std::string("\nanalysis [ ").append(description).append(" ]");
std::string timeoutStack;
- if (analysis.timeoutTid != -1) {
- timeoutStack = std::string("\ntimeout(")
- .append(std::to_string(analysis.timeoutTid)).append(") callstack [\n")
- .append(getCallStackStringForTid(analysis.timeoutTid)).append("]");
- }
std::string blockedStack;
- if (analysis.HALBlockedTid != -1) {
+ if (timeoutTid != -1) {
+ timeoutStack = std::string(suspectTid == timeoutTid ? "\ntimeout/blocked(" : "\ntimeout(")
+ .append(std::to_string(timeoutTid)).append(") callstack [\n")
+ .append(getCallStackStringForTid(timeoutTid)).append("]");
+ }
+
+ if (suspectTid != -1 && suspectTid != timeoutTid) {
blockedStack = std::string("\nblocked(")
- .append(std::to_string(analysis.HALBlockedTid)).append(") callstack [\n")
- .append(getCallStackStringForTid(analysis.HALBlockedTid)).append("]");
+ .append(std::to_string(suspectTid)).append(") callstack [\n")
+ .append(getCallStackStringForTid(suspectTid)).append("]");
}
return std::string("now ")
.append(formatTime(std::chrono::system_clock::now()))
.append("\nsecondChanceCount ")
- .append(std::to_string(mMonitorThread.getSecondChanceCount()))
+ .append(std::to_string(secondChanceCount))
.append(analysisSummary)
.append("\ntimeout [ ")
.append(requestsToString(timeoutRequests))
@@ -121,16 +111,23 @@
return separatorPos != std::string::npos;
}
-/* static */
-struct TimerThread::Analysis TimerThread::analyzeTimeout(
- const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
- const std::vector<std::shared_ptr<const Request>>& pendingRequests) {
-
- if (timeoutRequests.empty() || pendingRequests.empty()) return {}; // nothing to say.
-
+struct TimerThread::SnapshotAnalysis TimerThread::getSnapshotAnalysis(size_t retiredCount) const {
+ struct SnapshotAnalysis analysis{};
+ // The following snapshot of the TimerThread state will be utilized for
+ // analysis. Note, there is no lock around these calls, so there could be
+ // a state update between them.
+ mTimeoutQueue.copyRequests(analysis.timeoutRequests);
+ mRetiredQueue.copyRequests(analysis.retiredRequests, retiredCount);
+ analysis.pendingRequests = getPendingRequests();
+ analysis.secondChanceCount = mMonitorThread.getSecondChanceCount();
+ // No call has timed out, so there is no analysis to be done.
+ if (analysis.timeoutRequests.empty())
+ return analysis;
// for now look at last timeout (in our case, the only timeout)
- const std::shared_ptr<const Request> timeout = timeoutRequests.back();
-
+ const std::shared_ptr<const Request> timeout = analysis.timeoutRequests.back();
+ analysis.timeoutTid = timeout->tid;
+ if (analysis.pendingRequests.empty())
+ return analysis;
// pending Requests that are problematic.
std::vector<std::shared_ptr<const Request>> pendingExact;
std::vector<std::shared_ptr<const Request>> pendingPossible;
@@ -141,7 +138,7 @@
// such as HAL write() and read().
//
constexpr Duration kPendingDuration = 1000ms;
- for (const auto& pending : pendingRequests) {
+ for (const auto& pending : analysis.pendingRequests) {
// If the pending tid is the same as timeout tid, problem identified.
if (pending->tid == timeout->tid) {
pendingExact.emplace_back(pending);
@@ -154,29 +151,27 @@
}
}
- struct Analysis analysis{};
-
- analysis.timeoutTid = timeout->tid;
- std::string& summary = analysis.summary;
+ std::string& description = analysis.description;
if (!pendingExact.empty()) {
const auto& request = pendingExact.front();
const bool hal = isRequestFromHal(request);
if (hal) {
- summary = std::string("Blocked directly due to HAL call: ")
+ description = std::string("Blocked directly due to HAL call: ")
.append(request->toString());
+ analysis.suspectTid= request->tid;
}
}
- if (summary.empty() && !pendingPossible.empty()) {
+ if (description.empty() && !pendingPossible.empty()) {
for (const auto& request : pendingPossible) {
const bool hal = isRequestFromHal(request);
if (hal) {
// The first blocked call is the most likely one.
// Recent calls might be temporarily blocked
// calls such as write() or read() depending on kDuration.
- summary = std::string("Blocked possibly due to HAL call: ")
+ description = std::string("Blocked possibly due to HAL call: ")
.append(request->toString());
- analysis.HALBlockedTid = request->tid;
+ analysis.suspectTid= request->tid;
}
}
}
diff --git a/media/utils/include/mediautils/TidWrapper.h b/media/utils/include/mediautils/TidWrapper.h
index add2fa5..aeefa01 100644
--- a/media/utils/include/mediautils/TidWrapper.h
+++ b/media/utils/include/mediautils/TidWrapper.h
@@ -16,8 +16,11 @@
#pragma once
+#if defined(__linux__)
+#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
+#endif
namespace android::mediautils {
@@ -31,4 +34,20 @@
#endif
}
+// Send an abort signal to a (linux) thread id.
+inline int abortTid(int tid) {
+#if defined(__linux__)
+ const pid_t pid = getpid();
+ siginfo_t siginfo = {
+ .si_code = SI_QUEUE,
+ .si_pid = pid,
+ .si_uid = getuid(),
+ };
+ return syscall(SYS_rt_tgsigqueueinfo, pid, tid, SIGABRT, &siginfo);
+#else
+ errno = ENODEV;
+ return -1;
+#endif
+}
+
}
diff --git a/media/utils/include/mediautils/TimeCheck.h b/media/utils/include/mediautils/TimeCheck.h
index bdb5337..f9ea50c 100644
--- a/media/utils/include/mediautils/TimeCheck.h
+++ b/media/utils/include/mediautils/TimeCheck.h
@@ -123,7 +123,6 @@
const Duration secondChanceDuration;
const std::chrono::system_clock::time_point startSystemTime;
const pid_t tid;
-
void onCancel(TimerThread::Handle handle) const;
void onTimeout(TimerThread::Handle handle) const;
};
diff --git a/media/utils/include/mediautils/TimerThread.h b/media/utils/include/mediautils/TimerThread.h
index c76fa7d..d5be177 100644
--- a/media/utils/include/mediautils/TimerThread.h
+++ b/media/utils/include/mediautils/TimerThread.h
@@ -21,9 +21,11 @@
#include <deque>
#include <functional>
#include <map>
+#include <memory>
#include <mutex>
#include <string>
#include <thread>
+#include <vector>
#include <android-base/thread_annotations.h>
@@ -151,7 +153,15 @@
*/
bool cancelTask(Handle handle);
- std::string toString(size_t retiredCount = SIZE_MAX) const;
+ struct SnapshotAnalysis;
+ /**
+ * Take a snapshot of the current state of the TimerThread and determine the
+ * potential cause of a deadlock.
+ * \param retiredCount The number of successfully retired calls to capture
+ * (may be many).
+ * \return See below for a description of a SnapShotAnalysis object
+ */
+ SnapshotAnalysis getSnapshotAnalysis(size_t retiredCount = SIZE_MAX) const;
/**
* Returns a string representation of the TimerThread queue.
@@ -202,7 +212,6 @@
return s;
}
- private:
// To minimize movement of data, we pass around shared_ptrs to Requests.
// These are allocated and deallocated outside of the lock.
// TODO(b/243839867) consider options to merge Request with the
@@ -232,6 +241,40 @@
std::string toString() const;
};
+
+ // SnapshotAnalysis contains info deduced by analysisTimeout().
+
+ struct SnapshotAnalysis {
+ // If we were unable to determine any applicable thread ids,
+ // we leave their value as INVALID_PID.
+ // Note, we use the linux thread id (not pthread), so its type is pid_t.
+ static constexpr pid_t INVALID_PID = -1;
+ // Description of likely issue and/or blocked method.
+ // Empty if no actionable info.
+ std::string description;
+ // Tid of the (latest) monitored thread which has timed out.
+ // This is the thread which the suspect is deduced with respect to.
+ // Most often, this is the thread which an abort is being triggered
+ // from.
+ pid_t timeoutTid = INVALID_PID;
+ // Tid of the (HAL) thread which has likely halted progress, selected
+ // from pendingRequests. May be the same as timeoutTid, if the timed-out
+ // thread directly called into the HAL.
+ pid_t suspectTid = INVALID_PID;
+ // Number of second chances given by the timer thread
+ size_t secondChanceCount;
+ // List of pending requests
+ std::vector<std::shared_ptr<const Request>> pendingRequests;
+ // List of timed-out requests
+ std::vector<std::shared_ptr<const Request>> timeoutRequests;
+ // List of retired requests
+ std::vector<std::shared_ptr<const Request>> retiredRequests;
+ // Dumps the information contained above as well as additional call
+ // stacks where applicable.
+ std::string toString() const;
+ };
+
+ private:
// Deque of requests, in order of add().
// This class is thread-safe.
class RequestQueue {
@@ -326,36 +369,11 @@
}
};
- // Analysis contains info deduced by analysisTimeout().
- //
- // Summary is the result string from checking timeoutRequests to see if
- // any might be caused by blocked calls in pendingRequests.
- //
- // Summary string is empty if there is no automatic actionable info.
- //
- // timeoutTid is the tid selected from timeoutRequests (if any).
- //
- // HALBlockedTid is the tid that is blocked from pendingRequests believed
- // to cause the timeout.
- // HALBlockedTid may be INVALID_PID if no suspected tid is found,
- // and if HALBlockedTid is valid, it will not be the same as timeoutTid.
- //
- static constexpr pid_t INVALID_PID = -1;
- struct Analysis {
- std::string summary;
- pid_t timeoutTid = INVALID_PID;
- pid_t HALBlockedTid = INVALID_PID;
- };
// A HAL method is where the substring "Hidl" is in the class name.
// The tag should look like: ... Hidl ... :: ...
static bool isRequestFromHal(const std::shared_ptr<const Request>& request);
- // Returns analysis from the requests.
- static Analysis analyzeTimeout(
- const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
- const std::vector<std::shared_ptr<const Request>>& pendingRequests);
-
std::vector<std::shared_ptr<const Request>> getPendingRequests() const;
static constexpr size_t kRetiredQueueMax = 16;
diff --git a/media/utils/tests/timecheck_tests.cpp b/media/utils/tests/timecheck_tests.cpp
index 8236174..bd91efa 100644
--- a/media/utils/tests/timecheck_tests.cpp
+++ b/media/utils/tests/timecheck_tests.cpp
@@ -26,7 +26,6 @@
using namespace std::chrono_literals;
namespace {
-
TEST(timecheck_tests, success) {
bool timeoutRegistered = false;
float elapsedMsRegistered = 0.f;
@@ -69,4 +68,33 @@
// Note: We do not test TimeCheck crash because TimeCheck is multithreaded and the
// EXPECT_EXIT() signal catching is imperfect due to the gtest fork.
+// Note, the following test is to manually verify the correct thread is aborted.
+// Due to difficulties with gtest and EXPECT_EXIT, this is difficult to verify
+// automatically. TODO(b/246446561) Attempt to use EXPECT_EXIT
+
+#if 0
+void threadFunction() {
+ bool timeoutRegistered = false;
+ float elapsedMsRegistered = 0.f;
+ std::atomic_bool event = false; // seq-cst implies acquire-release
+ {
+ TimeCheck timeCheck("timeout",
+ [&event, &timeoutRegistered, &elapsedMsRegistered]
+ (bool timeout, float elapsedMs) {
+ timeoutRegistered = timeout;
+ elapsedMsRegistered = elapsedMs;
+ event = true; // store-release, must be last.
+ }, 1ms /* timeoutDuration */, {} /* secondChanceDuration */, true /* crash */);
+ std::this_thread::sleep_for(100ms);
+ ADD_FAILURE();
+ }
+}
+
+TEST(timecheck_tests, death) {
+ std::thread mthread{threadFunction};
+ mthread.join();
+}
+#endif
+
} // namespace
+