TimerThread: Add timeout analysis logic Automatically dumps TimeCheck analysis if there is an actionable blocked binder call. Test: dedicated "crasher" test, check logcat Test: adb shell dumpsys media.audio_flinger Bug: 219958414 Change-Id: Ie21ef8940404fa7174938277bbf9c7aab2817ef7

commit: f45f34cddf847d443afca518916ea309b448ffa1 [log] [tgz]
author: Andy Hung <hunga@google.com> Fri Mar 25 13:09:03 2022 -0700
committer: Andy Hung <hunga@google.com> Wed Mar 30 11:35:23 2022 -0700
tree: 7cfa10118243f3d285a211193f248ac31c432c3b
parent: 18b6ea1aba2aabcf93e9783734cf48361394b824 [diff]
diff --git a/media/utils/TimeCheck.cpp b/media/utils/TimeCheck.cpp
index d0dc9e1..3e35970 100644
--- a/media/utils/TimeCheck.cpp
+++ b/media/utils/TimeCheck.cpp

@@ -197,6 +197,7 @@
     } else {
         ALOGI("No HAL process pid available, skipping tombstones");
     }
+
     LOG_EVENT_STRING(LOGTAG_AUDIO_BINDER_TIMEOUT, tag.c_str());
     LOG_ALWAYS_FATAL("TimeCheck timeout for %s scheduled %s on thread %d\n%s",
             tag.c_str(), formatTime(startTime).c_str(), tid, summary.c_str());

diff --git a/media/utils/TimerThread.cpp b/media/utils/TimerThread.cpp
index 3bc4549..b82816f 100644
--- a/media/utils/TimerThread.cpp
+++ b/media/utils/TimerThread.cpp

@@ -53,17 +53,112 @@
 }
 
 std::string TimerThread::toString(size_t retiredCount) const {
+    // Note: These request queues are snapshot very close together but
+    // not at "identical" times as we don't use a class-wide lock.
+
+    std::vector<std::shared_ptr<const Request>> timeoutRequests;
+    std::vector<std::shared_ptr<const Request>> retiredRequests;
+    mTimeoutQueue.copyRequests(timeoutRequests);
+    mRetiredQueue.copyRequests(retiredRequests, retiredCount);
+    std::vector<std::shared_ptr<const Request>> pendingRequests =
+        getPendingRequests();
+
+    struct Analysis analysis = analyzeTimeout(timeoutRequests, pendingRequests);
+    std::string analysisSummary;
+    if (!analysis.summary.empty()) {
+        analysisSummary = std::string("\nanalysis [ ").append(analysis.summary).append(" ]");
+    }
     return std::string("now ")
             .append(formatTime(std::chrono::system_clock::now()))
-            .append("\npending [ ")
-            .append(pendingToString())
-            .append(" ]\ntimeout [ ")
-            .append(timeoutToString())
+            .append(analysisSummary)
+            .append("\ntimeout [ ")
+            .append(requestsToString(timeoutRequests))
+            .append(" ]\npending [ ")
+            .append(requestsToString(pendingRequests))
             .append(" ]\nretired [ ")
-            .append(retiredToString(retiredCount))
+            .append(requestsToString(retiredRequests))
             .append(" ]");
 }
 
+// A HAL method is where the substring "Hidl" is in the class name.
+// The tag should look like: ... Hidl ... :: ...
+// When the audio HAL is updated to AIDL perhaps we will use instead
+// a global directory of HAL classes.
+//
+// See MethodStatistics.cpp:
+// mediautils::getStatisticsClassesForModule(METHOD_STATISTICS_MODULE_NAME_AUDIO_HIDL)
+//
+/* static */
+bool TimerThread::isRequestFromHal(const std::shared_ptr<const Request>& request) {
+    const size_t hidlPos = request->tag.find("Hidl");
+    if (hidlPos == std::string::npos) return false;
+    // should be a separator afterwards Hidl which indicates the string was in the class.
+    const size_t separatorPos = request->tag.find("::", hidlPos);
+    return separatorPos != std::string::npos;
+}
+
+/* static */
+struct TimerThread::Analysis TimerThread::analyzeTimeout(
+    const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
+    const std::vector<std::shared_ptr<const Request>>& pendingRequests) {
+
+    if (timeoutRequests.empty() || pendingRequests.empty()) return {}; // nothing to say.
+
+    // for now look at last timeout (in our case, the only timeout)
+    const std::shared_ptr<const Request> timeout = timeoutRequests.back();
+
+    // pending Requests that are problematic.
+    std::vector<std::shared_ptr<const Request>> pendingExact;
+    std::vector<std::shared_ptr<const Request>> pendingPossible;
+
+    // We look at pending requests that were scheduled no later than kDuration
+    // after the timeout request. This prevents false matches with calls
+    // that naturally block for a short period of time
+    // such as HAL write() and read().
+    //
+    auto constexpr kDuration = std::chrono::milliseconds(1000);
+    for (const auto& pending : pendingRequests) {
+        // If the pending tid is the same as timeout tid, problem identified.
+        if (pending->tid == timeout->tid) {
+            pendingExact.emplace_back(pending);
+            continue;
+        }
+
+        // if the pending tid is scheduled within time limit
+        if (pending->scheduled - timeout->scheduled < kDuration) {
+            pendingPossible.emplace_back(pending);
+        }
+    }
+
+    struct Analysis analysis{};
+
+    analysis.timeoutTid = timeout->tid;
+    std::string& summary = analysis.summary;
+    if (!pendingExact.empty()) {
+        const auto& request = pendingExact.front();
+        const bool hal = isRequestFromHal(request);
+
+        if (hal) {
+            summary = std::string("Blocked directly due to HAL call: ")
+                .append(request->toString());
+        }
+    }
+    if (summary.empty() && !pendingPossible.empty()) {
+        for (const auto& request : pendingPossible) {
+            const bool hal = isRequestFromHal(request);
+            if (hal) {
+                // The first blocked call is the most likely one.
+                // Recent calls might be temporarily blocked
+                // calls such as write() or read() depending on kDuration.
+                summary = std::string("Blocked possibly due to HAL call: ")
+                    .append(request->toString());
+                analysis.HALBlockedTid = request->tid;
+            }
+       }
+    }
+    return analysis;
+}
+
 std::vector<std::shared_ptr<const TimerThread::Request>> TimerThread::getPendingRequests() const {
     constexpr size_t kEstimatedPendingRequests = 8;  // approx 128 byte alloc.
     std::vector<std::shared_ptr<const Request>> pendingRequests;

diff --git a/media/utils/include/mediautils/TimerThread.h b/media/utils/include/mediautils/TimerThread.h
index 2dfe499..ffee602 100644
--- a/media/utils/include/mediautils/TimerThread.h
+++ b/media/utils/include/mediautils/TimerThread.h

@@ -212,6 +212,36 @@
         void copyRequests(std::vector<std::shared_ptr<const Request>>& requests) const;
     };
 
+    // Analysis contains info deduced by analysisTimeout().
+    //
+    // Summary is the result string from checking timeoutRequests to see if
+    // any might be caused by blocked calls in pendingRequests.
+    //
+    // Summary string is empty if there is no automatic actionable info.
+    //
+    // timeoutTid is the tid selected from timeoutRequests (if any).
+    //
+    // HALBlockedTid is the tid that is blocked from pendingRequests believed
+    // to cause the timeout.
+    // HALBlockedTid may be INVALID_PID if no suspected tid is found,
+    // and if HALBlockedTid is valid, it will not be the same as timeoutTid.
+    //
+    static constexpr pid_t INVALID_PID = -1;
+    struct Analysis {
+        std::string summary;
+        pid_t timeoutTid = INVALID_PID;
+        pid_t HALBlockedTid = INVALID_PID;
+    };
+
+    // A HAL method is where the substring "Hidl" is in the class name.
+    // The tag should look like: ... Hidl ... :: ...
+    static bool isRequestFromHal(const std::shared_ptr<const Request>& request);
+
+    // Returns analysis from the requests.
+    static Analysis analyzeTimeout(
+        const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
+        const std::vector<std::shared_ptr<const Request>>& pendingRequests);
+
     std::vector<std::shared_ptr<const Request>> getPendingRequests() const;
 
     // A no-timeout request is represented by a handles at the end of steady_clock time,
commit	f45f34cddf847d443afca518916ea309b448ffa1	[log] [tgz]
author	Andy Hung <hunga@google.com>	Fri Mar 25 13:09:03 2022 -0700
committer	Andy Hung <hunga@google.com>	Wed Mar 30 11:35:23 2022 -0700
tree	7cfa10118243f3d285a211193f248ac31c432c3b
parent	18b6ea1aba2aabcf93e9783734cf48361394b824 [diff]