blob: 658191e67234c56155e90fe69ba4ad5370d57873 [file] [log] [blame]
Eric Laurent3528c932018-02-23 17:17:22 -08001/*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
Atneya Naircce14202022-09-05 20:17:50 -070017#include <csignal>
18#include "mediautils/TimerThread.h"
Eric Laurent42896a02019-09-27 15:40:33 -070019#define LOG_TAG "TimeCheck"
Eric Laurent3528c932018-02-23 17:17:22 -080020
Ytai Ben-Tsvi1ea62c92021-11-10 14:38:27 -080021#include <optional>
22
Andy Hung10ac7112022-03-28 08:00:40 -070023#include <android-base/logging.h>
Atneya Naircce14202022-09-05 20:17:50 -070024#include <android-base/strings.h>
Andy Hunga2a1ac32022-03-18 16:12:11 -070025#include <audio_utils/clock.h>
Andy Hungcb899102024-08-06 15:17:16 -070026#include <cutils/properties.h>
Marco Nelissencf90b492019-09-26 11:20:54 -070027#include <mediautils/EventLog.h>
Andy Hung35f96152022-07-15 15:18:59 -070028#include <mediautils/FixedString.h>
Andy Hung224f82f2022-03-22 00:00:49 -070029#include <mediautils/MethodStatistics.h>
Ytai Ben-Tsvi1ea62c92021-11-10 14:38:27 -080030#include <mediautils/TimeCheck.h>
Atneya Nairf5b68512022-05-23 20:02:49 -040031#include <mediautils/TidWrapper.h>
Ytai Ben-Tsvi1ea62c92021-11-10 14:38:27 -080032#include <utils/Log.h>
Atneya Nairf5b68512022-05-23 20:02:49 -040033
Atneya Naircce14202022-09-05 20:17:50 -070034#if defined(__ANDROID__)
Eric Laurent42896a02019-09-27 15:40:33 -070035#include "debuggerd/handler.h"
Atneya Nairf5b68512022-05-23 20:02:49 -040036#endif
37
Atneya Naircce14202022-09-05 20:17:50 -070038
39namespace android::mediautils {
Andy Hungcb899102024-08-06 15:17:16 -070040
41// Note: The sum of kDefaultTimeOutDurationMs and kDefaultSecondChanceDurationMs
42// should be no less than 2 seconds, otherwise spurious timeouts
43// may occur with system suspend.
44static constexpr int kDefaultTimeoutDurationMs = 3000;
45
46// Due to suspend abort not incrementing the monotonic clock,
47// we allow another second chance timeout after the first timeout expires.
48//
49// The total timeout is therefore kDefaultTimeoutDuration + kDefaultSecondChanceDuration,
50// and the result is more stable when the monotonic clock increments during suspend.
51//
52static constexpr int kDefaultSecondChanceDurationMs = 2000;
53
54/* static */
55TimeCheck::Duration TimeCheck::getDefaultTimeoutDuration() {
56 static constinit std::atomic<int> defaultTimeoutDurationMs{};
57 auto defaultMs = defaultTimeoutDurationMs.load(std::memory_order_relaxed);
58 if (defaultMs == 0) {
59 defaultMs = property_get_int32(
60 "audio.timecheck.timeout_duration_ms", kDefaultTimeoutDurationMs);
61 if (defaultMs < 1) defaultMs = kDefaultTimeoutDurationMs;
62 defaultTimeoutDurationMs.store(defaultMs, std::memory_order_relaxed);
63 }
64 return std::chrono::milliseconds(defaultMs);
65}
66
67/* static */
68TimeCheck::Duration TimeCheck::getDefaultSecondChanceDuration() {
69 static constinit std::atomic<int> defaultSecondChanceDurationMs{};
70 auto defaultMs = defaultSecondChanceDurationMs.load(std::memory_order_relaxed);
71 if (defaultMs == 0) {
72 defaultMs = property_get_int32(
73 "audio.timecheck.second_chance_duration_ms", kDefaultSecondChanceDurationMs);
74 if (defaultMs < 1) defaultMs = kDefaultSecondChanceDurationMs;
75 defaultSecondChanceDurationMs.store(defaultMs, std::memory_order_relaxed);
76 }
77 return std::chrono::milliseconds(defaultMs);
78}
79
Atneya Nairf5b68512022-05-23 20:02:49 -040080// This function appropriately signals a pid to dump a backtrace if we are
Atneya Naircce14202022-09-05 20:17:50 -070081// running on device (and the HAL exists). If we are not running on an Android
82// device, there is no HAL to signal (so we do nothing).
Atneya Nairf5b68512022-05-23 20:02:49 -040083static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
Atneya Naircce14202022-09-05 20:17:50 -070084#if defined(__ANDROID__)
Atneya Nairf5b68512022-05-23 20:02:49 -040085 sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
86#endif
87}
Eric Laurent3528c932018-02-23 17:17:22 -080088
Andy Hunga2a1ac32022-03-18 16:12:11 -070089/**
90 * Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
91 */
Ytai Ben-Tsvi34f26b12021-12-02 13:58:38 -080092std::string formatTime(std::chrono::system_clock::time_point t) {
Andy Hunga2a1ac32022-03-18 16:12:11 -070093 auto time_string = audio_utils_time_string_from_ns(
94 std::chrono::nanoseconds(t.time_since_epoch()).count());
95
96 // The time string is 19 characters (including null termination).
97 // Example: "03-27 16:47:06.187"
98 // MM DD HH MM SS MS
99 // We offset by 6 to get HH:MM:SS.MSc
100 //
101 return time_string.time + 6; // offset to remove month/day.
Ytai Ben-Tsvi34f26b12021-12-02 13:58:38 -0800102}
103
Andy Hunga2a1ac32022-03-18 16:12:11 -0700104/**
105 * Finds the end of the common time prefix.
106 *
107 * This is as an option to remove the common time prefix to avoid
108 * unnecessary duplicated strings.
109 *
110 * \param time1 a time string
111 * \param time2 a time string
112 * \return the position where the common time prefix ends. For abbreviated
113 * printing of time2, offset the character pointer by this position.
114 */
115static size_t commonTimePrefixPosition(std::string_view time1, std::string_view time2) {
116 const size_t endPos = std::min(time1.size(), time2.size());
117 size_t i;
118
119 // Find location of the first mismatch between strings
120 for (i = 0; ; ++i) {
121 if (i == endPos) {
122 return i; // strings match completely to the length of one of the strings.
123 }
124 if (time1[i] != time2[i]) {
125 break;
126 }
127 if (time1[i] == '\0') {
128 return i; // "printed" strings match completely. No need to check further.
129 }
130 }
131
132 // Go backwards until we find a delimeter or space.
133 for (; i > 0
134 && isdigit(time1[i]) // still a number
135 && time1[i - 1] != ' '
136 ; --i) {
137 }
138 return i;
139}
140
141/**
142 * Returns the unique suffix of time2 that isn't present in time1.
143 *
144 * If time2 is identical to time1, then an empty string_view is returned.
145 * This method is used to elide the common prefix when printing times.
146 */
147std::string_view timeSuffix(std::string_view time1, std::string_view time2) {
148 const size_t pos = commonTimePrefixPosition(time1, time2);
149 return time2.substr(pos);
150}
Ytai Ben-Tsvi34f26b12021-12-02 13:58:38 -0800151
Eric Laurent42896a02019-09-27 15:40:33 -0700152// Audio HAL server pids vector used to generate audio HAL processes tombstone
153// when audioserver watchdog triggers.
154// We use a lockless storage to avoid potential deadlocks in the context of watchdog
155// trigger.
156// Protection again simultaneous writes is not needed given one update takes place
157// during AudioFlinger construction and other comes necessarily later once the IAudioFlinger
158// interface is available.
159// The use of an atomic index just guaranties that current vector is fully initialized
160// when read.
161/* static */
162void TimeCheck::accessAudioHalPids(std::vector<pid_t>* pids, bool update) {
163 static constexpr int kNumAudioHalPidsVectors = 3;
164 static std::vector<pid_t> audioHalPids[kNumAudioHalPidsVectors];
Andy Hung5c6d68a2022-03-09 21:54:59 -0800165 static std::atomic<unsigned> curAudioHalPids = 0;
Eric Laurent42896a02019-09-27 15:40:33 -0700166
167 if (update) {
Eric Laurent1ad278b2021-03-05 18:09:01 +0100168 audioHalPids[(curAudioHalPids++ + 1) % kNumAudioHalPidsVectors] = *pids;
Eric Laurent42896a02019-09-27 15:40:33 -0700169 } else {
Eric Laurent1ad278b2021-03-05 18:09:01 +0100170 *pids = audioHalPids[curAudioHalPids % kNumAudioHalPidsVectors];
Eric Laurent42896a02019-09-27 15:40:33 -0700171 }
172}
173
174/* static */
175void TimeCheck::setAudioHalPids(const std::vector<pid_t>& pids) {
176 accessAudioHalPids(&(const_cast<std::vector<pid_t>&>(pids)), true);
177}
178
179/* static */
180std::vector<pid_t> TimeCheck::getAudioHalPids() {
181 std::vector<pid_t> pids;
182 accessAudioHalPids(&pids, false);
183 return pids;
184}
185
Eric Laurent3528c932018-02-23 17:17:22 -0800186/* static */
Andy Hung5c6d68a2022-03-09 21:54:59 -0800187TimerThread& TimeCheck::getTimeCheckThread() {
188 static TimerThread sTimeCheckThread{};
Eric Laurent3528c932018-02-23 17:17:22 -0800189 return sTimeCheckThread;
190}
191
Andy Hunga2a1ac32022-03-18 16:12:11 -0700192/* static */
193std::string TimeCheck::toString() {
194 // note pending and retired are individually locked for maximum concurrency,
195 // snapshot is not instantaneous at a single time.
Atneya Naircce14202022-09-05 20:17:50 -0700196 return getTimeCheckThread().getSnapshotAnalysis().toString();
Andy Hunga2a1ac32022-03-18 16:12:11 -0700197}
198
Andy Hungf8ab0932022-06-13 19:49:43 -0700199TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
200 Duration secondChanceDuration, bool crashOnTimeout)
Andy Hung35f96152022-07-15 15:18:59 -0700201 : mTimeCheckHandler{ std::make_shared<TimeCheckHandler>(
Andy Hungf8ab0932022-06-13 19:49:43 -0700202 tag, std::move(onTimer), crashOnTimeout, requestedTimeoutDuration,
Atneya Nairf5b68512022-05-23 20:02:49 -0400203 secondChanceDuration, std::chrono::system_clock::now(), getThreadIdWrapper()) }
Andy Hungf8ab0932022-06-13 19:49:43 -0700204 , mTimerHandle(requestedTimeoutDuration.count() == 0
205 /* for TimeCheck we don't consider a non-zero secondChanceDuration here */
Andy Hunga2a1ac32022-03-18 16:12:11 -0700206 ? getTimeCheckThread().trackTask(mTimeCheckHandler->tag)
207 : getTimeCheckThread().scheduleTask(
208 mTimeCheckHandler->tag,
209 // Pass in all the arguments by value to this task for safety.
210 // The thread could call the callback before the constructor is finished.
211 // The destructor is not blocked on callback.
Andy Hung2aa15102022-06-13 19:49:43 -0700212 [ timeCheckHandler = mTimeCheckHandler ](TimerThread::Handle timerHandle) {
213 timeCheckHandler->onTimeout(timerHandle);
Andy Hunga2a1ac32022-03-18 16:12:11 -0700214 },
Andy Hungf8ab0932022-06-13 19:49:43 -0700215 requestedTimeoutDuration,
216 secondChanceDuration)) {}
Eric Laurent3528c932018-02-23 17:17:22 -0800217
218TimeCheck::~TimeCheck() {
Andy Hunga2a1ac32022-03-18 16:12:11 -0700219 if (mTimeCheckHandler) {
220 mTimeCheckHandler->onCancel(mTimerHandle);
221 }
Eric Laurent3528c932018-02-23 17:17:22 -0800222}
223
Andy Hung2aa15102022-06-13 19:49:43 -0700224/* static */
225std::string TimeCheck::analyzeTimeouts(
Andy Hungcb899102024-08-06 15:17:16 -0700226 float requestedTimeoutMs, float secondChanceMs,
227 float elapsedSteadyMs, float elapsedSystemMs) {
Andy Hung2aa15102022-06-13 19:49:43 -0700228 // Track any OS clock issues with suspend.
229 // It is possible that the elapsedSystemMs is much greater than elapsedSteadyMs if
230 // a suspend occurs; however, we always expect the timeout ms should always be slightly
231 // less than the elapsed steady ms regardless of whether a suspend occurs or not.
232
Andy Hungcb899102024-08-06 15:17:16 -0700233 const float totalTimeoutMs = requestedTimeoutMs + secondChanceMs;
234 std::string s = std::format(
235 "Timeout ms {:.2f} ({:.2f} + {:.2f})"
236 " elapsed steady ms {:.4f} elapsed system ms {:.4f}",
237 totalTimeoutMs, requestedTimeoutMs, secondChanceMs, elapsedSteadyMs, elapsedSystemMs);
Andy Hung2aa15102022-06-13 19:49:43 -0700238
239 // Is there something unusual?
240 static constexpr float TOLERANCE_CONTEXT_SWITCH_MS = 200.f;
241
Andy Hungcb899102024-08-06 15:17:16 -0700242 if (totalTimeoutMs > elapsedSteadyMs || totalTimeoutMs > elapsedSystemMs) {
Andy Hung2aa15102022-06-13 19:49:43 -0700243 s.append("\nError: early expiration - "
Andy Hungcb899102024-08-06 15:17:16 -0700244 "totalTimeoutMs should be less than elapsed time");
Andy Hung2aa15102022-06-13 19:49:43 -0700245 }
246
247 if (elapsedSteadyMs > elapsedSystemMs + TOLERANCE_CONTEXT_SWITCH_MS) {
248 s.append("\nWarning: steady time should not advance faster than system time");
249 }
250
251 // This has been found in suspend stress testing.
Andy Hungcb899102024-08-06 15:17:16 -0700252 if (elapsedSteadyMs > totalTimeoutMs + TOLERANCE_CONTEXT_SWITCH_MS) {
Andy Hung2aa15102022-06-13 19:49:43 -0700253 s.append("\nWarning: steady time significantly exceeds timeout "
254 "- possible thread stall or aborted suspend");
255 }
256
257 // This has been found in suspend stress testing.
Andy Hungcb899102024-08-06 15:17:16 -0700258 if (elapsedSystemMs > totalTimeoutMs + TOLERANCE_CONTEXT_SWITCH_MS) {
Andy Hung2aa15102022-06-13 19:49:43 -0700259 s.append("\nInformation: system time significantly exceeds timeout "
260 "- possible suspend");
261 }
262 return s;
263}
264
265// To avoid any potential race conditions, the timer handle
266// (expiration = clock steady start + timeout) is passed into the callback.
Andy Hung5c6d68a2022-03-09 21:54:59 -0800267void TimeCheck::TimeCheckHandler::onCancel(TimerThread::Handle timerHandle) const
268{
269 if (TimeCheck::getTimeCheckThread().cancelTask(timerHandle) && onTimer) {
Andy Hung2aa15102022-06-13 19:49:43 -0700270 const std::chrono::steady_clock::time_point endSteadyTime =
271 std::chrono::steady_clock::now();
272 const float elapsedSteadyMs = std::chrono::duration_cast<FloatMs>(
273 endSteadyTime - timerHandle + timeoutDuration).count();
274 // send the elapsed steady time for statistics.
275 onTimer(false /* timeout */, elapsedSteadyMs);
Andy Hung5c6d68a2022-03-09 21:54:59 -0800276 }
277}
278
Andy Hung2aa15102022-06-13 19:49:43 -0700279// To avoid any potential race conditions, the timer handle
280// (expiration = clock steady start + timeout) is passed into the callback.
281void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) const
Andy Hung5c6d68a2022-03-09 21:54:59 -0800282{
Andy Hung2aa15102022-06-13 19:49:43 -0700283 const std::chrono::steady_clock::time_point endSteadyTime = std::chrono::steady_clock::now();
284 const std::chrono::system_clock::time_point endSystemTime = std::chrono::system_clock::now();
285 // timerHandle incorporates the timeout
286 const float elapsedSteadyMs = std::chrono::duration_cast<FloatMs>(
287 endSteadyTime - (timerHandle - timeoutDuration)).count();
288 const float elapsedSystemMs = std::chrono::duration_cast<FloatMs>(
289 endSystemTime - startSystemTime).count();
290 const float requestedTimeoutMs = std::chrono::duration_cast<FloatMs>(
291 timeoutDuration).count();
Andy Hungf8ab0932022-06-13 19:49:43 -0700292 const float secondChanceMs = std::chrono::duration_cast<FloatMs>(
293 secondChanceDuration).count();
Andy Hung2aa15102022-06-13 19:49:43 -0700294
Andy Hung5c6d68a2022-03-09 21:54:59 -0800295 if (onTimer) {
Andy Hung2aa15102022-06-13 19:49:43 -0700296 onTimer(true /* timeout */, elapsedSteadyMs);
Andy Hung5c6d68a2022-03-09 21:54:59 -0800297 }
298
299 if (!crashOnTimeout) return;
Ytai Ben-Tsvi34f26b12021-12-02 13:58:38 -0800300
Andy Hunga2a1ac32022-03-18 16:12:11 -0700301 // Generate the TimerThread summary string early before sending signals to the
302 // HAL processes which can affect thread behavior.
Atneya Naircce14202022-09-05 20:17:50 -0700303 const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);
Andy Hunga2a1ac32022-03-18 16:12:11 -0700304
Ytai Ben-Tsvi1ea62c92021-11-10 14:38:27 -0800305 // Generate audio HAL processes tombstones and allow time to complete
306 // before forcing restart
Andy Hung5c6d68a2022-03-09 21:54:59 -0800307 std::vector<pid_t> pids = TimeCheck::getAudioHalPids();
Jaideep Sharmaeaf9d9c2022-05-13 15:15:27 +0530308 std::string halPids = "HAL pids [ ";
Ytai Ben-Tsvi1ea62c92021-11-10 14:38:27 -0800309 if (pids.size() != 0) {
310 for (const auto& pid : pids) {
311 ALOGI("requesting tombstone for pid: %d", pid);
Jaideep Sharmaeaf9d9c2022-05-13 15:15:27 +0530312 halPids.append(std::to_string(pid)).append(" ");
Atneya Nairf5b68512022-05-23 20:02:49 -0400313 signalAudioHAL(pid);
Eric Laurent3528c932018-02-23 17:17:22 -0800314 }
Ytai Ben-Tsvi1ea62c92021-11-10 14:38:27 -0800315 sleep(1);
316 } else {
317 ALOGI("No HAL process pid available, skipping tombstones");
Eric Laurent39b09b52018-06-29 12:24:40 -0700318 }
Jaideep Sharmaeaf9d9c2022-05-13 15:15:27 +0530319 halPids.append("]");
Andy Hungf45f34c2022-03-25 13:09:03 -0700320
Andy Hung5c6d68a2022-03-09 21:54:59 -0800321 LOG_EVENT_STRING(LOGTAG_AUDIO_BINDER_TIMEOUT, tag.c_str());
Andy Hung10ac7112022-03-28 08:00:40 -0700322
323 // Create abort message string - caution: this can be very large.
324 const std::string abortMessage = std::string("TimeCheck timeout for ")
325 .append(tag)
Andy Hung2aa15102022-06-13 19:49:43 -0700326 .append(" scheduled ").append(formatTime(startSystemTime))
Andy Hung10ac7112022-03-28 08:00:40 -0700327 .append(" on thread ").append(std::to_string(tid)).append("\n")
Andy Hungcb899102024-08-06 15:17:16 -0700328 .append(analyzeTimeouts(requestedTimeoutMs, secondChanceMs,
Andy Hungf8ab0932022-06-13 19:49:43 -0700329 elapsedSteadyMs, elapsedSystemMs)).append("\n")
Jaideep Sharmaeaf9d9c2022-05-13 15:15:27 +0530330 .append(halPids).append("\n")
Atneya Naircce14202022-09-05 20:17:50 -0700331 .append(snapshotAnalysis.toString());
Andy Hung10ac7112022-03-28 08:00:40 -0700332
Andy Hung4c544c32023-11-03 15:56:24 -0700333 // In many cases, the initial timeout stack differs from the abort backtrace because
334 // (1) the time difference between initial timeout and the final abort signal
335 // and (2) signalling the HAL audio service may cause
336 // the thread to unblock and continue.
337
Andy Hung10ac7112022-03-28 08:00:40 -0700338 // Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
339 // Log message text may be truncated to less than an
340 // implementation-specific limit (1023 bytes).
341 //
342 // Here, we send the string through android-base/logging.h LOG()
343 // to avoid the size limitation. LOG(FATAL) does an abort whereas
344 // LOG(FATAL_WITHOUT_ABORT) does not abort.
345
Atneya Naircce14202022-09-05 20:17:50 -0700346 static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
347 pid_t tidToAbort = invalidPid;
348 if (snapshotAnalysis.suspectTid != invalidPid) {
349 tidToAbort = snapshotAnalysis.suspectTid;
350 } else if (snapshotAnalysis.timeoutTid != invalidPid) {
351 tidToAbort = snapshotAnalysis.timeoutTid;
352 }
353
354 LOG(FATAL_WITHOUT_ABORT) << abortMessage;
355 const auto ret = abortTid(tidToAbort);
356 if (ret < 0) {
357 LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
358 "errno: " << errno << base::ErrnoNumberAsString(errno);
359 }
Eric Laurent3528c932018-02-23 17:17:22 -0800360}
361
Andy Hung224f82f2022-03-22 00:00:49 -0700362// Automatically create a TimeCheck class for a class and method.
Mikhail Naganov31d46652023-01-10 18:29:25 +0000363// This is used for Audio HAL support.
Andy Hung224f82f2022-03-22 00:00:49 -0700364mediautils::TimeCheck makeTimeCheckStatsForClassMethod(
365 std::string_view className, std::string_view methodName) {
366 std::shared_ptr<MethodStatistics<std::string>> statistics =
367 mediautils::getStatisticsForClass(className);
368 if (!statistics) return {}; // empty TimeCheck.
369 return mediautils::TimeCheck(
Andy Hung35f96152022-07-15 15:18:59 -0700370 FixedString62(className).append("::").append(methodName),
371 [ safeMethodName = FixedString30(methodName),
Andy Hung224f82f2022-03-22 00:00:49 -0700372 stats = std::move(statistics) ]
373 (bool timeout, float elapsedMs) {
374 if (timeout) {
375 ; // ignored, there is no timeout value.
376 } else {
Andy Hung35f96152022-07-15 15:18:59 -0700377 stats->event(safeMethodName.asStringView(), elapsedMs);
Andy Hung224f82f2022-03-22 00:00:49 -0700378 }
Andy Hungf8ab0932022-06-13 19:49:43 -0700379 }, {} /* timeoutDuration */, {} /* secondChanceDuration */, false /* crashOnTimeout */);
Andy Hung224f82f2022-03-22 00:00:49 -0700380}
381
Andy Hung5c6d68a2022-03-09 21:54:59 -0800382} // namespace android::mediautils