blob: baed5ca5a5cf7af54946a8381ea173cbd224f393 [file] [log] [blame]
Janis Danisevskis7e13aa02021-05-04 14:34:41 -07001// Copyright 2021, The Android Open Source Project
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Can be removed when instrumentations are added to keystore.
16#![allow(dead_code)]
17
18//! This module implements a watchdog thread.
19
20use std::{
21 cmp::min,
22 collections::HashMap,
23 sync::Arc,
24 sync::{Condvar, Mutex, MutexGuard},
25 thread,
26};
27use std::{
28 marker::PhantomData,
29 time::{Duration, Instant},
30};
31
32/// Represents a Watchdog record. It can be created with `Watchdog::watch` or
33/// `Watchdog::watch_with`. It disarms the record when dropped.
34pub struct WatchPoint {
35 id: &'static str,
36 wd: Arc<Watchdog>,
37 not_send: PhantomData<*mut ()>, // WatchPoint must not be Send.
38}
39
40impl Drop for WatchPoint {
41 fn drop(&mut self) {
42 self.wd.disarm(self.id)
43 }
44}
45
46#[derive(Debug, PartialEq, Eq)]
47enum State {
48 NotRunning,
49 Running,
50}
51
52#[derive(Debug, Clone, Hash, PartialEq, Eq)]
53struct Index {
54 tid: thread::ThreadId,
55 id: &'static str,
56}
57
58struct Record {
59 started: Instant,
60 deadline: Instant,
David Drysdale387c85b2024-06-10 14:40:45 +010061 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070062}
63
64struct WatchdogState {
65 state: State,
66 thread: Option<thread::JoinHandle<()>>,
David Drysdale387c85b2024-06-10 14:40:45 +010067 /// How long to wait before dropping the watchdog thread when idle.
68 idle_timeout: Duration,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070069 records: HashMap<Index, Record>,
David Drysdale387c85b2024-06-10 14:40:45 +010070 last_report: Option<Instant>,
Shaquille Johnsonb0883512024-07-07 20:12:18 +000071 noisy_timeout: Duration,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070072}
73
74impl WatchdogState {
Shaquille Johnsonb0883512024-07-07 20:12:18 +000075 /// If we have overdue records, we want to log them but slowly backoff
76 /// so that we do not clog the logs. We start with logs every
77 /// `MIN_REPORT_TIMEOUT` sec then increment the timeout by 5 up
78 /// to a maximum of `MAX_REPORT_TIMEOUT`.
79 const MIN_REPORT_TIMEOUT: Duration = Duration::from_secs(1);
80 const MAX_REPORT_TIMEOUT: Duration = Duration::from_secs(30);
81
82 fn reset_noisy_timeout(&mut self) {
83 self.noisy_timeout = Self::MIN_REPORT_TIMEOUT;
84 }
85
86 fn update_noisy_timeout(&mut self) {
87 let noisy_update = self.noisy_timeout + Duration::from_secs(5);
88 self.noisy_timeout = min(Self::MAX_REPORT_TIMEOUT, noisy_update);
89 }
90
David Drysdale387c85b2024-06-10 14:40:45 +010091 fn overdue_and_next_timeout(&self) -> (bool, Option<Duration>) {
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070092 let now = Instant::now();
93 let mut next_timeout: Option<Duration> = None;
Janis Danisevskisd1d99172021-05-06 08:21:43 -070094 let mut has_overdue = false;
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070095 for (_, r) in self.records.iter() {
96 let timeout = r.deadline.saturating_duration_since(now);
97 if timeout == Duration::new(0, 0) {
David Drysdale387c85b2024-06-10 14:40:45 +010098 // This timeout has passed.
Janis Danisevskisd1d99172021-05-06 08:21:43 -070099 has_overdue = true;
David Drysdale387c85b2024-06-10 14:40:45 +0100100 } else {
101 // This timeout is still to come; see if it's the closest one to now.
102 next_timeout = match next_timeout {
103 Some(nt) if timeout < nt => Some(timeout),
104 Some(nt) => Some(nt),
105 None => Some(timeout),
106 };
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700107 }
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700108 }
Janis Danisevskisd1d99172021-05-06 08:21:43 -0700109 (has_overdue, next_timeout)
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700110 }
111
David Drysdale387c85b2024-06-10 14:40:45 +0100112 fn log_report(&mut self, has_overdue: bool) {
113 if !has_overdue {
114 // Nothing to report.
115 self.last_report = None;
116 return;
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700117 }
David Drysdale387c85b2024-06-10 14:40:45 +0100118 // Something to report...
119 if let Some(reported_at) = self.last_report {
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000120 if reported_at.elapsed() < self.noisy_timeout {
David Drysdale387c85b2024-06-10 14:40:45 +0100121 // .. but it's too soon since the last report.
122 self.last_report = None;
123 return;
124 }
125 }
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000126 self.update_noisy_timeout();
David Drysdale387c85b2024-06-10 14:40:45 +0100127 self.last_report = Some(Instant::now());
Janis Danisevskis9bdc4302022-01-31 14:23:12 -0800128 log::warn!("### Keystore Watchdog report - BEGIN ###");
129
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700130 let now = Instant::now();
Janis Danisevskis9bdc4302022-01-31 14:23:12 -0800131 let mut overdue_records: Vec<(&Index, &Record)> = self
132 .records
133 .iter()
134 .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0))
135 .collect();
136
137 log::warn!("When extracting from a bug report, please include this header");
138 log::warn!("and all {} records below.", overdue_records.len());
139
140 // Watch points can be nested, i.e., a single thread may have multiple armed
141 // watch points. And the most recent on each thread (thread recent) is closest to the point
142 // where something is blocked. Furthermore, keystore2 has various critical section
143 // and common backend resources KeyMint that can only be entered serialized. So if one
144 // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch
145 // point is most likely pointing toward the culprit.
146 // Thus, sort by start time first.
147 overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started));
148 // Then we groups all of the watch points per thread preserving the order within
149 // groups.
150 let groups = overdue_records.iter().fold(
151 HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(),
152 |mut acc, (i, r)| {
153 acc.entry(i.tid).or_default().push((i, r));
154 acc
155 },
156 );
157 // Put the groups back into a vector.
Charisee03e00842023-01-25 01:41:23 +0000158 let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_values().collect();
Janis Danisevskis9bdc4302022-01-31 14:23:12 -0800159 // Sort the groups by start time of the most recent (.last()) of each group.
160 // It is panic safe to use unwrap() here because we never add empty vectors to
161 // the map.
162 groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started));
163
164 for g in groups.iter() {
165 for (i, r) in g.iter() {
David Drysdale387c85b2024-06-10 14:40:45 +0100166 match &r.context {
167 Some(ctx) => {
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700168 log::warn!(
David Drysdale387c85b2024-06-10 14:40:45 +0100169 "{:?} {} Pending: {:?} Overdue {:?} for {:?}",
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700170 i.tid,
171 i.id,
172 r.started.elapsed(),
173 r.deadline.elapsed(),
David Drysdale387c85b2024-06-10 14:40:45 +0100174 ctx
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700175 );
176 }
177 None => {
178 log::warn!(
179 "{:?} {} Pending: {:?} Overdue {:?}",
180 i.tid,
181 i.id,
182 r.started.elapsed(),
183 r.deadline.elapsed()
184 );
185 }
186 }
187 }
188 }
Janis Danisevskis9bdc4302022-01-31 14:23:12 -0800189 log::warn!("### Keystore Watchdog report - END ###");
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700190 }
191
192 fn disarm(&mut self, index: Index) {
193 self.records.remove(&index);
194 }
195
196 fn arm(&mut self, index: Index, record: Record) {
197 if self.records.insert(index.clone(), record).is_some() {
198 log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index);
199 }
200 }
201}
202
203/// Watchdog spawns a thread that logs records of all overdue watch points when a deadline
204/// is missed and at least every second as long as overdue watch points exist.
205/// The thread terminates when idle for a given period of time.
206pub struct Watchdog {
207 state: Arc<(Condvar, Mutex<WatchdogState>)>,
208}
209
210impl Watchdog {
David Drysdale387c85b2024-06-10 14:40:45 +0100211 /// Construct a [`Watchdog`]. When `idle_timeout` has elapsed since the watchdog thread became
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700212 /// idle, i.e., there are no more active or overdue watch points, the watchdog thread
213 /// terminates.
David Drysdale387c85b2024-06-10 14:40:45 +0100214 pub fn new(idle_timeout: Duration) -> Arc<Self> {
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700215 Arc::new(Self {
216 state: Arc::new((
217 Condvar::new(),
218 Mutex::new(WatchdogState {
219 state: State::NotRunning,
220 thread: None,
David Drysdale387c85b2024-06-10 14:40:45 +0100221 idle_timeout,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700222 records: HashMap::new(),
David Drysdale387c85b2024-06-10 14:40:45 +0100223 last_report: None,
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000224 noisy_timeout: WatchdogState::MIN_REPORT_TIMEOUT,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700225 }),
226 )),
227 })
228 }
229
230 fn watch_with_optional(
David Drysdale387c85b2024-06-10 14:40:45 +0100231 wd: Arc<Self>,
232 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700233 id: &'static str,
234 timeout: Duration,
235 ) -> Option<WatchPoint> {
David Drysdale387c85b2024-06-10 14:40:45 +0100236 let Some(deadline) = Instant::now().checked_add(timeout) else {
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700237 log::warn!("Deadline computation failed for WatchPoint \"{}\"", id);
238 log::warn!("WatchPoint not armed.");
239 return None;
David Drysdale387c85b2024-06-10 14:40:45 +0100240 };
241 wd.arm(context, id, deadline);
242 Some(WatchPoint { id, wd, not_send: Default::default() })
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700243 }
244
245 /// Create a new watch point. If the WatchPoint is not dropped before the timeout
246 /// expires, a report is logged at least every second, which includes the id string
David Drysdale387c85b2024-06-10 14:40:45 +0100247 /// and any provided context.
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700248 pub fn watch_with(
249 wd: &Arc<Self>,
250 id: &'static str,
251 timeout: Duration,
David Drysdale387c85b2024-06-10 14:40:45 +0100252 context: impl std::fmt::Debug + Send + 'static,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700253 ) -> Option<WatchPoint> {
David Drysdale387c85b2024-06-10 14:40:45 +0100254 Self::watch_with_optional(wd.clone(), Some(Box::new(context)), id, timeout)
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700255 }
256
David Drysdale387c85b2024-06-10 14:40:45 +0100257 /// Like `watch_with`, but without context.
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700258 pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> {
David Drysdale387c85b2024-06-10 14:40:45 +0100259 Self::watch_with_optional(wd.clone(), None, id, timeout)
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700260 }
261
262 fn arm(
263 &self,
David Drysdale387c85b2024-06-10 14:40:45 +0100264 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700265 id: &'static str,
266 deadline: Instant,
267 ) {
268 let tid = thread::current().id();
269 let index = Index { tid, id };
David Drysdale387c85b2024-06-10 14:40:45 +0100270 let record = Record { started: Instant::now(), deadline, context };
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700271
272 let (ref condvar, ref state) = *self.state;
273
274 let mut state = state.lock().unwrap();
275 state.arm(index, record);
276
277 if state.state != State::Running {
278 self.spawn_thread(&mut state);
279 }
280 drop(state);
281 condvar.notify_all();
282 }
283
284 fn disarm(&self, id: &'static str) {
285 let tid = thread::current().id();
286 let index = Index { tid, id };
287 let (_, ref state) = *self.state;
288
289 let mut state = state.lock().unwrap();
290 state.disarm(index);
291 // There is no need to notify condvar. There is no action required for the
292 // watchdog thread before the next deadline.
293 }
294
295 fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) {
296 if let Some(t) = state.thread.take() {
297 t.join().expect("Watchdog thread panicked.");
298 }
299
300 let cloned_state = self.state.clone();
301
302 state.thread = Some(thread::spawn(move || {
303 let (ref condvar, ref state) = *cloned_state;
304
305 let mut state = state.lock().unwrap();
306
307 loop {
David Drysdale387c85b2024-06-10 14:40:45 +0100308 let (has_overdue, next_timeout) = state.overdue_and_next_timeout();
Janis Danisevskisd1d99172021-05-06 08:21:43 -0700309 state.log_report(has_overdue);
David Drysdale387c85b2024-06-10 14:40:45 +0100310
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700311 let (next_timeout, idle) = match (has_overdue, next_timeout) {
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000312 (true, Some(next_timeout)) => (min(next_timeout, state.noisy_timeout), false),
313 (true, None) => (state.noisy_timeout, false),
David Drysdale387c85b2024-06-10 14:40:45 +0100314 (false, Some(next_timeout)) => (next_timeout, false),
315 (false, None) => (state.idle_timeout, true),
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700316 };
317
David Drysdale387c85b2024-06-10 14:40:45 +0100318 // Wait until the closest timeout pops, but use a condition variable so that if a
319 // new watchpoint is started in the meanwhile it will interrupt the wait so we can
320 // recalculate.
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700321 let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap();
322 state = s;
323
324 if idle && timeout.timed_out() && state.records.is_empty() {
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000325 state.reset_noisy_timeout();
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700326 state.state = State::NotRunning;
327 break;
328 }
329 }
Janis Danisevskis2ee014b2021-05-05 14:29:08 -0700330 log::info!("Watchdog thread idle -> terminating. Have a great day.");
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700331 }));
332 state.state = State::Running;
333 }
334}
335
336#[cfg(test)]
337mod tests {
338
339 use super::*;
340 use std::sync::atomic;
341 use std::thread;
342 use std::time::Duration;
343
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000344 /// Count the number of times `Debug::fmt` is invoked.
345 #[derive(Default, Clone)]
346 struct DebugCounter(Arc<atomic::AtomicU8>);
347 impl DebugCounter {
348 fn value(&self) -> u8 {
349 self.0.load(atomic::Ordering::Relaxed)
350 }
351 }
352 impl std::fmt::Debug for DebugCounter {
353 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
354 let count = self.0.fetch_add(1, atomic::Ordering::Relaxed);
355 write!(f, "hit_count: {count}")
356 }
357 }
358
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700359 #[test]
360 fn test_watchdog() {
361 android_logger::init_once(
362 android_logger::Config::default()
363 .with_tag("keystore2_watchdog_tests")
Jeff Vander Stoep153d1aa2024-02-07 14:33:36 +0100364 .with_max_level(log::LevelFilter::Debug),
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700365 );
366
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000367 let wd = Watchdog::new(Duration::from_secs(3));
David Drysdale387c85b2024-06-10 14:40:45 +0100368 let hit_counter = DebugCounter::default();
369 let wp = Watchdog::watch_with(
370 &wd,
371 "test_watchdog",
372 Duration::from_millis(100),
373 hit_counter.clone(),
374 );
375 assert_eq!(0, hit_counter.value());
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700376 thread::sleep(Duration::from_millis(500));
David Drysdale387c85b2024-06-10 14:40:45 +0100377 assert_eq!(1, hit_counter.value());
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000378 thread::sleep(Duration::from_secs(1));
379 assert_eq!(1, hit_counter.value());
380
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700381 drop(wp);
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000382 thread::sleep(Duration::from_secs(10));
383 assert_eq!(1, hit_counter.value());
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700384 let (_, ref state) = *wd.state;
385 let state = state.lock().unwrap();
386 assert_eq!(state.state, State::NotRunning);
387 }
Shaquille Johnsonb0883512024-07-07 20:12:18 +0000388
389 #[test]
390 fn test_watchdog_backoff() {
391 android_logger::init_once(
392 android_logger::Config::default()
393 .with_tag("keystore2_watchdog_tests")
394 .with_max_level(log::LevelFilter::Debug),
395 );
396
397 let wd = Watchdog::new(Duration::from_secs(3));
398 let hit_counter = DebugCounter::default();
399 let wp = Watchdog::watch_with(
400 &wd,
401 "test_watchdog",
402 Duration::from_millis(100),
403 hit_counter.clone(),
404 );
405 assert_eq!(0, hit_counter.value());
406 thread::sleep(Duration::from_millis(500));
407 assert_eq!(1, hit_counter.value());
408 thread::sleep(Duration::from_secs(6));
409 assert_eq!(2, hit_counter.value());
410 thread::sleep(Duration::from_secs(11));
411 assert_eq!(3, hit_counter.value());
412
413 drop(wp);
414 thread::sleep(Duration::from_secs(4));
415 assert_eq!(3, hit_counter.value());
416 }
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700417}