blob: fa4620a8fc237308c79852c8f10aa4d89d65ba3d [file] [log] [blame]
Janis Danisevskis7e13aa02021-05-04 14:34:41 -07001// Copyright 2021, The Android Open Source Project
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Can be removed when instrumentations are added to keystore.
16#![allow(dead_code)]
17
18//! This module implements a watchdog thread.
19
20use std::{
21 cmp::min,
22 collections::HashMap,
23 sync::Arc,
24 sync::{Condvar, Mutex, MutexGuard},
25 thread,
26};
27use std::{
28 marker::PhantomData,
29 time::{Duration, Instant},
30};
31
32/// Represents a Watchdog record. It can be created with `Watchdog::watch` or
33/// `Watchdog::watch_with`. It disarms the record when dropped.
34pub struct WatchPoint {
35 id: &'static str,
36 wd: Arc<Watchdog>,
37 not_send: PhantomData<*mut ()>, // WatchPoint must not be Send.
38}
39
40impl Drop for WatchPoint {
41 fn drop(&mut self) {
42 self.wd.disarm(self.id)
43 }
44}
45
46#[derive(Debug, PartialEq, Eq)]
47enum State {
48 NotRunning,
49 Running,
50}
51
52#[derive(Debug, Clone, Hash, PartialEq, Eq)]
53struct Index {
54 tid: thread::ThreadId,
55 id: &'static str,
56}
57
58struct Record {
59 started: Instant,
60 deadline: Instant,
61 callback: Option<Box<dyn Fn() -> String + Send + 'static>>,
62}
63
64struct WatchdogState {
65 state: State,
66 thread: Option<thread::JoinHandle<()>>,
67 timeout: Duration,
68 records: HashMap<Index, Record>,
Janis Danisevskisd1d99172021-05-06 08:21:43 -070069 last_report: Instant,
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070070 has_overdue: bool,
71}
72
73impl WatchdogState {
Janis Danisevskisd1d99172021-05-06 08:21:43 -070074 fn update_overdue_and_find_next_timeout(&mut self) -> (bool, Option<Duration>) {
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070075 let now = Instant::now();
76 let mut next_timeout: Option<Duration> = None;
Janis Danisevskisd1d99172021-05-06 08:21:43 -070077 let mut has_overdue = false;
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070078 for (_, r) in self.records.iter() {
79 let timeout = r.deadline.saturating_duration_since(now);
80 if timeout == Duration::new(0, 0) {
Janis Danisevskisd1d99172021-05-06 08:21:43 -070081 has_overdue = true;
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070082 continue;
83 }
84 next_timeout = match next_timeout {
85 Some(nt) => {
86 if timeout < nt {
87 Some(timeout)
88 } else {
89 Some(nt)
90 }
91 }
92 None => Some(timeout),
93 };
94 }
Janis Danisevskisd1d99172021-05-06 08:21:43 -070095 (has_overdue, next_timeout)
Janis Danisevskis7e13aa02021-05-04 14:34:41 -070096 }
97
Janis Danisevskisd1d99172021-05-06 08:21:43 -070098 fn log_report(&mut self, has_overdue: bool) -> bool {
99 match (self.has_overdue, has_overdue) {
100 (true, true) => {
101 if self.last_report.elapsed() < Watchdog::NOISY_REPORT_TIMEOUT {
102 self.has_overdue = false;
103 return false;
104 }
105 }
106 (_, false) => {
107 self.has_overdue = false;
108 return false;
109 }
110 (false, true) => {}
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700111 }
Janis Danisevskisd1d99172021-05-06 08:21:43 -0700112 self.last_report = Instant::now();
113 self.has_overdue = has_overdue;
Janis Danisevskis9bdc4302022-01-31 14:23:12 -0800114 log::warn!("### Keystore Watchdog report - BEGIN ###");
115
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700116 let now = Instant::now();
Janis Danisevskis9bdc4302022-01-31 14:23:12 -0800117 let mut overdue_records: Vec<(&Index, &Record)> = self
118 .records
119 .iter()
120 .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0))
121 .collect();
122
123 log::warn!("When extracting from a bug report, please include this header");
124 log::warn!("and all {} records below.", overdue_records.len());
125
126 // Watch points can be nested, i.e., a single thread may have multiple armed
127 // watch points. And the most recent on each thread (thread recent) is closest to the point
128 // where something is blocked. Furthermore, keystore2 has various critical section
129 // and common backend resources KeyMint that can only be entered serialized. So if one
130 // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch
131 // point is most likely pointing toward the culprit.
132 // Thus, sort by start time first.
133 overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started));
134 // Then we groups all of the watch points per thread preserving the order within
135 // groups.
136 let groups = overdue_records.iter().fold(
137 HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(),
138 |mut acc, (i, r)| {
139 acc.entry(i.tid).or_default().push((i, r));
140 acc
141 },
142 );
143 // Put the groups back into a vector.
Charisee03e00842023-01-25 01:41:23 +0000144 let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_values().collect();
Janis Danisevskis9bdc4302022-01-31 14:23:12 -0800145 // Sort the groups by start time of the most recent (.last()) of each group.
146 // It is panic safe to use unwrap() here because we never add empty vectors to
147 // the map.
148 groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started));
149
150 for g in groups.iter() {
151 for (i, r) in g.iter() {
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700152 match &r.callback {
153 Some(cb) => {
154 log::warn!(
155 "{:?} {} Pending: {:?} Overdue {:?}: {}",
156 i.tid,
157 i.id,
158 r.started.elapsed(),
159 r.deadline.elapsed(),
160 (cb)()
161 );
162 }
163 None => {
164 log::warn!(
165 "{:?} {} Pending: {:?} Overdue {:?}",
166 i.tid,
167 i.id,
168 r.started.elapsed(),
169 r.deadline.elapsed()
170 );
171 }
172 }
173 }
174 }
Janis Danisevskis9bdc4302022-01-31 14:23:12 -0800175 log::warn!("### Keystore Watchdog report - END ###");
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700176 true
177 }
178
179 fn disarm(&mut self, index: Index) {
180 self.records.remove(&index);
181 }
182
183 fn arm(&mut self, index: Index, record: Record) {
184 if self.records.insert(index.clone(), record).is_some() {
185 log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index);
186 }
187 }
188}
189
190/// Watchdog spawns a thread that logs records of all overdue watch points when a deadline
191/// is missed and at least every second as long as overdue watch points exist.
192/// The thread terminates when idle for a given period of time.
193pub struct Watchdog {
194 state: Arc<(Condvar, Mutex<WatchdogState>)>,
195}
196
197impl Watchdog {
198 /// If we have overdue records, we want to be noisy about it and log a report
199 /// at least every `NOISY_REPORT_TIMEOUT` interval.
200 const NOISY_REPORT_TIMEOUT: Duration = Duration::from_secs(1);
201
202 /// Construct a [`Watchdog`]. When `timeout` has elapsed since the watchdog thread became
203 /// idle, i.e., there are no more active or overdue watch points, the watchdog thread
204 /// terminates.
205 pub fn new(timeout: Duration) -> Arc<Self> {
206 Arc::new(Self {
207 state: Arc::new((
208 Condvar::new(),
209 Mutex::new(WatchdogState {
210 state: State::NotRunning,
211 thread: None,
212 timeout,
213 records: HashMap::new(),
Janis Danisevskisd1d99172021-05-06 08:21:43 -0700214 last_report: Instant::now(),
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700215 has_overdue: false,
216 }),
217 )),
218 })
219 }
220
221 fn watch_with_optional(
222 wd: &Arc<Self>,
223 callback: Option<Box<dyn Fn() -> String + Send + 'static>>,
224 id: &'static str,
225 timeout: Duration,
226 ) -> Option<WatchPoint> {
227 let deadline = Instant::now().checked_add(timeout);
228 if deadline.is_none() {
229 log::warn!("Deadline computation failed for WatchPoint \"{}\"", id);
230 log::warn!("WatchPoint not armed.");
231 return None;
232 }
233 wd.arm(callback, id, deadline.unwrap());
234 Some(WatchPoint { id, wd: wd.clone(), not_send: Default::default() })
235 }
236
237 /// Create a new watch point. If the WatchPoint is not dropped before the timeout
238 /// expires, a report is logged at least every second, which includes the id string
239 /// and whatever string the callback returns.
240 pub fn watch_with(
241 wd: &Arc<Self>,
242 id: &'static str,
243 timeout: Duration,
244 callback: impl Fn() -> String + Send + 'static,
245 ) -> Option<WatchPoint> {
246 Self::watch_with_optional(wd, Some(Box::new(callback)), id, timeout)
247 }
248
249 /// Like `watch_with`, but without a callback.
250 pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> {
251 Self::watch_with_optional(wd, None, id, timeout)
252 }
253
254 fn arm(
255 &self,
256 callback: Option<Box<dyn Fn() -> String + Send + 'static>>,
257 id: &'static str,
258 deadline: Instant,
259 ) {
260 let tid = thread::current().id();
261 let index = Index { tid, id };
262 let record = Record { started: Instant::now(), deadline, callback };
263
264 let (ref condvar, ref state) = *self.state;
265
266 let mut state = state.lock().unwrap();
267 state.arm(index, record);
268
269 if state.state != State::Running {
270 self.spawn_thread(&mut state);
271 }
272 drop(state);
273 condvar.notify_all();
274 }
275
276 fn disarm(&self, id: &'static str) {
277 let tid = thread::current().id();
278 let index = Index { tid, id };
279 let (_, ref state) = *self.state;
280
281 let mut state = state.lock().unwrap();
282 state.disarm(index);
283 // There is no need to notify condvar. There is no action required for the
284 // watchdog thread before the next deadline.
285 }
286
287 fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) {
288 if let Some(t) = state.thread.take() {
289 t.join().expect("Watchdog thread panicked.");
290 }
291
292 let cloned_state = self.state.clone();
293
294 state.thread = Some(thread::spawn(move || {
295 let (ref condvar, ref state) = *cloned_state;
296
297 let mut state = state.lock().unwrap();
298
299 loop {
Janis Danisevskisd1d99172021-05-06 08:21:43 -0700300 let (has_overdue, next_timeout) = state.update_overdue_and_find_next_timeout();
301 state.log_report(has_overdue);
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700302 let (next_timeout, idle) = match (has_overdue, next_timeout) {
303 (true, Some(next_timeout)) => {
304 (min(next_timeout, Self::NOISY_REPORT_TIMEOUT), false)
305 }
306 (false, Some(next_timeout)) => (next_timeout, false),
307 (true, None) => (Self::NOISY_REPORT_TIMEOUT, false),
308 (false, None) => (state.timeout, true),
309 };
310
311 let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap();
312 state = s;
313
314 if idle && timeout.timed_out() && state.records.is_empty() {
315 state.state = State::NotRunning;
316 break;
317 }
318 }
Janis Danisevskis2ee014b2021-05-05 14:29:08 -0700319 log::info!("Watchdog thread idle -> terminating. Have a great day.");
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700320 }));
321 state.state = State::Running;
322 }
323}
324
325#[cfg(test)]
326mod tests {
327
328 use super::*;
329 use std::sync::atomic;
330 use std::thread;
331 use std::time::Duration;
332
333 #[test]
334 fn test_watchdog() {
335 android_logger::init_once(
336 android_logger::Config::default()
337 .with_tag("keystore2_watchdog_tests")
Jeff Vander Stoep153d1aa2024-02-07 14:33:36 +0100338 .with_max_level(log::LevelFilter::Debug),
Janis Danisevskis7e13aa02021-05-04 14:34:41 -0700339 );
340
341 let wd = Watchdog::new(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(3).unwrap());
342 let hit_count = Arc::new(atomic::AtomicU8::new(0));
343 let hit_count_clone = hit_count.clone();
344 let wp =
345 Watchdog::watch_with(&wd, "test_watchdog", Duration::from_millis(100), move || {
346 format!("hit_count: {}", hit_count_clone.fetch_add(1, atomic::Ordering::Relaxed))
347 });
348 assert_eq!(0, hit_count.load(atomic::Ordering::Relaxed));
349 thread::sleep(Duration::from_millis(500));
350 assert_eq!(1, hit_count.load(atomic::Ordering::Relaxed));
351 thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT);
352 assert_eq!(2, hit_count.load(atomic::Ordering::Relaxed));
353 drop(wp);
354 thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(4).unwrap());
355 assert_eq!(2, hit_count.load(atomic::Ordering::Relaxed));
356 let (_, ref state) = *wd.state;
357 let state = state.lock().unwrap();
358 assert_eq!(state.state, State::NotRunning);
359 }
360}