Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 1 | // Copyright 2021, The Android Open Source Project |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | // Can be removed when instrumentations are added to keystore. |
| 16 | #![allow(dead_code)] |
| 17 | |
| 18 | //! This module implements a watchdog thread. |
| 19 | |
| 20 | use std::{ |
| 21 | cmp::min, |
| 22 | collections::HashMap, |
| 23 | sync::Arc, |
| 24 | sync::{Condvar, Mutex, MutexGuard}, |
| 25 | thread, |
| 26 | }; |
| 27 | use std::{ |
| 28 | marker::PhantomData, |
| 29 | time::{Duration, Instant}, |
| 30 | }; |
| 31 | |
| 32 | /// Represents a Watchdog record. It can be created with `Watchdog::watch` or |
| 33 | /// `Watchdog::watch_with`. It disarms the record when dropped. |
| 34 | pub struct WatchPoint { |
| 35 | id: &'static str, |
| 36 | wd: Arc<Watchdog>, |
| 37 | not_send: PhantomData<*mut ()>, // WatchPoint must not be Send. |
| 38 | } |
| 39 | |
| 40 | impl Drop for WatchPoint { |
| 41 | fn drop(&mut self) { |
| 42 | self.wd.disarm(self.id) |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | #[derive(Debug, PartialEq, Eq)] |
| 47 | enum State { |
| 48 | NotRunning, |
| 49 | Running, |
| 50 | } |
| 51 | |
| 52 | #[derive(Debug, Clone, Hash, PartialEq, Eq)] |
| 53 | struct Index { |
| 54 | tid: thread::ThreadId, |
| 55 | id: &'static str, |
| 56 | } |
| 57 | |
| 58 | struct Record { |
| 59 | started: Instant, |
| 60 | deadline: Instant, |
| 61 | callback: Option<Box<dyn Fn() -> String + Send + 'static>>, |
| 62 | } |
| 63 | |
| 64 | struct WatchdogState { |
| 65 | state: State, |
| 66 | thread: Option<thread::JoinHandle<()>>, |
| 67 | timeout: Duration, |
| 68 | records: HashMap<Index, Record>, |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 69 | last_report: Instant, |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 70 | has_overdue: bool, |
| 71 | } |
| 72 | |
| 73 | impl WatchdogState { |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 74 | fn update_overdue_and_find_next_timeout(&mut self) -> (bool, Option<Duration>) { |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 75 | let now = Instant::now(); |
| 76 | let mut next_timeout: Option<Duration> = None; |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 77 | let mut has_overdue = false; |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 78 | for (_, r) in self.records.iter() { |
| 79 | let timeout = r.deadline.saturating_duration_since(now); |
| 80 | if timeout == Duration::new(0, 0) { |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 81 | has_overdue = true; |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 82 | continue; |
| 83 | } |
| 84 | next_timeout = match next_timeout { |
| 85 | Some(nt) => { |
| 86 | if timeout < nt { |
| 87 | Some(timeout) |
| 88 | } else { |
| 89 | Some(nt) |
| 90 | } |
| 91 | } |
| 92 | None => Some(timeout), |
| 93 | }; |
| 94 | } |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 95 | (has_overdue, next_timeout) |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 96 | } |
| 97 | |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 98 | fn log_report(&mut self, has_overdue: bool) -> bool { |
| 99 | match (self.has_overdue, has_overdue) { |
| 100 | (true, true) => { |
| 101 | if self.last_report.elapsed() < Watchdog::NOISY_REPORT_TIMEOUT { |
| 102 | self.has_overdue = false; |
| 103 | return false; |
| 104 | } |
| 105 | } |
| 106 | (_, false) => { |
| 107 | self.has_overdue = false; |
| 108 | return false; |
| 109 | } |
| 110 | (false, true) => {} |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 111 | } |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 112 | self.last_report = Instant::now(); |
| 113 | self.has_overdue = has_overdue; |
Janis Danisevskis | 9bdc430 | 2022-01-31 14:23:12 -0800 | [diff] [blame] | 114 | log::warn!("### Keystore Watchdog report - BEGIN ###"); |
| 115 | |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 116 | let now = Instant::now(); |
Janis Danisevskis | 9bdc430 | 2022-01-31 14:23:12 -0800 | [diff] [blame] | 117 | let mut overdue_records: Vec<(&Index, &Record)> = self |
| 118 | .records |
| 119 | .iter() |
| 120 | .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0)) |
| 121 | .collect(); |
| 122 | |
| 123 | log::warn!("When extracting from a bug report, please include this header"); |
| 124 | log::warn!("and all {} records below.", overdue_records.len()); |
| 125 | |
| 126 | // Watch points can be nested, i.e., a single thread may have multiple armed |
| 127 | // watch points. And the most recent on each thread (thread recent) is closest to the point |
| 128 | // where something is blocked. Furthermore, keystore2 has various critical section |
| 129 | // and common backend resources KeyMint that can only be entered serialized. So if one |
| 130 | // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch |
| 131 | // point is most likely pointing toward the culprit. |
| 132 | // Thus, sort by start time first. |
| 133 | overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started)); |
| 134 | // Then we groups all of the watch points per thread preserving the order within |
| 135 | // groups. |
| 136 | let groups = overdue_records.iter().fold( |
| 137 | HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(), |
| 138 | |mut acc, (i, r)| { |
| 139 | acc.entry(i.tid).or_default().push((i, r)); |
| 140 | acc |
| 141 | }, |
| 142 | ); |
| 143 | // Put the groups back into a vector. |
| 144 | let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_iter().map(|(_, v)| v).collect(); |
| 145 | // Sort the groups by start time of the most recent (.last()) of each group. |
| 146 | // It is panic safe to use unwrap() here because we never add empty vectors to |
| 147 | // the map. |
| 148 | groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started)); |
| 149 | |
| 150 | for g in groups.iter() { |
| 151 | for (i, r) in g.iter() { |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 152 | match &r.callback { |
| 153 | Some(cb) => { |
| 154 | log::warn!( |
| 155 | "{:?} {} Pending: {:?} Overdue {:?}: {}", |
| 156 | i.tid, |
| 157 | i.id, |
| 158 | r.started.elapsed(), |
| 159 | r.deadline.elapsed(), |
| 160 | (cb)() |
| 161 | ); |
| 162 | } |
| 163 | None => { |
| 164 | log::warn!( |
| 165 | "{:?} {} Pending: {:?} Overdue {:?}", |
| 166 | i.tid, |
| 167 | i.id, |
| 168 | r.started.elapsed(), |
| 169 | r.deadline.elapsed() |
| 170 | ); |
| 171 | } |
| 172 | } |
| 173 | } |
| 174 | } |
Janis Danisevskis | 9bdc430 | 2022-01-31 14:23:12 -0800 | [diff] [blame] | 175 | log::warn!("### Keystore Watchdog report - END ###"); |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 176 | true |
| 177 | } |
| 178 | |
| 179 | fn disarm(&mut self, index: Index) { |
| 180 | self.records.remove(&index); |
| 181 | } |
| 182 | |
| 183 | fn arm(&mut self, index: Index, record: Record) { |
| 184 | if self.records.insert(index.clone(), record).is_some() { |
| 185 | log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index); |
| 186 | } |
| 187 | } |
| 188 | } |
| 189 | |
| 190 | /// Watchdog spawns a thread that logs records of all overdue watch points when a deadline |
| 191 | /// is missed and at least every second as long as overdue watch points exist. |
| 192 | /// The thread terminates when idle for a given period of time. |
| 193 | pub struct Watchdog { |
| 194 | state: Arc<(Condvar, Mutex<WatchdogState>)>, |
| 195 | } |
| 196 | |
| 197 | impl Watchdog { |
| 198 | /// If we have overdue records, we want to be noisy about it and log a report |
| 199 | /// at least every `NOISY_REPORT_TIMEOUT` interval. |
| 200 | const NOISY_REPORT_TIMEOUT: Duration = Duration::from_secs(1); |
| 201 | |
| 202 | /// Construct a [`Watchdog`]. When `timeout` has elapsed since the watchdog thread became |
| 203 | /// idle, i.e., there are no more active or overdue watch points, the watchdog thread |
| 204 | /// terminates. |
| 205 | pub fn new(timeout: Duration) -> Arc<Self> { |
| 206 | Arc::new(Self { |
| 207 | state: Arc::new(( |
| 208 | Condvar::new(), |
| 209 | Mutex::new(WatchdogState { |
| 210 | state: State::NotRunning, |
| 211 | thread: None, |
| 212 | timeout, |
| 213 | records: HashMap::new(), |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 214 | last_report: Instant::now(), |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 215 | has_overdue: false, |
| 216 | }), |
| 217 | )), |
| 218 | }) |
| 219 | } |
| 220 | |
| 221 | fn watch_with_optional( |
| 222 | wd: &Arc<Self>, |
| 223 | callback: Option<Box<dyn Fn() -> String + Send + 'static>>, |
| 224 | id: &'static str, |
| 225 | timeout: Duration, |
| 226 | ) -> Option<WatchPoint> { |
| 227 | let deadline = Instant::now().checked_add(timeout); |
| 228 | if deadline.is_none() { |
| 229 | log::warn!("Deadline computation failed for WatchPoint \"{}\"", id); |
| 230 | log::warn!("WatchPoint not armed."); |
| 231 | return None; |
| 232 | } |
| 233 | wd.arm(callback, id, deadline.unwrap()); |
| 234 | Some(WatchPoint { id, wd: wd.clone(), not_send: Default::default() }) |
| 235 | } |
| 236 | |
| 237 | /// Create a new watch point. If the WatchPoint is not dropped before the timeout |
| 238 | /// expires, a report is logged at least every second, which includes the id string |
| 239 | /// and whatever string the callback returns. |
| 240 | pub fn watch_with( |
| 241 | wd: &Arc<Self>, |
| 242 | id: &'static str, |
| 243 | timeout: Duration, |
| 244 | callback: impl Fn() -> String + Send + 'static, |
| 245 | ) -> Option<WatchPoint> { |
| 246 | Self::watch_with_optional(wd, Some(Box::new(callback)), id, timeout) |
| 247 | } |
| 248 | |
| 249 | /// Like `watch_with`, but without a callback. |
| 250 | pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> { |
| 251 | Self::watch_with_optional(wd, None, id, timeout) |
| 252 | } |
| 253 | |
| 254 | fn arm( |
| 255 | &self, |
| 256 | callback: Option<Box<dyn Fn() -> String + Send + 'static>>, |
| 257 | id: &'static str, |
| 258 | deadline: Instant, |
| 259 | ) { |
| 260 | let tid = thread::current().id(); |
| 261 | let index = Index { tid, id }; |
| 262 | let record = Record { started: Instant::now(), deadline, callback }; |
| 263 | |
| 264 | let (ref condvar, ref state) = *self.state; |
| 265 | |
| 266 | let mut state = state.lock().unwrap(); |
| 267 | state.arm(index, record); |
| 268 | |
| 269 | if state.state != State::Running { |
| 270 | self.spawn_thread(&mut state); |
| 271 | } |
| 272 | drop(state); |
| 273 | condvar.notify_all(); |
| 274 | } |
| 275 | |
| 276 | fn disarm(&self, id: &'static str) { |
| 277 | let tid = thread::current().id(); |
| 278 | let index = Index { tid, id }; |
| 279 | let (_, ref state) = *self.state; |
| 280 | |
| 281 | let mut state = state.lock().unwrap(); |
| 282 | state.disarm(index); |
| 283 | // There is no need to notify condvar. There is no action required for the |
| 284 | // watchdog thread before the next deadline. |
| 285 | } |
| 286 | |
| 287 | fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) { |
| 288 | if let Some(t) = state.thread.take() { |
| 289 | t.join().expect("Watchdog thread panicked."); |
| 290 | } |
| 291 | |
| 292 | let cloned_state = self.state.clone(); |
| 293 | |
| 294 | state.thread = Some(thread::spawn(move || { |
| 295 | let (ref condvar, ref state) = *cloned_state; |
| 296 | |
| 297 | let mut state = state.lock().unwrap(); |
| 298 | |
| 299 | loop { |
Janis Danisevskis | d1d9917 | 2021-05-06 08:21:43 -0700 | [diff] [blame] | 300 | let (has_overdue, next_timeout) = state.update_overdue_and_find_next_timeout(); |
| 301 | state.log_report(has_overdue); |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 302 | let (next_timeout, idle) = match (has_overdue, next_timeout) { |
| 303 | (true, Some(next_timeout)) => { |
| 304 | (min(next_timeout, Self::NOISY_REPORT_TIMEOUT), false) |
| 305 | } |
| 306 | (false, Some(next_timeout)) => (next_timeout, false), |
| 307 | (true, None) => (Self::NOISY_REPORT_TIMEOUT, false), |
| 308 | (false, None) => (state.timeout, true), |
| 309 | }; |
| 310 | |
| 311 | let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap(); |
| 312 | state = s; |
| 313 | |
| 314 | if idle && timeout.timed_out() && state.records.is_empty() { |
| 315 | state.state = State::NotRunning; |
| 316 | break; |
| 317 | } |
| 318 | } |
Janis Danisevskis | 2ee014b | 2021-05-05 14:29:08 -0700 | [diff] [blame] | 319 | log::info!("Watchdog thread idle -> terminating. Have a great day."); |
Janis Danisevskis | 7e13aa0 | 2021-05-04 14:34:41 -0700 | [diff] [blame] | 320 | })); |
| 321 | state.state = State::Running; |
| 322 | } |
| 323 | } |
| 324 | |
| 325 | #[cfg(test)] |
| 326 | mod tests { |
| 327 | |
| 328 | use super::*; |
| 329 | use std::sync::atomic; |
| 330 | use std::thread; |
| 331 | use std::time::Duration; |
| 332 | |
| 333 | #[test] |
| 334 | fn test_watchdog() { |
| 335 | android_logger::init_once( |
| 336 | android_logger::Config::default() |
| 337 | .with_tag("keystore2_watchdog_tests") |
| 338 | .with_min_level(log::Level::Debug), |
| 339 | ); |
| 340 | |
| 341 | let wd = Watchdog::new(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(3).unwrap()); |
| 342 | let hit_count = Arc::new(atomic::AtomicU8::new(0)); |
| 343 | let hit_count_clone = hit_count.clone(); |
| 344 | let wp = |
| 345 | Watchdog::watch_with(&wd, "test_watchdog", Duration::from_millis(100), move || { |
| 346 | format!("hit_count: {}", hit_count_clone.fetch_add(1, atomic::Ordering::Relaxed)) |
| 347 | }); |
| 348 | assert_eq!(0, hit_count.load(atomic::Ordering::Relaxed)); |
| 349 | thread::sleep(Duration::from_millis(500)); |
| 350 | assert_eq!(1, hit_count.load(atomic::Ordering::Relaxed)); |
| 351 | thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT); |
| 352 | assert_eq!(2, hit_count.load(atomic::Ordering::Relaxed)); |
| 353 | drop(wp); |
| 354 | thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(4).unwrap()); |
| 355 | assert_eq!(2, hit_count.load(atomic::Ordering::Relaxed)); |
| 356 | let (_, ref state) = *wd.state; |
| 357 | let state = state.lock().unwrap(); |
| 358 | assert_eq!(state.state, State::NotRunning); |
| 359 | } |
| 360 | } |