Back off watchdog reports when threads are stuck
At times the underlying thread can get stuck and keep the watchdog
occupied for long periods of time. This can happen for a number of
reasons, and when it does we don't want to clog the logs with a report
every second. So we grow the interval between reports up to
MAX_REPORT_TIMEOUT, ensuring that stuck threads do not continuously
spam the logs.
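
With the 5 second increment and the 30 second cap, the gap between
consecutive reports for a stuck thread grows roughly as:

  1s -> 6s -> 11s -> 16s -> 21s -> 26s -> 30s -> 30s -> ...
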
Bug: 265778655
Test: atest libwatchdog_rs.test
Change-Id: Ib143ad75cdde80fbd8e3301e207aebe62ce2f018
diff --git a/keystore2/watchdog/src/lib.rs b/keystore2/watchdog/src/lib.rs
index 1ac3ef3..baed5ca 100644
--- a/keystore2/watchdog/src/lib.rs
+++ b/keystore2/watchdog/src/lib.rs
@@ -68,9 +68,26 @@
     idle_timeout: Duration,
     records: HashMap<Index, Record>,
     last_report: Option<Instant>,
+    noisy_timeout: Duration,
 }
 
 impl WatchdogState {
+    /// If we have overdue records, we want to log them but slowly back off
+    /// so that we do not clog the logs. We start with a report every
+    /// `MIN_REPORT_TIMEOUT` and then grow the interval by 5 seconds per
+    /// report, up to a maximum of `MAX_REPORT_TIMEOUT`.
+    const MIN_REPORT_TIMEOUT: Duration = Duration::from_secs(1);
+    const MAX_REPORT_TIMEOUT: Duration = Duration::from_secs(30);
+
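+    /// Restore the report interval to its initial, most frequent value.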
+    fn reset_noisy_timeout(&mut self) {
+        self.noisy_timeout = Self::MIN_REPORT_TIMEOUT;
+    }
+
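+    /// Grow the report interval by 5 seconds, capping it at `MAX_REPORT_TIMEOUT`.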
+    fn update_noisy_timeout(&mut self) {
+        let noisy_update = self.noisy_timeout + Duration::from_secs(5);
+        self.noisy_timeout = min(Self::MAX_REPORT_TIMEOUT, noisy_update);
+    }
+
     fn overdue_and_next_timeout(&self) -> (bool, Option<Duration>) {
         let now = Instant::now();
         let mut next_timeout: Option<Duration> = None;
@@ -100,12 +117,13 @@
         }
         // Something to report...
         if let Some(reported_at) = self.last_report {
-            if reported_at.elapsed() < Watchdog::NOISY_REPORT_TIMEOUT {
+            if reported_at.elapsed() < self.noisy_timeout {
                 // .. but it's too soon since the last report.
                 self.last_report = None;
                 return;
             }
         }
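+        // We are about to log a report, so stretch the interval before the next one.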
+        self.update_noisy_timeout();
         self.last_report = Some(Instant::now());
 
         log::warn!("### Keystore Watchdog report - BEGIN ###");
@@ -190,10 +208,6 @@
 }
 
 impl Watchdog {
-    /// If we have overdue records, we want to be noisy about it and log a report
-    /// at least every `NOISY_REPORT_TIMEOUT` interval.
-    const NOISY_REPORT_TIMEOUT: Duration = Duration::from_secs(1);
-
     /// Construct a [`Watchdog`]. When `idle_timeout` has elapsed since the watchdog thread became
     /// idle, i.e., there are no more active or overdue watch points, the watchdog thread
     /// terminates.
@@ -207,6 +221,7 @@
                     idle_timeout,
                     records: HashMap::new(),
                     last_report: None,
+                    noisy_timeout: WatchdogState::MIN_REPORT_TIMEOUT,
                 }),
             )),
         })
@@ -294,10 +309,8 @@
                 state.log_report(has_overdue);
 
                 let (next_timeout, idle) = match (has_overdue, next_timeout) {
-                    (true, Some(next_timeout)) => {
-                        (min(next_timeout, Self::NOISY_REPORT_TIMEOUT), false)
-                    }
-                    (true, None) => (Self::NOISY_REPORT_TIMEOUT, false),
+                    (true, Some(next_timeout)) => (min(next_timeout, state.noisy_timeout), false),
+                    (true, None) => (state.noisy_timeout, false),
                     (false, Some(next_timeout)) => (next_timeout, false),
                     (false, None) => (state.idle_timeout, true),
                 };
@@ -309,6 +322,7 @@
                 state = s;
 
                 if idle && timeout.timed_out() && state.records.is_empty() {
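+                    // Going idle: reset so the next incident is reported promptly again.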
+                    state.reset_noisy_timeout();
                     state.state = State::NotRunning;
                     break;
                 }
@@ -327,6 +341,21 @@
     use std::thread;
     use std::time::Duration;
 
+    /// Count the number of times `Debug::fmt` is invoked.
+    #[derive(Default, Clone)]
+    struct DebugCounter(Arc<atomic::AtomicU8>);
+    impl DebugCounter {
+        fn value(&self) -> u8 {
+            self.0.load(atomic::Ordering::Relaxed)
+        }
+    }
+    impl std::fmt::Debug for DebugCounter {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+            let count = self.0.fetch_add(1, atomic::Ordering::Relaxed);
+            write!(f, "hit_count: {count}")
+        }
+    }
+
     #[test]
     fn test_watchdog() {
         android_logger::init_once(
@@ -335,22 +364,7 @@
                 .with_max_level(log::LevelFilter::Debug),
         );
 
-        /// Count the number of times `Debug::fmt` is invoked.
-        #[derive(Default, Clone)]
-        struct DebugCounter(Arc<atomic::AtomicU8>);
-        impl DebugCounter {
-            fn value(&self) -> u8 {
-                self.0.load(atomic::Ordering::Relaxed)
-            }
-        }
-        impl std::fmt::Debug for DebugCounter {
-            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-                let count = self.0.fetch_add(1, atomic::Ordering::Relaxed);
-                write!(f, "hit_count: {count}")
-            }
-        }
-
-        let wd = Watchdog::new(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(3).unwrap());
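+        // A 3 second idle timeout: the watchdog thread exits 3s after the last watch point clears.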
+        let wd = Watchdog::new(Duration::from_secs(3));
         let hit_counter = DebugCounter::default();
         let wp = Watchdog::watch_with(
             &wd,
@@ -361,13 +375,43 @@
         assert_eq!(0, hit_counter.value());
         thread::sleep(Duration::from_millis(500));
         assert_eq!(1, hit_counter.value());
-        thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT);
-        assert_eq!(2, hit_counter.value());
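+        // With backoff, the next report is now several seconds out, so one more
+        // second of sleep must not produce a second report.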
+        thread::sleep(Duration::from_secs(1));
+        assert_eq!(1, hit_counter.value());
+
         drop(wp);
-        thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(4).unwrap());
-        assert_eq!(2, hit_counter.value());
+        thread::sleep(Duration::from_secs(10));
+        assert_eq!(1, hit_counter.value());
         let (_, ref state) = *wd.state;
         let state = state.lock().unwrap();
         assert_eq!(state.state, State::NotRunning);
     }
+
+    #[test]
+    fn test_watchdog_backoff() {
+        android_logger::init_once(
+            android_logger::Config::default()
+                .with_tag("keystore2_watchdog_tests")
+                .with_max_level(log::LevelFilter::Debug),
+        );
+
+        let wd = Watchdog::new(Duration::from_secs(3));
+        let hit_counter = DebugCounter::default();
+        let wp = Watchdog::watch_with(
+            &wd,
+            "test_watchdog",
+            Duration::from_millis(100),
+            hit_counter.clone(),
+        );
+        assert_eq!(0, hit_counter.value());
+        thread::sleep(Duration::from_millis(500));
+        assert_eq!(1, hit_counter.value());
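+        // The backoff stretches the interval to ~6s after the first report...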
+        thread::sleep(Duration::from_secs(6));
+        assert_eq!(2, hit_counter.value());
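+        // ...and to ~11s after the second.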
+        thread::sleep(Duration::from_secs(11));
+        assert_eq!(3, hit_counter.value());
+
+        drop(wp);
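+        // With the watch point dropped nothing is overdue, so no further reports fire.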
+        thread::sleep(Duration::from_secs(4));
+        assert_eq!(3, hit_counter.value());
+    }
 }