Hangup is detected in another thread Previously, early exit of a VM was not detected immediately by virtualizationservice because it was waiting for the payload_state to become Ready until timeout (10 secs) expires. This change fixes the problem by doing the hangup detection in another thread. When hangup is detected, the thread updates the payload_state to Hangup and kills crosvm. That unblocks the main thread which is waiting for the exit of crosvm. Bug: 238321974 Test: atest MicrodroidTestApp Change-Id: I32054f2684edccf22be1433eb003ebb9e0af7598

commit: a4eebdefe8820b0cf5d9dcd8d239198a1abca91b [log] [tgz]
author: Jiyong Park <jiyong@google.com> Tue Jul 12 18:01:12 2022 +0900
committer: Jiyong Park <jiyong@google.com> Thu Jul 14 16:06:33 2022 +0900
tree: 3893cdca0ab2b86b7f1c602dba47ecf380eeca16
parent: 9fab60536e68bf589664143972b3f56afdf7dfe2 [diff]
diff --git a/tests/testapk/src/java/com/android/microdroid/test/MicrodroidTests.java b/tests/testapk/src/java/com/android/microdroid/test/MicrodroidTests.java
index e7e4647..2873cf3 100644
--- a/tests/testapk/src/java/com/android/microdroid/test/MicrodroidTests.java
+++ b/tests/testapk/src/java/com/android/microdroid/test/MicrodroidTests.java

@@ -395,7 +395,11 @@
 
         flipBit(instanceFile, offset.getAsLong());
 
-        assertThat(tryBootVm(TAG, "test_vm_integrity").payloadStarted).isFalse();
+        BootResult result = tryBootVm(TAG, "test_vm_integrity");
+        assertThat(result.payloadStarted).isFalse();
+
+        // This failure should shut the VM down immediately and shouldn't trigger a hangup.
+        assertThat(result.deathReason).isNotEqualTo(DeathReason.HANGUP);
     }
 
     @Test

diff --git a/virtualizationservice/src/aidl.rs b/virtualizationservice/src/aidl.rs
index 0078736..e2e76d5 100644
--- a/virtualizationservice/src/aidl.rs
+++ b/virtualizationservice/src/aidl.rs

@@ -1007,6 +1007,7 @@
             PayloadState::Started => VirtualMachineState::STARTED,
             PayloadState::Ready => VirtualMachineState::READY,
             PayloadState::Finished => VirtualMachineState::FINISHED,
+            PayloadState::Hangup => VirtualMachineState::DEAD,
         },
         VmState::Dead => VirtualMachineState::DEAD,
         VmState::Failed => VirtualMachineState::DEAD,

diff --git a/virtualizationservice/src/crosvm.rs b/virtualizationservice/src/crosvm.rs
index b6e62fe..95c5c53 100644
--- a/virtualizationservice/src/crosvm.rs
+++ b/virtualizationservice/src/crosvm.rs

@@ -104,6 +104,7 @@
     Started,
     Ready,
     Finished,
+    Hangup, // Hasn't reached to Ready before timeout expires
 }
 
 /// The current state of the VM itself.
@@ -139,10 +140,18 @@
             let child = Arc::new(run_vm(config, failure_pipe_write)?);
 
             let child_clone = child.clone();
+            let instance_clone = instance.clone();
             thread::spawn(move || {
-                instance.monitor(child_clone, failure_pipe_read, detect_hangup);
+                instance_clone.monitor_vm_exit(child_clone, failure_pipe_read);
             });
 
+            if detect_hangup {
+                let child_clone = child.clone();
+                thread::spawn(move || {
+                    instance.monitor_payload_hangup(child_clone);
+                });
+            }
+
             // If it started correctly, update the state.
             *self = VmState::Running { child };
             Ok(())
@@ -179,8 +188,8 @@
     pub vm_service: Mutex<Option<Strong<dyn IVirtualMachineService>>>,
     /// The latest lifecycle state which the payload reported itself to be in.
     payload_state: Mutex<PayloadState>,
-    /// Represents the condition that payload_state becomes Started
-    payload_started: Condvar,
+    /// Represents the condition that payload_state was updated
+    payload_state_updated: Condvar,
 }
 
 impl VmInstance {
@@ -207,7 +216,7 @@
             stream: Mutex::new(None),
             vm_service: Mutex::new(None),
             payload_state: Mutex::new(PayloadState::Starting),
-            payload_started: Condvar::new(),
+            payload_state_updated: Condvar::new(),
         })
     }
 
@@ -217,41 +226,10 @@
         self.vm_state.lock().unwrap().start(self.clone())
     }
 
-    /// Waits for the crosvm child process to finish, then marks the VM as no longer running and
-    /// calls any callbacks. If `detect_hangup` is optionally set to true, waits for the start of
-    /// payload in the crosvm process. If that doesn't occur within a BOOT_HANGUP_TIMEOUT, declare
-    /// it as a hangup and forcibly kill the process.
-    ///
-    /// This takes a separate reference to the `SharedChild` rather than using the one in
-    /// `self.vm_state` to avoid holding the lock on `vm_state` while it is running.
-    fn monitor(&self, child: Arc<SharedChild>, mut failure_pipe_read: File, detect_hangup: bool) {
-        let hungup = if detect_hangup {
-            // Wait until payload is started or the crosvm process terminates. The checking of the
-            // child process is needed because otherwise we will be waiting for a condition that
-            // will never be satisfied (because crosvm is the one who can make the condition true).
-            let state = self.payload_state.lock().unwrap();
-            let (_, result) = self
-                .payload_started
-                .wait_timeout_while(state, *BOOT_HANGUP_TIMEOUT, |state| {
-                    *state < PayloadState::Started && child.try_wait().is_ok()
-                })
-                .unwrap();
-            if result.timed_out() {
-                error!(
-                    "Microdroid failed to start payload within {} secs timeout. Shutting down",
-                    BOOT_HANGUP_TIMEOUT.as_secs()
-                );
-                if let Err(e) = self.kill() {
-                    error!("Error stopping timed-out VM with CID {}: {:?}", self.cid, e);
-                }
-                true
-            } else {
-                false
-            }
-        } else {
-            false
-        };
-
+    /// Monitors the exit of the VM (i.e. termination of the `child` process). When that happens,
+    /// handles the event by updating the state, noityfing the event to clients by calling
+    /// callbacks, and removing temporary files for the VM.
+    fn monitor_vm_exit(&self, child: Arc<SharedChild>, mut failure_pipe_read: File) {
         let result = child.wait();
         match &result {
             Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
@@ -263,20 +241,26 @@
         // Ensure that the mutex is released before calling the callbacks.
         drop(vm_state);
 
-        let failure_string = if hungup {
-            Cow::from("HANGUP")
-        } else {
-            let mut s = String::new();
-            match failure_pipe_read.read_to_string(&mut s) {
-                Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
-                Ok(len) if len > 0 => info!("VM returned failure reason '{}'", &s),
-                _ => (),
-            };
-            Cow::from(s)
+        // Read the pipe to see if any failure reason is written
+        let mut failure_reason = String::new();
+        match failure_pipe_read.read_to_string(&mut failure_reason) {
+            Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
+            Ok(len) if len > 0 => info!("VM returned failure reason '{}'", &failure_reason),
+            _ => (),
         };
 
+        // In case of hangup, the pipe doesn't give us any information because the hangup can't be
+        // detected on the VM side (otherwise, it isn't a hangup), but in the
+        // monitor_payload_hangup function below which updates the payload state to Hangup.
+        let failure_reason =
+            if failure_reason.is_empty() && self.payload_state() == PayloadState::Hangup {
+                Cow::from("HANGUP")
+            } else {
+                Cow::from(failure_reason)
+            };
+
         self.handle_ramdump().unwrap_or_else(|e| error!("Error handling ramdump: {}", e));
-        self.callbacks.callback_on_died(self.cid, death_reason(&result, &failure_string));
+        self.callbacks.callback_on_died(self.cid, death_reason(&result, &failure_reason));
 
         // Delete temporary files.
         if let Err(e) = remove_dir_all(&self.temporary_directory) {
@@ -284,6 +268,30 @@
         }
     }
 
+    /// Waits until payload is started, or timeout expires. When timeout occurs, kill
+    /// the VM to prevent indefinite hangup and update the payload_state accordingly.
+    fn monitor_payload_hangup(&self, child: Arc<SharedChild>) {
+        debug!("Starting to monitor hangup for Microdroid({})", child.id());
+        let (_, result) = self
+            .payload_state_updated
+            .wait_timeout_while(self.payload_state.lock().unwrap(), *BOOT_HANGUP_TIMEOUT, |s| {
+                *s < PayloadState::Started
+            })
+            .unwrap();
+        let child_still_running = child.try_wait().ok() == Some(None);
+        if result.timed_out() && child_still_running {
+            error!(
+                "Microdroid({}) failed to start payload within {} secs timeout. Shutting down.",
+                child.id(),
+                BOOT_HANGUP_TIMEOUT.as_secs()
+            );
+            self.update_payload_state(PayloadState::Hangup).unwrap();
+            if let Err(e) = self.kill() {
+                error!("Error stopping timed-out VM with CID {}: {:?}", child.id(), e);
+            }
+        }
+    }
+
     /// Returns the last reported state of the VM payload.
     pub fn payload_state(&self) -> PayloadState {
         *self.payload_state.lock().unwrap()
@@ -296,9 +304,7 @@
         // the other direction.
         if new_state > *state_locked {
             *state_locked = new_state;
-            if new_state >= PayloadState::Started {
-                self.payload_started.notify_all();
-            }
+            self.payload_state_updated.notify_all();
             Ok(())
         } else {
             bail!("Invalid payload state transition from {:?} to {:?}", *state_locked, new_state)
commit	a4eebdefe8820b0cf5d9dcd8d239198a1abca91b	[log] [tgz]
author	Jiyong Park <jiyong@google.com>	Tue Jul 12 18:01:12 2022 +0900
committer	Jiyong Park <jiyong@google.com>	Thu Jul 14 16:06:33 2022 +0900
tree	3893cdca0ab2b86b7f1c602dba47ecf380eeca16
parent	9fab60536e68bf589664143972b3f56afdf7dfe2 [diff]