Merge changes from topic "microdroid_tombstone_flaky" * changes: Add tombstone upon crash test Make sure tombstone is transmitted before VM exit

commit: efe378b394c04b618bb792546e31f620bfc2cfe4 [log] [tgz]
author: Inseob Kim <inseob@google.com> Thu Jul 14 16:09:23 2022 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Thu Jul 14 16:09:23 2022 +0000
tree: 2db5d167300ddd3985aced4a9ee3faa7f0056096
parent: a4eebdefe8820b0cf5d9dcd8d239198a1abca91b [diff]
parent: 3fe420cec3a5d247ca2c2fa6f9a677c183b84424 [diff]
diff --git a/tests/testapk/src/java/com/android/microdroid/test/MicrodroidTests.java b/tests/testapk/src/java/com/android/microdroid/test/MicrodroidTests.java
index e7e4647..2873cf3 100644
--- a/tests/testapk/src/java/com/android/microdroid/test/MicrodroidTests.java
+++ b/tests/testapk/src/java/com/android/microdroid/test/MicrodroidTests.java

@@ -395,7 +395,11 @@
 
         flipBit(instanceFile, offset.getAsLong());
 
-        assertThat(tryBootVm(TAG, "test_vm_integrity").payloadStarted).isFalse();
+        BootResult result = tryBootVm(TAG, "test_vm_integrity");
+        assertThat(result.payloadStarted).isFalse();
+
+        // This failure should shut the VM down immediately and shouldn't trigger a hangup.
+        assertThat(result.deathReason).isNotEqualTo(DeathReason.HANGUP);
     }
 
     @Test

diff --git a/virtualizationservice/src/aidl.rs b/virtualizationservice/src/aidl.rs
index 0078736..e2e76d5 100644
--- a/virtualizationservice/src/aidl.rs
+++ b/virtualizationservice/src/aidl.rs

@@ -1007,6 +1007,7 @@
             PayloadState::Started => VirtualMachineState::STARTED,
             PayloadState::Ready => VirtualMachineState::READY,
             PayloadState::Finished => VirtualMachineState::FINISHED,
+            PayloadState::Hangup => VirtualMachineState::DEAD,
         },
         VmState::Dead => VirtualMachineState::DEAD,
         VmState::Failed => VirtualMachineState::DEAD,

diff --git a/virtualizationservice/src/crosvm.rs b/virtualizationservice/src/crosvm.rs
index b6e62fe..95c5c53 100644
--- a/virtualizationservice/src/crosvm.rs
+++ b/virtualizationservice/src/crosvm.rs

@@ -104,6 +104,7 @@
     Started,
     Ready,
     Finished,
+    Hangup, // Hasn't reached to Ready before timeout expires
 }
 
 /// The current state of the VM itself.
@@ -139,10 +140,18 @@
             let child = Arc::new(run_vm(config, failure_pipe_write)?);
 
             let child_clone = child.clone();
+            let instance_clone = instance.clone();
             thread::spawn(move || {
-                instance.monitor(child_clone, failure_pipe_read, detect_hangup);
+                instance_clone.monitor_vm_exit(child_clone, failure_pipe_read);
             });
 
+            if detect_hangup {
+                let child_clone = child.clone();
+                thread::spawn(move || {
+                    instance.monitor_payload_hangup(child_clone);
+                });
+            }
+
             // If it started correctly, update the state.
             *self = VmState::Running { child };
             Ok(())
@@ -179,8 +188,8 @@
     pub vm_service: Mutex<Option<Strong<dyn IVirtualMachineService>>>,
     /// The latest lifecycle state which the payload reported itself to be in.
     payload_state: Mutex<PayloadState>,
-    /// Represents the condition that payload_state becomes Started
-    payload_started: Condvar,
+    /// Represents the condition that payload_state was updated
+    payload_state_updated: Condvar,
 }
 
 impl VmInstance {
@@ -207,7 +216,7 @@
             stream: Mutex::new(None),
             vm_service: Mutex::new(None),
             payload_state: Mutex::new(PayloadState::Starting),
-            payload_started: Condvar::new(),
+            payload_state_updated: Condvar::new(),
         })
     }
 
@@ -217,41 +226,10 @@
         self.vm_state.lock().unwrap().start(self.clone())
     }
 
-    /// Waits for the crosvm child process to finish, then marks the VM as no longer running and
-    /// calls any callbacks. If `detect_hangup` is optionally set to true, waits for the start of
-    /// payload in the crosvm process. If that doesn't occur within a BOOT_HANGUP_TIMEOUT, declare
-    /// it as a hangup and forcibly kill the process.
-    ///
-    /// This takes a separate reference to the `SharedChild` rather than using the one in
-    /// `self.vm_state` to avoid holding the lock on `vm_state` while it is running.
-    fn monitor(&self, child: Arc<SharedChild>, mut failure_pipe_read: File, detect_hangup: bool) {
-        let hungup = if detect_hangup {
-            // Wait until payload is started or the crosvm process terminates. The checking of the
-            // child process is needed because otherwise we will be waiting for a condition that
-            // will never be satisfied (because crosvm is the one who can make the condition true).
-            let state = self.payload_state.lock().unwrap();
-            let (_, result) = self
-                .payload_started
-                .wait_timeout_while(state, *BOOT_HANGUP_TIMEOUT, |state| {
-                    *state < PayloadState::Started && child.try_wait().is_ok()
-                })
-                .unwrap();
-            if result.timed_out() {
-                error!(
-                    "Microdroid failed to start payload within {} secs timeout. Shutting down",
-                    BOOT_HANGUP_TIMEOUT.as_secs()
-                );
-                if let Err(e) = self.kill() {
-                    error!("Error stopping timed-out VM with CID {}: {:?}", self.cid, e);
-                }
-                true
-            } else {
-                false
-            }
-        } else {
-            false
-        };
-
+    /// Monitors the exit of the VM (i.e. termination of the `child` process). When that happens,
+    /// handles the event by updating the state, noityfing the event to clients by calling
+    /// callbacks, and removing temporary files for the VM.
+    fn monitor_vm_exit(&self, child: Arc<SharedChild>, mut failure_pipe_read: File) {
         let result = child.wait();
         match &result {
             Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
@@ -263,20 +241,26 @@
         // Ensure that the mutex is released before calling the callbacks.
         drop(vm_state);
 
-        let failure_string = if hungup {
-            Cow::from("HANGUP")
-        } else {
-            let mut s = String::new();
-            match failure_pipe_read.read_to_string(&mut s) {
-                Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
-                Ok(len) if len > 0 => info!("VM returned failure reason '{}'", &s),
-                _ => (),
-            };
-            Cow::from(s)
+        // Read the pipe to see if any failure reason is written
+        let mut failure_reason = String::new();
+        match failure_pipe_read.read_to_string(&mut failure_reason) {
+            Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
+            Ok(len) if len > 0 => info!("VM returned failure reason '{}'", &failure_reason),
+            _ => (),
         };
 
+        // In case of hangup, the pipe doesn't give us any information because the hangup can't be
+        // detected on the VM side (otherwise, it isn't a hangup), but in the
+        // monitor_payload_hangup function below which updates the payload state to Hangup.
+        let failure_reason =
+            if failure_reason.is_empty() && self.payload_state() == PayloadState::Hangup {
+                Cow::from("HANGUP")
+            } else {
+                Cow::from(failure_reason)
+            };
+
         self.handle_ramdump().unwrap_or_else(|e| error!("Error handling ramdump: {}", e));
-        self.callbacks.callback_on_died(self.cid, death_reason(&result, &failure_string));
+        self.callbacks.callback_on_died(self.cid, death_reason(&result, &failure_reason));
 
         // Delete temporary files.
         if let Err(e) = remove_dir_all(&self.temporary_directory) {
@@ -284,6 +268,30 @@
         }
     }
 
+    /// Waits until payload is started, or timeout expires. When timeout occurs, kill
+    /// the VM to prevent indefinite hangup and update the payload_state accordingly.
+    fn monitor_payload_hangup(&self, child: Arc<SharedChild>) {
+        debug!("Starting to monitor hangup for Microdroid({})", child.id());
+        let (_, result) = self
+            .payload_state_updated
+            .wait_timeout_while(self.payload_state.lock().unwrap(), *BOOT_HANGUP_TIMEOUT, |s| {
+                *s < PayloadState::Started
+            })
+            .unwrap();
+        let child_still_running = child.try_wait().ok() == Some(None);
+        if result.timed_out() && child_still_running {
+            error!(
+                "Microdroid({}) failed to start payload within {} secs timeout. Shutting down.",
+                child.id(),
+                BOOT_HANGUP_TIMEOUT.as_secs()
+            );
+            self.update_payload_state(PayloadState::Hangup).unwrap();
+            if let Err(e) = self.kill() {
+                error!("Error stopping timed-out VM with CID {}: {:?}", child.id(), e);
+            }
+        }
+    }
+
     /// Returns the last reported state of the VM payload.
     pub fn payload_state(&self) -> PayloadState {
         *self.payload_state.lock().unwrap()
@@ -296,9 +304,7 @@
         // the other direction.
         if new_state > *state_locked {
             *state_locked = new_state;
-            if new_state >= PayloadState::Started {
-                self.payload_started.notify_all();
-            }
+            self.payload_state_updated.notify_all();
             Ok(())
         } else {
             bail!("Invalid payload state transition from {:?} to {:?}", *state_locked, new_state)
commit	efe378b394c04b618bb792546e31f620bfc2cfe4	[log] [tgz]
author	Inseob Kim <inseob@google.com>	Thu Jul 14 16:09:23 2022 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Thu Jul 14 16:09:23 2022 +0000
tree	2db5d167300ddd3985aced4a9ee3faa7f0056096
parent	a4eebdefe8820b0cf5d9dcd8d239198a1abca91b [diff]
parent	3fe420cec3a5d247ca2c2fa6f9a677c183b84424 [diff]