virtualizationservice: Fix race in monitor_vm_exit monitor_vm_exit() runs in a detached thread, waiting for an instance of crosvm to exit. When it does, it cleans up after it and submits telemetry to statsd. However, there is currently no guarantee that the thread will get to complete its task. If exit() is called, eg. because there are no remaining clients of the service, the thread is interrupted and terminated together with the rest of the process. Whether it completes its job or not therefore can depend on the delay between the last VirtualMachine reference is dropped and LazyServiceGuard terminating the process. Fix this race by joining the monitor_vm_exit as the VirtualMachine object is dropped. This prevents the process from exiting until the thread has completed. Bug: 245727626 Test: atest -p packages/modules/Virtualization:avf-presubmit Change-Id: Iefbd4181a8ca95bbb3a4fed0f85d9929d081ff73

commit: f18e116c007d054f83ca5bf261c48696556f329a [log] [tgz]
author: David Brazdil <dbrazdil@google.com> Sun Dec 18 18:27:23 2022 +0000
committer: David Brazdil <dbrazdil@google.com> Sun Dec 18 23:13:44 2022 +0000
tree: c7df0e74582724fdac39f0b62c875f64adba9df4
parent: 146bfc12f94aaf87ea8a01e00aad49404a453fbe [diff]
diff --git a/virtualizationservice/src/crosvm.rs b/virtualizationservice/src/crosvm.rs
index 5125f19..2c6e370 100644
--- a/virtualizationservice/src/crosvm.rs
+++ b/virtualizationservice/src/crosvm.rs

@@ -39,7 +39,7 @@
 use std::process::{Command, ExitStatus};
 use std::sync::{Arc, Condvar, Mutex};
 use std::time::{Duration, SystemTime};
-use std::thread;
+use std::thread::{self, JoinHandle};
 use android_system_virtualizationservice::aidl::android::system::virtualizationservice::{
     DeathReason::DeathReason,
     MemoryTrimLevel::MemoryTrimLevel,
@@ -140,6 +140,8 @@
     Running {
         /// The crosvm child process.
         child: Arc<SharedChild>,
+        /// The thread waiting for crosvm to finish.
+        monitor_vm_exit_thread: Option<JoinHandle<()>>,
     },
     /// The VM died or was killed.
     Dead,
@@ -187,9 +189,9 @@
 
             let child_clone = child.clone();
             let instance_clone = instance.clone();
-            thread::spawn(move || {
+            let monitor_vm_exit_thread = Some(thread::spawn(move || {
                 instance_clone.monitor_vm_exit(child_clone, failure_pipe_read);
-            });
+            }));
 
             if detect_hangup {
                 let child_clone = child.clone();
@@ -199,7 +201,7 @@
             }
 
             // If it started correctly, update the state.
-            *self = VmState::Running { child };
+            *self = VmState::Running { child, monitor_vm_exit_thread };
             Ok(())
         } else {
             *self = state;
@@ -465,19 +467,24 @@
 
     /// Kills the crosvm instance, if it is running.
     pub fn kill(&self) -> Result<(), Error> {
-        let vm_state = &*self.vm_state.lock().unwrap();
-        if let VmState::Running { child } = vm_state {
-            let id = child.id();
-            debug!("Killing crosvm({})", id);
-            // TODO: Talk to crosvm to shutdown cleanly.
-            if let Err(e) = child.kill() {
-                bail!("Error killing crosvm({}) instance: {}", id, e);
+        let monitor_vm_exit_thread = {
+            let vm_state = &mut *self.vm_state.lock().unwrap();
+            if let VmState::Running { child, monitor_vm_exit_thread } = vm_state {
+                let id = child.id();
+                debug!("Killing crosvm({})", id);
+                // TODO: Talk to crosvm to shutdown cleanly.
+                child.kill().with_context(|| format!("Error killing crosvm({id}) instance"))?;
+                monitor_vm_exit_thread.take()
             } else {
-                Ok(())
+                bail!("VM is not running")
             }
-        } else {
-            bail!("VM is not running")
-        }
+        };
+
+        // Wait for monitor_vm_exit() to finish. Must release vm_state lock
+        // first, as monitor_vm_exit() takes it as well.
+        monitor_vm_exit_thread.map(JoinHandle::join);
+
+        Ok(())
     }
 
     /// Responds to memory-trimming notifications by inflating the virtio
commit	f18e116c007d054f83ca5bf261c48696556f329a	[log] [tgz]
author	David Brazdil <dbrazdil@google.com>	Sun Dec 18 18:27:23 2022 +0000
committer	David Brazdil <dbrazdil@google.com>	Sun Dec 18 23:13:44 2022 +0000
tree	c7df0e74582724fdac39f0b62c875f64adba9df4
parent	146bfc12f94aaf87ea8a01e00aad49404a453fbe [diff]