Wait for crosvm in a separate thread, and keep track of when it dies.

This lets us tell whether a VM is still running or has finished. Also
add callback so clients can get notified when crosvm dies.

Bug: 171277638
Test: Ran on VIM3L
Test: atest VirtualizationTestCases
Change-Id: I52c1625af45cfcfe7aa0be465ea08f427ec5bc43
diff --git a/virtmanager/Android.bp b/virtmanager/Android.bp
index 83ac3d6..f1971dc 100644
--- a/virtmanager/Android.bp
+++ b/virtmanager/Android.bp
@@ -14,6 +14,7 @@
         "liblog_rust",
         "libserde_json",
         "libserde",
+        "libshared_child",
         "libanyhow",
     ],
     apex_available: ["com.android.virt"],
diff --git a/virtmanager/aidl/android/system/virtmanager/IVirtualMachine.aidl b/virtmanager/aidl/android/system/virtmanager/IVirtualMachine.aidl
index 0358bfd..26aad0c 100644
--- a/virtmanager/aidl/android/system/virtmanager/IVirtualMachine.aidl
+++ b/virtmanager/aidl/android/system/virtmanager/IVirtualMachine.aidl
@@ -15,7 +15,18 @@
  */
 package android.system.virtmanager;
 
+import android.system.virtmanager.IVirtualMachineCallback;
+
 interface IVirtualMachine {
     /** Get the CID allocated to the VM. */
     int getCid();
+
+    /** Returns true if the VM is still running, or false if it has exited for any reason. */
+    boolean isRunning();
+
+    /**
+     * Register a Binder object to get callbacks when the state of the VM changes, such as if it
+     * dies.
+     */
+    void registerCallback(IVirtualMachineCallback callback);
 }
diff --git a/virtmanager/aidl/android/system/virtmanager/IVirtualMachineCallback.aidl b/virtmanager/aidl/android/system/virtmanager/IVirtualMachineCallback.aidl
new file mode 100644
index 0000000..65a685d
--- /dev/null
+++ b/virtmanager/aidl/android/system/virtmanager/IVirtualMachineCallback.aidl
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package android.system.virtmanager;
+
+import android.system.virtmanager.IVirtualMachine;
+
+/**
+ * An object which a client may register with the Virt Manager to get callbacks about the state of
+ * a particular VM.
+ */
+oneway interface IVirtualMachineCallback {
+    /**
+     * Called when the VM dies.
+     *
+     * Note that this will not be called if the Virt Manager itself dies, so you should also use
+     * `link_to_death` to handle that.
+     */
+    void onDied(int cid);
+}
diff --git a/virtmanager/aidl/android/system/virtmanager/VirtualMachineDebugInfo.aidl b/virtmanager/aidl/android/system/virtmanager/VirtualMachineDebugInfo.aidl
index 3a271cb..7bb77ce 100644
--- a/virtmanager/aidl/android/system/virtmanager/VirtualMachineDebugInfo.aidl
+++ b/virtmanager/aidl/android/system/virtmanager/VirtualMachineDebugInfo.aidl
@@ -31,4 +31,7 @@
      * the PID may have been reused for a different process, so this should not be trusted.
      */
     int requesterPid;
+
+    /** Whether the VM is still running. */
+    boolean running;
 }
diff --git a/virtmanager/src/aidl.rs b/virtmanager/src/aidl.rs
index 96ba04f..5a4eedc 100644
--- a/virtmanager/src/aidl.rs
+++ b/virtmanager/src/aidl.rs
@@ -21,11 +21,12 @@
 use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachine::{
     BnVirtualMachine, IVirtualMachine,
 };
+use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachineCallback::IVirtualMachineCallback;
 use android_system_virtmanager::aidl::android::system::virtmanager::VirtualMachineDebugInfo::VirtualMachineDebugInfo;
 use android_system_virtmanager::binder::{
     self, Interface, ParcelFileDescriptor, StatusCode, Strong, ThreadState,
 };
-use log::error;
+use log::{debug, error};
 use std::ffi::CStr;
 use std::fs::File;
 use std::sync::{Arc, Mutex, Weak};
@@ -54,7 +55,6 @@
         log_fd: Option<&ParcelFileDescriptor>,
     ) -> binder::Result<Strong<dyn IVirtualMachine>> {
         let state = &mut *self.state.lock().unwrap();
-        let cid = state.next_cid;
         let log_fd = log_fd
             .map(|fd| fd.as_ref().try_clone().map_err(|_| StatusCode::UNKNOWN_ERROR))
             .transpose()?;
@@ -69,16 +69,9 @@
             })
         });
         let requester_pid = ThreadState::get_calling_pid();
-        let instance = Arc::new(start_vm(
-            config_fd.as_ref(),
-            cid,
-            log_fd,
-            requester_uid,
-            requester_sid,
-            requester_pid,
-        )?);
-        // TODO(qwandor): keep track of which CIDs are currently in use so that we can reuse them.
-        state.next_cid = state.next_cid.checked_add(1).ok_or(StatusCode::UNKNOWN_ERROR)?;
+        let cid = state.allocate_cid()?;
+        let instance =
+            start_vm(config_fd.as_ref(), cid, log_fd, requester_uid, requester_sid, requester_pid)?;
         state.add_vm(Arc::downgrade(&instance));
         Ok(VirtualMachine::create(instance))
     }
@@ -99,6 +92,7 @@
                 requesterUid: vm.requester_uid as i32,
                 requesterSid: vm.requester_sid.clone(),
                 requesterPid: vm.requester_pid,
+                running: vm.running(),
             })
             .collect();
         Ok(cids)
@@ -155,6 +149,48 @@
     fn getCid(&self) -> binder::Result<i32> {
         Ok(self.instance.cid as i32)
     }
+
+    fn isRunning(&self) -> binder::Result<bool> {
+        Ok(self.instance.running())
+    }
+
+    fn registerCallback(
+        &self,
+        callback: &Strong<dyn IVirtualMachineCallback>,
+    ) -> binder::Result<()> {
+        // TODO: Should this give an error if the VM is already dead?
+        self.instance.callbacks.add(callback.clone());
+        Ok(())
+    }
+}
+
+impl Drop for VirtualMachine {
+    fn drop(&mut self) {
+        debug!("Dropping {:?}", self);
+        self.instance.kill();
+    }
+}
+
+/// A set of Binders to be called back in response to various events on the VM, such as when it
+/// dies.
+#[derive(Debug, Default)]
+pub struct VirtualMachineCallbacks(Mutex<Vec<Strong<dyn IVirtualMachineCallback>>>);
+
+impl VirtualMachineCallbacks {
+    /// Call all registered callbacks to say that the VM has died.
+    pub fn callback_on_died(&self, cid: Cid) {
+        let callbacks = &*self.0.lock().unwrap();
+        for callback in callbacks {
+            if let Err(e) = callback.onDied(cid as i32) {
+                error!("Error calling callback: {}", e);
+            }
+        }
+    }
+
+    /// Add a new callback to the set.
+    fn add(&self, callback: Strong<dyn IVirtualMachineCallback>) {
+        self.0.lock().unwrap().push(callback);
+    }
 }
 
 /// The mutable state of the Virt Manager. There should only be one instance of this struct.
@@ -175,7 +211,7 @@
 }
 
 impl State {
-    /// Get a list of VMs which are currently running.
+    /// Get a list of VMs which still have Binder references to them.
     fn vms(&self) -> Vec<Arc<VmInstance>> {
         // Attempt to upgrade the weak pointers to strong pointers.
         self.vms.iter().filter_map(Weak::upgrade).collect()
@@ -200,6 +236,14 @@
         let pos = self.debug_held_vms.iter().position(|vm| vm.getCid() == Ok(cid))?;
         Some(self.debug_held_vms.swap_remove(pos))
     }
+
+    /// Get the next available CID, or an error if we have run out.
+    fn allocate_cid(&mut self) -> binder::Result<Cid> {
+        // TODO(qwandor): keep track of which CIDs are currently in use so that we can reuse them.
+        let cid = self.next_cid;
+        self.next_cid = self.next_cid.checked_add(1).ok_or(StatusCode::UNKNOWN_ERROR)?;
+        Ok(cid)
+    }
 }
 
 impl Default for State {
@@ -217,7 +261,7 @@
     requester_uid: u32,
     requester_sid: Option<String>,
     requester_pid: i32,
-) -> binder::Result<VmInstance> {
+) -> binder::Result<Arc<VmInstance>> {
     let config = VmConfig::load(config_file).map_err(|e| {
         error!("Failed to load VM config from {:?}: {:?}", config_file, e);
         StatusCode::BAD_VALUE
diff --git a/virtmanager/src/crosvm.rs b/virtmanager/src/crosvm.rs
index bef9982..5e6f658 100644
--- a/virtmanager/src/crosvm.rs
+++ b/virtmanager/src/crosvm.rs
@@ -14,12 +14,17 @@
 
 //! Functions for running instances of `crosvm`.
 
+use crate::aidl::VirtualMachineCallbacks;
 use crate::config::VmConfig;
 use crate::Cid;
 use anyhow::Error;
-use log::{debug, error, info};
+use log::{error, info};
+use shared_child::SharedChild;
 use std::fs::File;
-use std::process::{Child, Command};
+use std::process::Command;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+use std::thread;
 
 const CROSVM_PATH: &str = "/apex/com.android.virt/bin/crosvm";
 
@@ -27,7 +32,7 @@
 #[derive(Debug)]
 pub struct VmInstance {
     /// The crosvm child process.
-    child: Child,
+    child: SharedChild,
     /// The CID assigned to the VM for vsock communication.
     pub cid: Cid,
     /// The UID of the process which requested the VM.
@@ -37,18 +42,30 @@
     /// The PID of the process which requested the VM. Note that this process may no longer exist
     /// and the PID may have been reused for a different process, so this should not be trusted.
     pub requester_pid: i32,
+    /// Whether the VM is still running.
+    running: AtomicBool,
+    /// Callbacks to clients of the VM.
+    pub callbacks: VirtualMachineCallbacks,
 }
 
 impl VmInstance {
     /// Create a new `VmInstance` for the given process.
     fn new(
-        child: Child,
+        child: SharedChild,
         cid: Cid,
         requester_uid: u32,
         requester_sid: Option<String>,
         requester_pid: i32,
     ) -> VmInstance {
-        VmInstance { child, cid, requester_uid, requester_sid, requester_pid }
+        VmInstance {
+            child,
+            cid,
+            requester_uid,
+            requester_sid,
+            requester_pid,
+            running: AtomicBool::new(true),
+            callbacks: Default::default(),
+        }
     }
 
     /// Start an instance of `crosvm` to manage a new VM. The `crosvm` instance will be killed when
@@ -60,29 +77,46 @@
         requester_uid: u32,
         requester_sid: Option<String>,
         requester_pid: i32,
-    ) -> Result<VmInstance, Error> {
+    ) -> Result<Arc<VmInstance>, Error> {
         let child = run_vm(config, cid, log_fd)?;
-        Ok(VmInstance::new(child, cid, requester_uid, requester_sid, requester_pid))
-    }
-}
+        let instance =
+            Arc::new(VmInstance::new(child, cid, requester_uid, requester_sid, requester_pid));
 
-impl Drop for VmInstance {
-    fn drop(&mut self) {
-        debug!("Dropping {:?}", self);
+        let instance_clone = instance.clone();
+        thread::spawn(move || {
+            instance_clone.monitor();
+        });
+
+        Ok(instance)
+    }
+
+    /// Wait for the crosvm child process to finish, then mark the VM as no longer running and call
+    /// any callbacks.
+    fn monitor(&self) {
+        match self.child.wait() {
+            Err(e) => error!("Error waiting for crosvm instance to die: {}", e),
+            Ok(status) => info!("crosvm exited with status {}", status),
+        }
+        self.running.store(false, Ordering::Release);
+        self.callbacks.callback_on_died(self.cid);
+    }
+
+    /// Return whether `crosvm` is still running the VM.
+    pub fn running(&self) -> bool {
+        self.running.load(Ordering::Acquire)
+    }
+
+    /// Kill the crosvm instance.
+    pub fn kill(&self) {
         // TODO: Talk to crosvm to shutdown cleanly.
         if let Err(e) = self.child.kill() {
             error!("Error killing crosvm instance: {}", e);
         }
-        // We need to wait on the process after killing it to avoid zombies.
-        match self.child.wait() {
-            Err(e) => error!("Error waiting for crosvm instance to die: {}", e),
-            Ok(status) => info!("Crosvm exited with status {}", status),
-        }
     }
 }
 
 /// Start an instance of `crosvm` to manage a new VM.
-fn run_vm(config: &VmConfig, cid: Cid, log_fd: Option<File>) -> Result<Child, Error> {
+fn run_vm(config: &VmConfig, cid: Cid, log_fd: Option<File>) -> Result<SharedChild, Error> {
     config.validate()?;
 
     let mut command = Command::new(CROSVM_PATH);
@@ -110,6 +144,5 @@
         command.arg(kernel);
     }
     info!("Running {:?}", command);
-    // TODO: Monitor child process, and remove from VM map if it dies.
-    Ok(command.spawn()?)
+    Ok(SharedChild::spawn(&mut command)?)
 }