Orderly VM shutdown, part 1
Rather than killing the VM when we are done with it, ask the payload
to exit, and wait for the VM to have fully exited. This allows the VM
time to write logs, generate crash dumps, etc. if there has been a
failure.
Add a quit method to the CompOS service so clients can request it to
exit. Add to vmclient the ability to wait, with a timeout, for a VM
to die. Implement a wait for shutdown in compos_client that waits for
the VM to exit but terminates it abruptly if it doesn't do so within
a reasonable time; do the same if the VM fails to start.
Change compos_verify to use this method to wait for the VM to have
fully exited once we are done with it.
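A sketch of the resulting flow in compos_verify (simplified from the
change below; the VM start call and unrelated details are elided):

    let service = vm_instance.connect_service()?;
    let public_key = service.getPublicKey().context("Getting public key");

    // Shut the VM down cleanly, giving it time to write any relevant logs.
    let _ = service.quit(); // If this fails, the VM is probably dying anyway
    vm_instance.wait_for_shutdown();

    if !compos_verify_native::verify(&public_key?, &signature, &info) {
        bail!("Signature verification failed");
    }
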
Assorted refactorings:
- Simplify the timeout handling code so we panic if the necessary
  property isn't available (all requests would fail anyway); see the
  sketch after this list. Also update the timeouts slightly.
- Rename get_service to connect_service (it's definitely not a simple
getter).
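The timeout simplification mentioned above reduces to a lazy_static
global (a sketch of the change in compos/common/timeouts.rs; the panic
comes from the unwrap() when the nested-virtualization check fails):

    lazy_static! {
        /// The timeouts that are appropriate on the current platform.
        pub static ref TIMEOUTS: Timeouts =
            if nested_virt::is_nested_virtualization().unwrap() {
                // Nested virtualization is slow.
                EXTENDED_TIMEOUTS
            } else {
                NORMAL_TIMEOUTS
            };
    }
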
I haven't dealt with compilation yet; that will have ramifications all
the way up to Java, and this CL is big enough already. Additionally I
haven't yet attempted to allow odsign to continue while we wait for
the VM to exit.
Bug: 236581575
Test: Run VM, see both finished & died in the logs.
Change-Id: I47501081d23833fe7ef791240161d93e38b3d951
diff --git a/compos/aidl/com/android/compos/ICompOsService.aidl b/compos/aidl/com/android/compos/ICompOsService.aidl
index 9cf99be..4df22da 100644
--- a/compos/aidl/com/android/compos/ICompOsService.aidl
+++ b/compos/aidl/com/android/compos/ICompOsService.aidl
@@ -74,4 +74,10 @@
* hardware/interfaces/security/dice/aidl/android/hardware/security/dice/Bcc.aidl.
*/
byte[] getAttestationChain();
+
+ /**
+ * Request the service to exit, triggering the termination of the VM. This may cause any
+ * requests in flight to fail.
+ */
+ oneway void quit();
}
diff --git a/compos/common/Android.bp b/compos/common/Android.bp
index d1b45c4..1a69b1a 100644
--- a/compos/common/Android.bp
+++ b/compos/common/Android.bp
@@ -13,6 +13,7 @@
"libanyhow",
"libbinder_common",
"libbinder_rpc_unstable_bindgen",
+ "liblazy_static",
"liblog_rust",
"libnested_virt",
"libnum_traits",
diff --git a/compos/common/compos_client.rs b/compos/common/compos_client.rs
index 23cd505..9314fe1 100644
--- a/compos/common/compos_client.rs
+++ b/compos/common/compos_client.rs
@@ -16,7 +16,7 @@
//! Support for starting CompOS in a VM and connecting to the service
-use crate::timeouts::timeouts;
+use crate::timeouts::TIMEOUTS;
use crate::{COMPOS_APEX_ROOT, COMPOS_DATA_ROOT, COMPOS_VSOCK_PORT, DEFAULT_VM_CONFIG_PATH};
use android_system_virtualizationservice::aidl::android::system::virtualizationservice::{
DeathReason::DeathReason,
@@ -37,7 +37,7 @@
use std::num::NonZeroU32;
use std::path::{Path, PathBuf};
use std::thread;
-use vmclient::VmInstance;
+use vmclient::{VmInstance, VmWaitError};
/// This owns an instance of the CompOS VM.
pub struct ComposClient(VmInstance);
@@ -128,14 +128,39 @@
instance.start()?;
- instance.wait_until_ready(timeouts()?.vm_max_time_to_ready)?;
+ let ready = instance.wait_until_ready(TIMEOUTS.vm_max_time_to_ready);
+ if let Err(VmWaitError::Finished) = ready {
+ if debug_level != DebugLevel::NONE {
+ // The payload has (unexpectedly) finished, but the VM is still running. Give it
+ // some time to shut down to maximize our chances of getting useful logs.
+ if let Some(death_reason) =
+ instance.wait_for_death_with_timeout(TIMEOUTS.vm_max_time_to_exit)
+ {
+ bail!("VM died during startup - reason {:?}", death_reason);
+ }
+ }
+ }
+ ready?;
Ok(Self(instance))
}
/// Create and return an RPC Binder connection to the Comp OS service in the VM.
- pub fn get_service(&self) -> Result<Strong<dyn ICompOsService>> {
- self.0.get_service(COMPOS_VSOCK_PORT).context("Connecting to CompOS service")
+ pub fn connect_service(&self) -> Result<Strong<dyn ICompOsService>> {
+ self.0.connect_service(COMPOS_VSOCK_PORT).context("Connecting to CompOS service")
+ }
+
+ /// Wait for the instance to shut down. If it fails to shut down within a reasonable time, the
+ /// instance is dropped, which forcibly terminates it.
+ /// This should only be called when the instance has been requested to quit, or we believe that
+ /// it is already in the process of exiting due to some failure.
+ pub fn wait_for_shutdown(self) {
+ let death_reason = self.0.wait_for_death_with_timeout(TIMEOUTS.vm_max_time_to_exit);
+ match death_reason {
+ Some(vmclient::DeathReason::Shutdown) => info!("VM has exited normally"),
+ Some(reason) => warn!("VM died with reason {:?}", reason),
+ None => warn!("VM failed to exit, dropping"),
+ }
}
}
diff --git a/compos/common/timeouts.rs b/compos/common/timeouts.rs
index d0d107f..952be0a 100644
--- a/compos/common/timeouts.rs
+++ b/compos/common/timeouts.rs
@@ -17,7 +17,7 @@
//! Timeouts for common situations, with support for longer timeouts when using nested
//! virtualization.
-use anyhow::Result;
+use lazy_static::lazy_static;
use std::time::Duration;
/// Holder for the various timeouts we use.
@@ -27,27 +27,32 @@
pub odrefresh_max_execution_time: Duration,
/// Time allowed for the CompOS VM to start up and become ready.
pub vm_max_time_to_ready: Duration,
+ /// Time we wait for a VM to exit once the payload has finished.
+ pub vm_max_time_to_exit: Duration,
}
-/// Return the timeouts that are appropriate on the current platform.
-pub fn timeouts() -> Result<&'static Timeouts> {
+lazy_static! {
+/// The timeouts that are appropriate on the current platform.
+pub static ref TIMEOUTS: Timeouts = if nested_virt::is_nested_virtualization().unwrap() {
// Nested virtualization is slow.
- if nested_virt::is_nested_virtualization()? {
- Ok(&EXTENDED_TIMEOUTS)
- } else {
- Ok(&NORMAL_TIMEOUTS)
- }
+ EXTENDED_TIMEOUTS
+} else {
+ NORMAL_TIMEOUTS
+};
}
/// The timeouts that we use normally.
const NORMAL_TIMEOUTS: Timeouts = Timeouts {
- // Note: the source of truth for these odrefresh timeouts is art/odrefresh/odr_config.h.
+ // Note: the source of truth for this odrefresh timeout is art/odrefresh/odrefresh.cc.
odrefresh_max_execution_time: Duration::from_secs(300),
vm_max_time_to_ready: Duration::from_secs(15),
+ vm_max_time_to_exit: Duration::from_secs(3),
};
-/// The timeouts that we use when need_extra_time() returns true.
+/// The timeouts that we use when running under nested virtualization.
const EXTENDED_TIMEOUTS: Timeouts = Timeouts {
+ // Note: the source of truth for this odrefresh timeout is art/odrefresh/odrefresh.cc.
odrefresh_max_execution_time: Duration::from_secs(480),
vm_max_time_to_ready: Duration::from_secs(120),
+ vm_max_time_to_exit: Duration::from_secs(10),
};
diff --git a/compos/composd/src/instance_starter.rs b/compos/composd/src/instance_starter.rs
index 340e8b7..6e6253e 100644
--- a/compos/composd/src/instance_starter.rs
+++ b/compos/composd/src/instance_starter.rs
@@ -113,7 +113,7 @@
&self.vm_parameters,
)
.context("Starting VM")?;
- let service = vm_instance.get_service().context("Connecting to CompOS")?;
+ let service = vm_instance.connect_service().context("Connecting to CompOS")?;
Ok(CompOsInstance { vm_instance, service, lazy_service_guard: Default::default() })
}
diff --git a/compos/composd_cmd/composd_cmd.rs b/compos/composd_cmd/composd_cmd.rs
index 6afd711..c6a5479 100644
--- a/compos/composd_cmd/composd_cmd.rs
+++ b/compos/composd_cmd/composd_cmd.rs
@@ -31,7 +31,7 @@
},
};
use anyhow::{bail, Context, Result};
-use compos_common::timeouts::timeouts;
+use compos_common::timeouts::TIMEOUTS;
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;
@@ -147,7 +147,7 @@
println!("Waiting");
- match state.wait(timeouts()?.odrefresh_max_execution_time) {
+ match state.wait(TIMEOUTS.odrefresh_max_execution_time) {
Ok(Outcome::Succeeded) => Ok(()),
Ok(Outcome::TaskDied) => bail!("Compilation task died"),
Ok(Outcome::Failed(reason, message)) => {
diff --git a/compos/src/compsvc.rs b/compos/src/compsvc.rs
index 91415bb..9fa68d6 100644
--- a/compos/src/compsvc.rs
+++ b/compos/src/compsvc.rs
@@ -151,6 +151,12 @@
fn getAttestationChain(&self) -> BinderResult<Vec<u8>> {
to_binder_result(compos_key::get_attestation_chain())
}
+
+ fn quit(&self) -> BinderResult<()> {
+ // TODO(b/236581575) Consider shutting down the binder server a bit more gracefully.
+ // When our process exits, Microdroid will shut down the VM.
+ std::process::exit(0);
+ }
}
fn add_artifacts(target_dir: &Path, artifact_signer: &mut ArtifactSigner) -> Result<()> {
diff --git a/compos/verify/verify.rs b/compos/verify/verify.rs
index 64ae75f..e6848c7 100644
--- a/compos/verify/verify.rs
+++ b/compos/verify/verify.rs
@@ -106,11 +106,15 @@
&idsig_manifest_apk,
&VmParameters { debug_mode, ..Default::default() },
)?;
- let service = vm_instance.get_service()?;
- let public_key = service.getPublicKey().context("Getting public key")?;
+ let service = vm_instance.connect_service()?;
+ let public_key = service.getPublicKey().context("Getting public key");
- if !compos_verify_native::verify(&public_key, &signature, &info) {
+ // Shut down the VM cleanly, giving time for any relevant logs to be written
+ let _ = service.quit(); // If this fails, the VM is probably dying anyway
+ vm_instance.wait_for_shutdown();
+
+ if !compos_verify_native::verify(&public_key?, &signature, &info) {
bail!("Signature verification failed");
}
diff --git a/vmclient/src/errors.rs b/vmclient/src/errors.rs
index 532706d..994d0ef 100644
--- a/vmclient/src/errors.rs
+++ b/vmclient/src/errors.rs
@@ -33,9 +33,9 @@
Finished,
}
-/// An error connection to a VM RPC Binder service.
+/// An error connecting to a VM RPC Binder service.
#[derive(Clone, Debug, Error)]
-pub enum GetServiceError {
+pub enum ConnectServiceError {
/// The RPC binder connection failed.
#[error("Vsock connection to RPC binder failed.")]
ConnectionFailed,
diff --git a/vmclient/src/lib.rs b/vmclient/src/lib.rs
index 867c3a7..9b5b8dd 100644
--- a/vmclient/src/lib.rs
+++ b/vmclient/src/lib.rs
@@ -20,7 +20,7 @@
mod sync;
pub use crate::death_reason::DeathReason;
-pub use crate::errors::{GetServiceError, VmWaitError};
+pub use crate::errors::{ConnectServiceError, VmWaitError};
use crate::{rpc_binder::VsockFactory, sync::Monitor};
use android_system_virtualizationservice::{
aidl::android::system::virtualizationservice::{
@@ -111,6 +111,15 @@
self.state.wait_while(|state| state.death_reason.is_none()).unwrap().death_reason.unwrap()
}
+ /// Blocks until the VM or the VirtualizationService itself dies, or the given timeout expires.
+ /// Returns the reason why it died if it did so.
+ pub fn wait_for_death_with_timeout(&self, timeout: Duration) -> Option<DeathReason> {
+ let (state, _timeout_result) =
+ self.state.wait_timeout_while(timeout, |state| state.death_reason.is_none()).unwrap();
+ // We don't care if it timed out - we just return the reason if there now is one
+ state.death_reason
+ }
+
/// Waits until the VM reports that it is ready.
///
/// Returns an error if the VM dies first, or the `timeout` elapses before the VM is ready.
@@ -133,15 +142,15 @@
}
/// Tries to connect to an RPC Binder service provided by the VM on the given vsock port.
- pub fn get_service<T: FromIBinder + ?Sized>(
+ pub fn connect_service<T: FromIBinder + ?Sized>(
&self,
port: u32,
- ) -> Result<Strong<T>, GetServiceError> {
+ ) -> Result<Strong<T>, ConnectServiceError> {
let mut vsock_factory = VsockFactory::new(&*self.vm, port);
let ibinder = vsock_factory.connect_rpc_client()?;
- FromIBinder::try_from(ibinder).map_err(GetServiceError::WrongServiceType)
+ FromIBinder::try_from(ibinder).map_err(ConnectServiceError::WrongServiceType)
}
/// Get ramdump
diff --git a/vmclient/src/rpc_binder.rs b/vmclient/src/rpc_binder.rs
index fee643f..7c2992b 100644
--- a/vmclient/src/rpc_binder.rs
+++ b/vmclient/src/rpc_binder.rs
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use crate::errors::GetServiceError;
+use crate::errors::ConnectServiceError;
use android_system_virtualizationservice::{
aidl::android::system::virtualizationservice::IVirtualMachine::IVirtualMachine,
};
@@ -30,7 +30,7 @@
Self { vm, port }
}
- pub fn connect_rpc_client(&mut self) -> Result<binder::SpIBinder, GetServiceError> {
+ pub fn connect_rpc_client(&mut self) -> Result<binder::SpIBinder, ConnectServiceError> {
let param = self.as_void_ptr();
unsafe {
@@ -41,7 +41,7 @@
let binder =
binder_rpc_unstable_bindgen::RpcPreconnectedClient(Some(Self::request_fd), param)
as *mut AIBinder;
- new_spibinder(binder).ok_or(GetServiceError::ConnectionFailed)
+ new_spibinder(binder).ok_or(ConnectServiceError::ConnectionFailed)
}
}