Detect Microdroid hangup during boot
A hangup in Microdroid is defined as a state where the payload hasn't
started for a long time. In that case AVF kills the VM and the death is
reported via the onDied callback.
In addition, modify the client-facing Java and Rust libraries to expose
death reasons that were added earlier but had not been surfaced yet.
Bug: 222228861
Test: I couldn't write a test for this because it is not possible to
intentionally trigger the hang from a test. Instead, I confirmed that
`onDied` is called and the VM is eventually killed after changing the
timeout value to a very small number (e.g. 100ms).
Change-Id: I53f232d0b609e6e8a429d996c7d6fdd0b37e7b4c
diff --git a/javalib/src/android/system/virtualmachine/VirtualMachine.java b/javalib/src/android/system/virtualmachine/VirtualMachine.java
index ed2c2a1..de44b63 100644
--- a/javalib/src/android/system/virtualmachine/VirtualMachine.java
+++ b/javalib/src/android/system/virtualmachine/VirtualMachine.java
@@ -439,6 +439,7 @@
}
@Override
public void onDied(int cid, int reason) {
+ // TODO(b/236811123) translate `reason` into a stable reason number
service.asBinder().unlinkToDeath(deathRecipient, 0);
if (onDiedCalled.compareAndSet(false, true)) {
executeCallback((cb) -> cb.onDied(VirtualMachine.this, reason));
diff --git a/javalib/src/android/system/virtualmachine/VirtualMachineCallback.java b/javalib/src/android/system/virtualmachine/VirtualMachineCallback.java
index a49c9be..54d0701 100644
--- a/javalib/src/android/system/virtualmachine/VirtualMachineCallback.java
+++ b/javalib/src/android/system/virtualmachine/VirtualMachineCallback.java
@@ -66,7 +66,8 @@
DEATH_REASON_SHUTDOWN,
DEATH_REASON_ERROR,
DEATH_REASON_REBOOT,
- DEATH_REASON_CRASH
+ DEATH_REASON_CRASH,
+ DEATH_REASON_HANGUP,
})
@interface DeathReason {}
@@ -97,6 +98,36 @@
/** The VM or crosvm crashed. */
int DEATH_REASON_CRASH = 6;
+ /** The pVM firmware failed to verify the VM because the public key doesn't match. */
+ int DEATH_REASON_PVM_FIRMWARE_PUBLIC_KEY_MISMATCH = 7;
+
+ /** The pVM firmware failed to verify the VM because the instance image changed. */
+ int DEATH_REASON_PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED = 8;
+
+ /** The bootloader failed to verify the VM because the public key doesn't match. */
+ int DEATH_REASON_BOOTLOADER_PUBLIC_KEY_MISMATCH = 9;
+
+ /** The bootloader failed to verify the VM because the instance image changed. */
+ int DEATH_REASON_BOOTLOADER_INSTANCE_IMAGE_CHANGED = 10;
+
+ /** The microdroid failed to connect to VirtualizationService's RPC server. */
+ int DEATH_REASON_MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE = 11;
+
+ /** The payload for microdroid is changed. */
+ int DEATH_REASON_MICRODROID_PAYLOAD_HAS_CHANGED = 12;
+
+ /** The microdroid failed to verify given payload APK. */
+ int DEATH_REASON_MICRODROID_PAYLOAD_VERIFICATION_FAILED = 13;
+
+ /** The VM config for microdroid is invalid (e.g. missing tasks). */
+ int DEATH_REASON_MICRODROID_INVALID_PAYLOAD_CONFIG = 14;
+
+ /** There was a runtime error while running microdroid manager. */
+ int DEATH_REASON_MICRODROID_UNKNOWN_RUNTIME_ERROR = 15;
+
+ /** The VM was killed due to a hangup. */
+ int DEATH_REASON_HANGUP = 16;
+
/** Called when the payload starts in the VM. */
void onPayloadStarted(@NonNull VirtualMachine vm, @Nullable ParcelFileDescriptor stream);
diff --git a/virtualizationservice/Android.bp b/virtualizationservice/Android.bp
index 7a8da96..791da24 100644
--- a/virtualizationservice/Android.bp
+++ b/virtualizationservice/Android.bp
@@ -31,6 +31,7 @@
"libcommand_fds",
"libdisk",
"libidsig",
+ "liblazy_static",
"liblog_rust",
"libmicrodroid_metadata",
"libmicrodroid_payload_config",
diff --git a/virtualizationservice/aidl/android/system/virtualizationservice/DeathReason.aidl b/virtualizationservice/aidl/android/system/virtualizationservice/DeathReason.aidl
index 577e868..dceabf1 100644
--- a/virtualizationservice/aidl/android/system/virtualizationservice/DeathReason.aidl
+++ b/virtualizationservice/aidl/android/system/virtualizationservice/DeathReason.aidl
@@ -52,4 +52,6 @@
MICRODROID_INVALID_PAYLOAD_CONFIG = 14,
/** There was a runtime error while running microdroid manager. */
MICRODROID_UNKNOWN_RUNTIME_ERROR = 15,
+ /** The VM was killed due to a hangup. */
+ HANGUP = 16,
}
diff --git a/virtualizationservice/src/aidl.rs b/virtualizationservice/src/aidl.rs
index f0b3fb4..24f3706 100644
--- a/virtualizationservice/src/aidl.rs
+++ b/virtualizationservice/src/aidl.rs
@@ -479,6 +479,7 @@
log_fd,
indirect_files,
platform_version: parse_platform_version_req(&config.platformVersion)?,
+ detect_hangup: is_app_config,
};
let instance = Arc::new(
VmInstance::new(
diff --git a/virtualizationservice/src/crosvm.rs b/virtualizationservice/src/crosvm.rs
index fcc09c6..7ca802f 100644
--- a/virtualizationservice/src/crosvm.rs
+++ b/virtualizationservice/src/crosvm.rs
@@ -18,10 +18,13 @@
use crate::Cid;
use anyhow::{bail, Error};
use command_fds::CommandFdExt;
+use lazy_static::lazy_static;
use log::{debug, error, info};
use semver::{Version, VersionReq};
use nix::{fcntl::OFlag, unistd::pipe2};
+use rustutils::system_properties;
use shared_child::SharedChild;
+use std::borrow::Cow;
use std::fs::{remove_dir_all, File};
use std::io::{self, Read};
use std::mem;
@@ -29,7 +32,8 @@
use std::os::unix::io::{AsRawFd, RawFd, FromRawFd};
use std::path::PathBuf;
use std::process::{Command, ExitStatus};
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, Condvar, Mutex};
+use std::time::Duration;
use std::thread;
use vsock::VsockStream;
use android_system_virtualizationservice::aidl::android::system::virtualizationservice::DeathReason::DeathReason;
@@ -51,6 +55,25 @@
/// The exit status which crosvm returns when it crashes due to an error.
const CROSVM_CRASH_STATUS: i32 = 33;
+/// Returns true if we appear to be running under nested virtualization.
+fn is_nested_virtualization() -> bool {
+    // Check if we are running on vsoc as a proxy for this. A failed property read must not
+    // panic here, so it is treated the same as a missing property (i.e. not nested).
+    let product = system_properties::read("ro.build.product").ok().flatten();
+    matches!(product.as_deref(), Some("vsoc_x86_64") | Some("vsoc_x86"))
+}
+
+lazy_static! {
+    /// Upper bound on how long a VM may take to reach the Started payload state. If the
+    /// payload has not started within this amount of time, a hang-up error is triggered and
+    /// the VM is killed.
+    static ref BOOT_HANGUP_TIMEOUT: Duration = {
+        // Nested virtualization is slow, so we need a longer timeout.
+        let secs = if is_nested_virtualization() { 100 } else { 10 };
+        Duration::from_secs(secs)
+    };
+}
+
/// Configuration for a VM to run with crosvm.
#[derive(Debug)]
pub struct CrosvmConfig {
@@ -69,6 +92,7 @@
pub log_fd: Option<File>,
pub indirect_files: Vec<File>,
pub platform_version: VersionReq,
+ pub detect_hangup: bool,
}
/// A disk image to pass to crosvm for a VM.
@@ -116,6 +140,7 @@
fn start(&mut self, instance: Arc<VmInstance>) -> Result<(), Error> {
let state = mem::replace(self, VmState::Failed);
if let VmState::NotStarted { config } = state {
+ let detect_hangup = config.detect_hangup;
let (failure_pipe_read, failure_pipe_write) = create_pipe()?;
// If this fails and returns an error, `self` will be left in the `Failed` state.
@@ -123,7 +148,7 @@
let child_clone = child.clone();
thread::spawn(move || {
- instance.monitor(child_clone, failure_pipe_read);
+ instance.monitor(child_clone, failure_pipe_read, detect_hangup);
});
// If it started correctly, update the state.
@@ -162,6 +187,8 @@
pub vm_service: Mutex<Option<Strong<dyn IVirtualMachineService>>>,
/// The latest lifecycle state which the payload reported itself to be in.
payload_state: Mutex<PayloadState>,
+ /// Represents the condition that payload_state becomes Started
+ payload_started: Condvar,
}
impl VmInstance {
@@ -188,6 +215,7 @@
stream: Mutex::new(None),
vm_service: Mutex::new(None),
payload_state: Mutex::new(PayloadState::Starting),
+ payload_started: Condvar::new(),
})
}
@@ -198,11 +226,38 @@
}
/// Waits for the crosvm child process to finish, then marks the VM as no longer running and
- /// calls any callbacks.
+ /// calls any callbacks. If `detect_hangup` is true, first waits for the payload to start in
+ /// the crosvm process. If that doesn't occur within `BOOT_HANGUP_TIMEOUT`, declares it a
+ /// hangup and forcibly kills the process.
///
/// This takes a separate reference to the `SharedChild` rather than using the one in
/// `self.vm_state` to avoid holding the lock on `vm_state` while it is running.
- fn monitor(&self, child: Arc<SharedChild>, mut failure_pipe_read: File) {
+ fn monitor(&self, child: Arc<SharedChild>, mut failure_pipe_read: File, detect_hangup: bool) {
+        let hungup = if detect_hangup {
+            // Wait until the payload starts or the crosvm process terminates. The child must be
+            // polled since only crosvm can make the condition true. `try_wait` is `Ok(None)` only
+            // while the child is still running; `Ok(Some(_))` means it has already exited.
+            let state = self.payload_state.lock().unwrap();
+            let (_, result) = self
+                .payload_started
+                .wait_timeout_while(state, *BOOT_HANGUP_TIMEOUT, |state| {
+                    *state < PayloadState::Started && matches!(child.try_wait(), Ok(None))
+                })
+                .unwrap();
+            if result.timed_out() {
+                error!(
+                    "Microdroid failed to start payload within {} secs timeout. Shutting down",
+                    BOOT_HANGUP_TIMEOUT.as_secs()
+                );
+                self.kill();
+                true
+            } else {
+                false
+            }
+        } else {
+            false
+        };
+
let result = child.wait();
match &result {
Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
@@ -214,14 +269,17 @@
// Ensure that the mutex is released before calling the callbacks.
drop(vm_state);
- let mut failure_string = String::new();
- let failure_read_result = failure_pipe_read.read_to_string(&mut failure_string);
- if let Err(e) = &failure_read_result {
- error!("Error reading VM failure reason from pipe: {}", e);
- }
- if !failure_string.is_empty() {
- info!("VM returned failure reason '{}'", failure_string);
- }
+ let failure_string = if hungup {
+ Cow::from("HANGUP")
+ } else {
+ let mut s = String::new();
+ match failure_pipe_read.read_to_string(&mut s) {
+ Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
+ Ok(len) if len > 0 => info!("VM returned failure reason '{}'", &s),
+ _ => (),
+ };
+ Cow::from(s)
+ };
self.callbacks.callback_on_died(self.cid, death_reason(&result, &failure_string));
@@ -243,6 +301,9 @@
// the other direction.
if new_state > *state_locked {
*state_locked = new_state;
+ if new_state >= PayloadState::Started {
+ self.payload_started.notify_all();
+ }
Ok(())
} else {
bail!("Invalid payload state transition from {:?} to {:?}", *state_locked, new_state)
@@ -289,6 +350,7 @@
"MICRODROID_UNKNOWN_RUNTIME_ERROR" => {
return DeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR
}
+ "HANGUP" => return DeathReason::HANGUP,
_ => {}
}
match status.code() {
diff --git a/vmclient/src/death_reason.rs b/vmclient/src/death_reason.rs
index 657eaa2..b976f6f 100644
--- a/vmclient/src/death_reason.rs
+++ b/vmclient/src/death_reason.rs
@@ -44,6 +44,18 @@
BootloaderPublicKeyMismatch,
/// The bootloader failed to verify the VM because the instance image changed.
BootloaderInstanceImageChanged,
+ /// The microdroid failed to connect to VirtualizationService's RPC server.
+ MicrodroidFailedToConnectToVirtualizationService,
+ /// The payload for microdroid is changed.
+ MicrodroidPayloadHasChanged,
+ /// The microdroid failed to verify given payload APK.
+ MicrodroidPayloadVerificationFailed,
+ /// The VM config for microdroid is invalid (e.g. missing tasks).
+ MicrodroidInvalidPayloadConfig,
+ /// There was a runtime error while running microdroid manager.
+ MicrodroidUnknownRuntimeError,
+ /// The VM was killed due to hangup.
+ Hangup,
/// VirtualizationService sent a death reason which was not recognised by the client library.
Unrecognised(AidlDeathReason),
}
@@ -66,6 +78,20 @@
AidlDeathReason::BOOTLOADER_INSTANCE_IMAGE_CHANGED => {
Self::BootloaderInstanceImageChanged
}
+ AidlDeathReason::MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE => {
+ Self::MicrodroidFailedToConnectToVirtualizationService
+ }
+ AidlDeathReason::MICRODROID_PAYLOAD_HAS_CHANGED => Self::MicrodroidPayloadHasChanged,
+ AidlDeathReason::MICRODROID_PAYLOAD_VERIFICATION_FAILED => {
+ Self::MicrodroidPayloadVerificationFailed
+ }
+ AidlDeathReason::MICRODROID_INVALID_PAYLOAD_CONFIG => {
+ Self::MicrodroidInvalidPayloadConfig
+ }
+ AidlDeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR => {
+ Self::MicrodroidUnknownRuntimeError
+ }
+ AidlDeathReason::HANGUP => Self::Hangup,
_ => Self::Unrecognised(reason),
}
}
@@ -94,6 +120,20 @@
Self::BootloaderInstanceImageChanged => {
"Bootloader failed to verify the VM because the instance image changed."
}
+ Self::MicrodroidFailedToConnectToVirtualizationService => {
+ "The microdroid failed to connect to VirtualizationService's RPC server."
+ }
+ Self::MicrodroidPayloadHasChanged => "The payload for microdroid is changed.",
+ Self::MicrodroidPayloadVerificationFailed => {
+ "The microdroid failed to verify given payload APK."
+ }
+ Self::MicrodroidInvalidPayloadConfig => {
+ "The VM config for microdroid is invalid (e.g. missing tasks)."
+ }
+ Self::MicrodroidUnknownRuntimeError => {
+ "There was a runtime error while running microdroid manager."
+ }
+ Self::Hangup => "VM hangup.",
Self::Unrecognised(reason) => {
return write!(f, "Unrecognised death reason {:?}.", reason);
}