Implement early_virtmgr

early_virtmgr will serve clients as a service running early boot VMs
which can start before /data mount. early_virtmgr will act similar to
virtmgr, serving IVirtualizationService aidl interface.

See docs/early_vm.md for more details.

Bug: 354059713
Test: add a demo binary, see logcat
Change-Id: I4e842cb108bf0f6f7620e711ffa429a03c9b25eb
diff --git a/android/virtmgr/src/aidl.rs b/android/virtmgr/src/aidl.rs
index fb3d353..b54b37b 100644
--- a/android/virtmgr/src/aidl.rs
+++ b/android/virtmgr/src/aidl.rs
@@ -45,6 +45,7 @@
     VirtualMachineRawConfig::VirtualMachineRawConfig,
     VirtualMachineState::VirtualMachineState,
 };
+use android_system_virtualizationservice_internal::aidl::android::system::virtualizationservice_internal::IGlobalVmContext::IGlobalVmContext;
 use android_system_virtualizationservice_internal::aidl::android::system::virtualizationservice_internal::IVirtualizationServiceInternal::IVirtualizationServiceInternal;
 use android_system_virtualmachineservice::aidl::android::system::virtualmachineservice::IVirtualMachineService::{
         BnVirtualMachineService, IVirtualMachineService,
@@ -74,14 +75,16 @@
 use rustutils::system_properties;
 use safe_ownedfd::take_fd_ownership;
 use semver::VersionReq;
+use serde::Deserialize;
 use std::collections::HashSet;
 use std::convert::TryInto;
 use std::fs;
 use std::ffi::CStr;
-use std::fs::{canonicalize, read_dir, remove_file, File, OpenOptions};
+use std::fs::{canonicalize, create_dir_all, read_dir, remove_dir_all, remove_file, File, OpenOptions};
 use std::io::{BufRead, BufReader, Error, ErrorKind, Seek, SeekFrom, Write};
 use std::iter;
 use std::num::{NonZeroU16, NonZeroU32};
+use std::ops::Range;
 use std::os::unix::io::{AsRawFd, IntoRawFd};
 use std::os::unix::raw::pid_t;
 use std::path::{Path, PathBuf};
@@ -120,8 +123,12 @@
 
 pub static GLOBAL_SERVICE: LazyLock<Strong<dyn IVirtualizationServiceInternal>> =
     LazyLock::new(|| {
-        wait_for_interface(BINDER_SERVICE_IDENTIFIER)
-            .expect("Could not connect to VirtualizationServiceInternal")
+        if cfg!(early) {
+            panic!("Early virtmgr must not connect to VirtualizatinoServiceInternal")
+        } else {
+            wait_for_interface(BINDER_SERVICE_IDENTIFIER)
+                .expect("Could not connect to VirtualizationServiceInternal")
+        }
     });
 static SUPPORTED_OS_NAMES: LazyLock<HashSet<String>> =
     LazyLock::new(|| get_supported_os_names().expect("Failed to get list of supported os names"));
@@ -340,11 +347,110 @@
     }
 }
 
+/// Implementation of the AIDL `IGlobalVmContext` interface for early VMs.
+#[derive(Debug, Default)]
+struct EarlyVmContext {
+    /// The unique CID assigned to the VM for vsock communication.
+    cid: Cid,
+    /// Temporary directory for this VM instance.
+    temp_dir: PathBuf,
+}
+
+impl EarlyVmContext {
+    fn new(cid: Cid, temp_dir: PathBuf) -> Result<Self> {
+        // Remove the entire directory before creating a VM. Early VMs use predefined CIDs and AVF
+        // should trust clients, e.g. they won't run two VMs at the same time
+        let _ = remove_dir_all(&temp_dir);
+        create_dir_all(&temp_dir).context(format!("can't create '{}'", temp_dir.display()))?;
+
+        Ok(Self { cid, temp_dir })
+    }
+}
+
+impl Interface for EarlyVmContext {}
+
+impl Drop for EarlyVmContext {
+    fn drop(&mut self) {
+        if let Err(e) = remove_dir_all(&self.temp_dir) {
+            error!("Cannot remove {} upon dropping: {e}", self.temp_dir.display());
+        }
+    }
+}
+
+impl IGlobalVmContext for EarlyVmContext {
+    fn getCid(&self) -> binder::Result<i32> {
+        Ok(self.cid as i32)
+    }
+
+    fn getTemporaryDirectory(&self) -> binder::Result<String> {
+        Ok(self.temp_dir.to_string_lossy().to_string())
+    }
+
+    fn setHostConsoleName(&self, _pathname: &str) -> binder::Result<()> {
+        Err(Status::new_exception_str(
+            ExceptionCode::UNSUPPORTED_OPERATION,
+            Some("Early VM doesn't support setting host console name"),
+        ))
+    }
+}
+
+fn find_partition(path: &Path) -> binder::Result<String> {
+    match path.components().nth(1) {
+        Some(std::path::Component::Normal(partition)) => {
+            Ok(partition.to_string_lossy().into_owned())
+        }
+        _ => Err(anyhow!("Can't find partition in '{}'", path.display()))
+            .or_service_specific_exception(-1),
+    }
+}
+
 impl VirtualizationService {
     pub fn init() -> VirtualizationService {
         VirtualizationService::default()
     }
 
+    fn create_early_vm_context(
+        &self,
+        config: &VirtualMachineConfig,
+    ) -> binder::Result<(VmContext, Cid, PathBuf)> {
+        let calling_exe_path = format!("/proc/{}/exe", get_calling_pid());
+        let link = fs::read_link(&calling_exe_path)
+            .context(format!("can't read_link '{calling_exe_path}'"))
+            .or_service_specific_exception(-1)?;
+        let partition = find_partition(&link)?;
+
+        let name = match config {
+            VirtualMachineConfig::RawConfig(config) => &config.name,
+            VirtualMachineConfig::AppConfig(config) => &config.name,
+        };
+        let early_vm =
+            find_early_vm_for_partition(&partition, name).or_service_specific_exception(-1)?;
+        if Path::new(&early_vm.path) != link {
+            return Err(anyhow!(
+                "VM '{name}' in partition '{partition}' must be created with '{}', not '{}'",
+                &early_vm.path,
+                link.display()
+            ))
+            .or_service_specific_exception(-1);
+        }
+
+        let cid = early_vm.cid as Cid;
+        let temp_dir = PathBuf::from(format!("/mnt/vm/early/{cid}"));
+
+        let context = EarlyVmContext::new(cid, temp_dir.clone())
+            .context(format!("Can't create early vm contexts for {cid}"))
+            .or_service_specific_exception(-1)?;
+        let service = VirtualMachineService::new_binder(self.state.clone(), cid).as_binder();
+
+        // Start VM service listening for connections from the new CID on port=CID.
+        let port = cid;
+        let vm_server = RpcServer::new_vsock(service, cid, port)
+            .context(format!("Could not start RpcServer on port {port}"))
+            .or_service_specific_exception(-1)?;
+        vm_server.start();
+        Ok((VmContext::new(Strong::new(Box::new(context)), vm_server), cid, temp_dir))
+    }
+
     fn create_vm_context(
         &self,
         requester_debug_pid: pid_t,
@@ -386,8 +492,16 @@
 
         check_config_features(config)?;
 
+        if cfg!(early) {
+            check_config_allowed_for_early_vms(config)?;
+        }
+
         // Allocating VM context checks the MANAGE_VIRTUAL_MACHINE permission.
-        let (vm_context, cid, temporary_directory) = self.create_vm_context(requester_debug_pid)?;
+        let (vm_context, cid, temporary_directory) = if cfg!(early) {
+            self.create_early_vm_context(config)?
+        } else {
+            self.create_vm_context(requester_debug_pid)?
+        };
 
         if is_custom_config(config) {
             check_use_custom_virtual_machine()?;
@@ -1110,6 +1224,10 @@
 
 /// Checks whether the caller has a specific permission
 fn check_permission(perm: &str) -> binder::Result<()> {
+    if cfg!(early) {
+        // Skip permission check for early VMs, in favor of SELinux
+        return Ok(());
+    }
     let calling_pid = get_calling_pid();
     let calling_uid = get_calling_uid();
     // Root can do anything
@@ -1583,6 +1701,13 @@
     Ok(())
 }
 
+fn check_config_allowed_for_early_vms(config: &VirtualMachineConfig) -> binder::Result<()> {
+    check_no_vendor_modules(config)?;
+    check_no_devices(config)?;
+
+    Ok(())
+}
+
 fn clone_or_prepare_logger_fd(
     fd: Option<&ParcelFileDescriptor>,
     tag: String,
@@ -1796,6 +1921,74 @@
     }
 }
 
+// KEEP IN SYNC WITH early_vms.xsd
+#[derive(Debug, Deserialize, PartialEq)]
+struct EarlyVm {
+    #[allow(dead_code)]
+    name: String,
+    #[allow(dead_code)]
+    cid: i32,
+    #[allow(dead_code)]
+    path: String,
+}
+
+#[derive(Debug, Default, Deserialize)]
+struct EarlyVms {
+    #[allow(dead_code)]
+    early_vm: Vec<EarlyVm>,
+}
+
+fn range_for_partition(partition: &str) -> Result<Range<Cid>> {
+    match partition {
+        "system" => Ok(100..200),
+        "system_ext" | "product" => Ok(200..300),
+        _ => Err(anyhow!("Early VMs are not supported for {partition}")),
+    }
+}
+
+fn find_early_vm(xml_path: &Path, cid_range: &Range<Cid>, name: &str) -> Result<EarlyVm> {
+    if !xml_path.exists() {
+        bail!("{} doesn't exist", xml_path.display());
+    }
+
+    let xml =
+        fs::read(xml_path).with_context(|| format!("Failed to read {}", xml_path.display()))?;
+    let xml = String::from_utf8(xml)
+        .with_context(|| format!("{} is not a valid UTF-8 file", xml_path.display()))?;
+    let early_vms: EarlyVms = serde_xml_rs::from_str(&xml)
+        .with_context(|| format!("Can't parse {}", xml_path.display()))?;
+
+    let mut found_vm: Option<EarlyVm> = None;
+
+    for early_vm in early_vms.early_vm {
+        if early_vm.name != name {
+            continue;
+        }
+
+        let cid = early_vm
+            .cid
+            .try_into()
+            .with_context(|| format!("Invalid CID value {}", early_vm.cid))?;
+
+        if !cid_range.contains(&cid) {
+            bail!("VM '{}' uses CID {cid} which is out of range. Available CIDs for '{}': {cid_range:?}", xml_path.display(), early_vm.name);
+        }
+
+        if found_vm.is_some() {
+            bail!("Multiple VMs named {name} are found in {}", xml_path.display());
+        }
+
+        found_vm = Some(early_vm);
+    }
+
+    found_vm.ok_or_else(|| anyhow!("Can't find {name} in {}", xml_path.display()))
+}
+
+fn find_early_vm_for_partition(partition: &str, name: &str) -> Result<EarlyVm> {
+    let cid_range = range_for_partition(partition)?;
+    find_early_vm(Path::new(&format!("/{partition}/etc/avf/early_vms.xml")), &cid_range, name)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -2011,4 +2204,69 @@
         }
         Ok(())
     }
+
+    #[test]
+    fn test_find_early_vms_from_xml() -> Result<()> {
+        let tmp_dir = tempfile::TempDir::new()?;
+        let tmp_dir_path = tmp_dir.path().to_owned();
+        let xml_path = tmp_dir_path.join("early_vms.xml");
+
+        std::fs::write(
+            &xml_path,
+            br#"<?xml version="1.0" encoding="utf-8"?>
+        <early_vms>
+            <early_vm>
+                <name>vm_demo_native_early</name>
+                <cid>123</cid>
+                <path>/system/bin/vm_demo_native_early</path>
+            </early_vm>
+            <early_vm>
+                <name>vm_demo_duplicated_name</name>
+                <cid>456</cid>
+                <path>/system/bin/vm_demo_duplicated_name_1</path>
+            </early_vm>
+            <early_vm>
+                <name>vm_demo_duplicated_name</name>
+                <cid>789</cid>
+                <path>/system/bin/vm_demo_duplicated_name_2</path>
+            </early_vm>
+            <early_vm>
+                <name>vm_demo_invalid_cid_1</name>
+                <cid>-1</cid>
+                <path>/system/bin/vm_demo_invalid_cid_1</path>
+            </early_vm>
+            <early_vm>
+                <name>vm_demo_invalid_cid_2</name>
+                <cid>999999</cid>
+                <path>/system/bin/vm_demo_invalid_cid_2</path>
+            </early_vm>
+        </early_vms>
+        "#,
+        )?;
+
+        let cid_range = 100..1000;
+
+        let result = find_early_vm(&xml_path, &cid_range, "vm_demo_native_early")?;
+        let expected = EarlyVm {
+            name: "vm_demo_native_early".to_owned(),
+            cid: 123,
+            path: "/system/bin/vm_demo_native_early".to_owned(),
+        };
+        assert_eq!(result, expected);
+
+        assert!(
+            find_early_vm(&xml_path, &cid_range, "vm_demo_duplicated_name").is_err(),
+            "should fail"
+        );
+        assert!(
+            find_early_vm(&xml_path, &cid_range, "vm_demo_invalid_cid_1").is_err(),
+            "should fail"
+        );
+        assert!(
+            find_early_vm(&xml_path, &cid_range, "vm_demo_invalid_cid_2").is_err(),
+            "should fail"
+        );
+
+        Ok(())
+    }
 }
diff --git a/android/virtmgr/src/atom.rs b/android/virtmgr/src/atom.rs
index 1d2d191..45b020e 100644
--- a/android/virtmgr/src/atom.rs
+++ b/android/virtmgr/src/atom.rs
@@ -99,6 +99,10 @@
     is_protected: bool,
     ret: &binder::Result<Strong<dyn IVirtualMachine>>,
 ) {
+    if cfg!(early) {
+        info!("Writing VmCreationRequested atom for early VMs is not implemented; skipping");
+        return;
+    }
     let creation_succeeded;
     let binder_exception_code;
     match ret {
@@ -165,6 +169,11 @@
     vm_identifier: &str,
     vm_start_timestamp: Option<SystemTime>,
 ) {
+    if cfg!(early) {
+        info!("Writing VmCreationRequested atom for early VMs is not implemented; skipping");
+        return;
+    }
+
     let vm_identifier = vm_identifier.to_owned();
     let duration = get_duration(vm_start_timestamp);
 
@@ -190,6 +199,10 @@
     exit_signal: Option<i32>,
     vm_metric: &VmMetric,
 ) {
+    if cfg!(early) {
+        info!("Writing VmExited atom for early VMs is not implemented; skipping");
+        return;
+    }
     let vm_identifier = vm_identifier.to_owned();
     let elapsed_time_millis = get_duration(vm_metric.start_timestamp).as_millis() as i64;
     let guest_time_millis = vm_metric.cpu_guest_time.unwrap_or_default();
diff --git a/android/virtmgr/src/main.rs b/android/virtmgr/src/main.rs
index a4e75a7..e4a96ac 100644
--- a/android/virtmgr/src/main.rs
+++ b/android/virtmgr/src/main.rs
@@ -102,7 +102,17 @@
     ProcessState::start_thread_pool();
 
     if cfg!(early) {
-        panic!("Early VM not implemented");
+        // we can't access VirtualizationServiceInternal, so directly call rlimit
+        let pid = i32::from(*PID_CURRENT);
+        let lim = libc::rlimit { rlim_cur: libc::RLIM_INFINITY, rlim_max: libc::RLIM_INFINITY };
+
+        // SAFETY: borrowing the new limit struct only. prlimit doesn't use lim outside its lifetime
+        let ret = unsafe { libc::prlimit(pid, libc::RLIMIT_MEMLOCK, &lim, std::ptr::null_mut()) };
+        if ret == -1 {
+            panic!("rlimit error: {}", std::io::Error::last_os_error());
+        } else if ret != 0 {
+            panic!("Unexpected return value from prlimit(): {ret}");
+        }
     } else {
         GLOBAL_SERVICE.removeMemlockRlimit().expect("Failed to remove memlock rlimit");
     }
diff --git a/android/virtmgr/src/payload.rs b/android/virtmgr/src/payload.rs
index 82d9ba0..81e02b7 100644
--- a/android/virtmgr/src/payload.rs
+++ b/android/virtmgr/src/payload.rs
@@ -35,6 +35,7 @@
 use serde::Deserialize;
 use serde_xml_rs::from_reader;
 use std::collections::HashSet;
+use std::ffi::OsStr;
 use std::fs::{metadata, File, OpenOptions};
 use std::path::{Path, PathBuf};
 use std::process::Command;
@@ -94,11 +95,13 @@
             // For active APEXes, we run derive_classpath and parse its output to see if it
             // contributes to the classpath(s). (This allows us to handle any new classpath env
             // vars seamlessly.)
-            let classpath_vars = run_derive_classpath()?;
-            let classpath_apexes = find_apex_names_in_classpath(&classpath_vars)?;
+            if !cfg!(early) {
+                let classpath_vars = run_derive_classpath()?;
+                let classpath_apexes = find_apex_names_in_classpath(&classpath_vars)?;
 
-            for apex_info in apex_info_list.list.iter_mut() {
-                apex_info.has_classpath_jar = classpath_apexes.contains(&apex_info.name);
+                for apex_info in apex_info_list.list.iter_mut() {
+                    apex_info.has_classpath_jar = classpath_apexes.contains(&apex_info.name);
+                }
             }
 
             Ok(apex_info_list)
@@ -169,6 +172,9 @@
         let mut list = self.apex_info_list.clone();
         // When prefer_staged, we override ApexInfo by consulting "package_native"
         if prefer_staged {
+            if cfg!(early) {
+                return Err(anyhow!("Can't turn on prefer_staged on early boot VMs"));
+            }
             let pm =
                 wait_for_interface::<dyn IPackageManagerNative>(PACKAGE_MANAGER_NATIVE_SERVICE)
                     .context("Failed to get service when prefer_staged is set.")?;
@@ -293,7 +299,16 @@
     }];
 
     for (i, apex_info) in apex_infos.iter().enumerate() {
-        let apex_file = open_parcel_file(&apex_info.path, false)?;
+        let path = if cfg!(early) {
+            let path = &apex_info.preinstalled_path;
+            if path.extension().and_then(OsStr::to_str).unwrap_or("") != "apex" {
+                bail!("compressed APEX {} not supported", path.display());
+            }
+            path
+        } else {
+            &apex_info.path
+        };
+        let apex_file = open_parcel_file(path, false)?;
         partitions.push(Partition {
             label: format!("microdroid-apex-{}", i),
             image: Some(apex_file),
diff --git a/docs/early_vm.md b/docs/early_vm.md
new file mode 100644
index 0000000..44b71ff
--- /dev/null
+++ b/docs/early_vm.md
@@ -0,0 +1,52 @@
+# Early VM
+
+Early VMs are specialized virtual machines that can run even before the `/data`
+partition is mounted, unlike regular VMs. `early_virtmgr` is a binary that
+serves as the interface for early VMs, functioning similarly to `virtmgr`,
+which provides the [`IVirtualizationService`](../android/virtualizationservice/aidl/android/system/virtualizationservice/IVirtualizationService.aidl)
+aidl interface.
+
+To run an early VM, clients must follow these steps.
+
+1) Early VMs need to be defined in `{partition}/etc/avf/early_vms.xml`. The
+schema for this file is defined in [`early_vms.xsd`](../android/virtmgr/early_vms.xsd).
+
+```early_vms.xml
+<early_vms>
+    <early_vm>
+        <name>vm_demo_native_early</name>
+        <cid>123</cid>
+        <path>/system/bin/vm_demo_native_early</path>
+    </early_vm>
+</early_vms>
+```
+
+In this example, the binary `/system/bin/vm_demo_native_early` can establish a
+connection with `early_virtmgr` and create a VM named `vm_demo_native_early`,
+which will be assigned the static CID 123.
+
+2) The client must have the following three or four capabilities.
+
+* `IPC_LOCK`
+* `NET_BIND_SERVICE`
+* `SYS_NICE` (required if `RELEASE_AVF_ENABLE_VIRT_CPUFREQ` is enabled)
+* `SYS_RESOURCES`
+
+Typically, the client is defined as a service in an init script, where
+capabilities can also be specified.
+
+```vm_demo_native_early.rc
+service vm_demo_native_early /system/bin/vm_demo_native_early
+    user system
+    group system virtualmachine
+    capabilities IPC_LOCK NET_BIND_SERVICE SYS_RESOURCE SYS_NICE
+    oneshot
+    stdio_to_kmsg
+    class early_hal
+```
+
+3) The client forks `early_virtmgr` instead of `virtmgr`.
+
+The remaining steps are identical to those for regular VMs: connect to
+`early_virtmgr`, obtain the `IVirtualizationService` interface, then create and
+run the VM.