Collect cpu/mem stats via VmExited atom Bug: 257159905 Test: atest MicrodroidHostTests#testTelemetryPushedAtoms Change-Id: I156bdc0f30e89ce28c75f8d360959d2660957c01

commit: 6d265d9b94d0b51494cfa8b15ab820a6cc256824 [log] [tgz]
author: Seungjae Yoo <seungjaeyoo@google.com> Tue Nov 15 10:51:33 2022 +0900
committer: Seungjae Yoo <seungjaeyoo@google.com> Thu Nov 17 10:22:52 2022 +0900
tree: 7ab763169cea5d927358d50f93e14d5817140aca
parent: 9a537e00abf705e273b7b1059e8347a900ef4cb7 [diff]
diff --git a/virtualizationservice/src/aidl.rs b/virtualizationservice/src/aidl.rs
index bc697e3..00a46bf 100644
--- a/virtualizationservice/src/aidl.rs
+++ b/virtualizationservice/src/aidl.rs

@@ -1075,8 +1075,8 @@
             let stream = vm.stream.lock().unwrap().take();
             vm.callbacks.notify_payload_started(cid, stream);
 
-            let vm_start_timestamp = vm.vm_start_timestamp.lock().unwrap();
-            write_vm_booted_stats(vm.requester_uid as i32, &vm.name, *vm_start_timestamp);
+            let vm_start_timestamp = vm.vm_metric.lock().unwrap().start_timestamp;
+            write_vm_booted_stats(vm.requester_uid as i32, &vm.name, vm_start_timestamp);
             Ok(())
         } else {
             error!("notifyPayloadStarted is called from an unknown CID {}", cid);

diff --git a/virtualizationservice/src/atom.rs b/virtualizationservice/src/atom.rs
index 20f88e7..9c74d1e 100644
--- a/virtualizationservice/src/atom.rs
+++ b/virtualizationservice/src/atom.rs

@@ -15,6 +15,7 @@
 //! Functions for creating and collecting atoms.
 
 use crate::aidl::clone_file;
+use crate::crosvm::VmMetric;
 use android_system_virtualizationservice::aidl::android::system::virtualizationservice::{
     DeathReason::DeathReason,
     IVirtualMachine::IVirtualMachine,
@@ -164,15 +165,18 @@
     uid: i32,
     vm_identifier: &str,
     reason: DeathReason,
-    vm_start_timestamp: Option<SystemTime>,
+    vm_metric: &VmMetric,
 ) {
     let vm_identifier = vm_identifier.to_owned();
-    let duration = get_duration(vm_start_timestamp);
+    let elapsed_time_millis = get_duration(vm_metric.start_timestamp).as_millis() as i64;
+    let guest_time_millis = vm_metric.cpu_guest_time.unwrap_or_default();
+    let rss = vm_metric.rss.unwrap_or_default();
+
     thread::spawn(move || {
         let vm_exited = vm_exited::VmExited {
             uid,
             vm_identifier: &vm_identifier,
-            elapsed_time_millis: duration.as_millis() as i64,
+            elapsed_time_millis,
             death_reason: match reason {
                 DeathReason::INFRASTRUCTURE_ERROR => vm_exited::DeathReason::InfrastructureError,
                 DeathReason::KILLED => vm_exited::DeathReason::Killed,
@@ -211,6 +215,9 @@
                 DeathReason::HANGUP => vm_exited::DeathReason::Hangup,
                 _ => vm_exited::DeathReason::Unknown,
             },
+            guest_time_millis,
+            rss_vm_kb: rss.vm,
+            rss_crosvm_kb: rss.crosvm,
         };
         wait_for_statsd().unwrap_or_else(|e| warn!("failed to wait for statsd with error: {}", e));
         match vm_exited.stats_write() {

diff --git a/virtualizationservice/src/crosvm.rs b/virtualizationservice/src/crosvm.rs
index 1b8061e..2ada1ec 100644
--- a/virtualizationservice/src/crosvm.rs
+++ b/virtualizationservice/src/crosvm.rs

@@ -16,16 +16,18 @@
 
 use crate::aidl::{Cid, VirtualMachineCallbacks};
 use crate::atom::write_vm_exited_stats;
-use anyhow::{anyhow, bail, Context, Error};
+use anyhow::{anyhow, bail, Context, Error, Result};
 use command_fds::CommandFdExt;
 use lazy_static::lazy_static;
+use libc::{sysconf, _SC_CLK_TCK};
 use log::{debug, error, info};
 use semver::{Version, VersionReq};
 use nix::{fcntl::OFlag, unistd::pipe2};
 use regex::{Captures, Regex};
 use shared_child::SharedChild;
 use std::borrow::Cow;
-use std::fs::{remove_dir_all, File};
+use std::cmp::max;
+use std::fs::{read_to_string, remove_dir_all, File};
 use std::io::{self, Read};
 use std::mem;
 use std::num::NonZeroU32;
@@ -61,6 +63,8 @@
 /// The exit status which crosvm returns when vcpu is stalled.
 const CROSVM_WATCHDOG_REBOOT_STATUS: i32 = 36;
 
+const MILLIS_PER_SEC: i64 = 1000;
+
 lazy_static! {
     /// If the VM doesn't move to the Started state within this amount time, a hang-up error is
     /// triggered.
@@ -133,6 +137,24 @@
     Failed,
 }
 
+/// RSS values of VM and CrosVM process itself.
+#[derive(Copy, Clone, Debug, Default)]
+pub struct Rss {
+    pub vm: i64,
+    pub crosvm: i64,
+}
+
+/// Metrics regarding the VM.
+#[derive(Debug, Default)]
+pub struct VmMetric {
+    /// Recorded timestamp when the VM is started.
+    pub start_timestamp: Option<SystemTime>,
+    /// Update most recent guest_time periodically from /proc/[crosvm pid]/stat while VM is running.
+    pub cpu_guest_time: Option<i64>,
+    /// Update maximum RSS values periodically from /proc/[crosvm pid]/smaps while VM is running.
+    pub rss: Option<Rss>,
+}
+
 impl VmState {
     /// Tries to start the VM, if it is in the `NotStarted` state.
     ///
@@ -147,6 +169,12 @@
             let child =
                 Arc::new(run_vm(config, &instance.temporary_directory, failure_pipe_write)?);
 
+            let instance_monitor_status = instance.clone();
+            let child_monitor_status = child.clone();
+            thread::spawn(move || {
+                instance_monitor_status.clone().monitor_vm_status(child_monitor_status);
+            });
+
             let child_clone = child.clone();
             let instance_clone = instance.clone();
             thread::spawn(move || {
@@ -194,8 +222,8 @@
     pub stream: Mutex<Option<VsockStream>>,
     /// VirtualMachineService binder object for the VM.
     pub vm_service: Mutex<Option<Strong<dyn IVirtualMachineService>>>,
-    /// Recorded timestamp when the VM is started.
-    pub vm_start_timestamp: Mutex<Option<SystemTime>>,
+    /// Recorded metrics of VM such as timestamp or cpu / memory usage.
+    pub vm_metric: Mutex<VmMetric>,
     /// The latest lifecycle state which the payload reported itself to be in.
     payload_state: Mutex<PayloadState>,
     /// Represents the condition that payload_state was updated
@@ -225,7 +253,7 @@
             callbacks: Default::default(),
             stream: Mutex::new(None),
             vm_service: Mutex::new(None),
-            vm_start_timestamp: Mutex::new(None),
+            vm_metric: Mutex::new(Default::default()),
             payload_state: Mutex::new(PayloadState::Starting),
             payload_state_updated: Condvar::new(),
         })
@@ -234,7 +262,8 @@
     /// Starts an instance of `crosvm` to manage the VM. The `crosvm` instance will be killed when
     /// the `VmInstance` is dropped.
     pub fn start(self: &Arc<Self>) -> Result<(), Error> {
-        *self.vm_start_timestamp.lock().unwrap() = Some(SystemTime::now());
+        let mut vm_metric = self.vm_metric.lock().unwrap();
+        vm_metric.start_timestamp = Some(SystemTime::now());
         self.vm_state.lock().unwrap().start(self.clone())
     }
 
@@ -283,13 +312,8 @@
         let death_reason = death_reason(&result, &failure_reason);
         self.callbacks.callback_on_died(self.cid, death_reason);
 
-        let vm_start_timestamp = self.vm_start_timestamp.lock().unwrap();
-        write_vm_exited_stats(
-            self.requester_uid as i32,
-            &self.name,
-            death_reason,
-            *vm_start_timestamp,
-        );
+        let vm_metric = self.vm_metric.lock().unwrap();
+        write_vm_exited_stats(self.requester_uid as i32, &self.name, death_reason, &*vm_metric);
 
         // Delete temporary files.
         if let Err(e) = remove_dir_all(&self.temporary_directory) {
@@ -321,6 +345,42 @@
         }
     }
 
+    fn monitor_vm_status(&self, child: Arc<SharedChild>) {
+        let pid = child.id();
+
+        loop {
+            {
+                // Check VM state
+                let vm_state = &*self.vm_state.lock().unwrap();
+                if let VmState::Dead = vm_state {
+                    break;
+                }
+
+                let mut vm_metric = self.vm_metric.lock().unwrap();
+
+                // Get CPU Information
+                // TODO: Collect it once right before VM dies using SIGCHLD
+                if let Ok(guest_time) = get_guest_time(pid) {
+                    vm_metric.cpu_guest_time = Some(guest_time);
+                } else {
+                    error!("Failed to parse /proc/[pid]/stat");
+                }
+
+                // Get Memory Information
+                if let Ok(rss) = get_rss(pid) {
+                    vm_metric.rss = match &vm_metric.rss {
+                        Some(x) => Some(Rss::extract_max(x, &rss)),
+                        None => Some(rss),
+                    }
+                } else {
+                    error!("Failed to parse /proc/[pid]/smaps");
+                }
+            }
+
+            thread::sleep(Duration::from_secs(1));
+        }
+    }
+
     /// Returns the last reported state of the VM payload.
     pub fn payload_state(&self) -> PayloadState {
         *self.payload_state.lock().unwrap()
@@ -391,6 +451,60 @@
     }
 }
 
+impl Rss {
+    fn extract_max(x: &Rss, y: &Rss) -> Rss {
+        Rss { vm: max(x.vm, y.vm), crosvm: max(x.crosvm, y.crosvm) }
+    }
+}
+
+// Get guest time from /proc/[crosvm pid]/stat
+fn get_guest_time(pid: u32) -> Result<i64> {
+    let file = read_to_string(format!("/proc/{}/stat", pid))?;
+    let data_list: Vec<_> = file.split_whitespace().collect();
+
+    // Information about guest_time is at 43th place of the file split with the whitespace.
+    // Example of /proc/[pid]/stat :
+    // 6603 (kworker/104:1H-kblockd) I 2 0 0 0 -1 69238880 0 0 0 0 0 88 0 0 0 -20 1 0 1845 0 0
+    // 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 104 0 0 0 0 0 0 0 0 0 0 0 0 0
+    if data_list.len() < 43 {
+        bail!("Failed to parse command result for getting guest time : {}", file);
+    }
+
+    let guest_time_ticks = data_list[42].parse::<i64>()?;
+    // SAFETY : It just returns an integer about CPU tick information.
+    let ticks_per_sec = unsafe { sysconf(_SC_CLK_TCK) } as i64;
+    Ok(guest_time_ticks * MILLIS_PER_SEC / ticks_per_sec)
+}
+
+// Get rss from /proc/[crosvm pid]/smaps
+fn get_rss(pid: u32) -> Result<Rss> {
+    let file = read_to_string(format!("/proc/{}/smaps", pid))?;
+    let lines: Vec<_> = file.split('\n').collect();
+
+    let mut rss_vm_total = 0i64;
+    let mut rss_crosvm_total = 0i64;
+    let mut is_vm = false;
+    for line in lines {
+        if line.contains("crosvm_guest") {
+            is_vm = true;
+        } else if line.contains("Rss:") {
+            let data_list: Vec<_> = line.split_whitespace().collect();
+            if data_list.len() < 2 {
+                bail!("Failed to parse command result for getting rss :\n{}", line);
+            }
+            let rss = data_list[1].parse::<i64>()?;
+
+            if is_vm {
+                rss_vm_total += rss;
+                is_vm = false;
+            }
+            rss_crosvm_total += rss;
+        }
+    }
+
+    Ok(Rss { vm: rss_vm_total, crosvm: rss_crosvm_total })
+}
+
 fn death_reason(result: &Result<ExitStatus, io::Error>, mut failure_reason: &str) -> DeathReason {
     if let Some(position) = failure_reason.find('|') {
         // Separator indicates extra context information is present after the failure name.
commit	6d265d9b94d0b51494cfa8b15ab820a6cc256824	[log] [tgz]
author	Seungjae Yoo <seungjaeyoo@google.com>	Tue Nov 15 10:51:33 2022 +0900
committer	Seungjae Yoo <seungjaeyoo@google.com>	Thu Nov 17 10:22:52 2022 +0900
tree	7ab763169cea5d927358d50f93e14d5817140aca
parent	9a537e00abf705e273b7b1059e8347a900ef4cb7 [diff]