Retry mechanism for VM secrets

Secretkeeper, from which a VM retrieves its secret, keeps a bounded
number of live session keys. This means that Secretkeeper (Sk) can fail
requests when multiple VMs attempt to access it concurrently. On seeing
such a failed request, the VM should back off for a random time (between
100 and 200 ms, which is the upper bound on the Sk connection + access
latency) and then retry with a refreshed session.

Test: Builds
Bug: 399304956
Change-Id: I6a8cb9f269461d13973cd024c5e59374f5cc19be
diff --git a/guest/microdroid_manager/src/vm_secret.rs b/guest/microdroid_manager/src/vm_secret.rs
index f031859..674f010 100644
--- a/guest/microdroid_manager/src/vm_secret.rs
+++ b/guest/microdroid_manager/src/vm_secret.rs
@@ -35,6 +35,9 @@
     StoreSecretRequest, GetSecretResponse, GetSecretRequest};
 use secretkeeper_comm::data_types::error::SecretkeeperError;
 use std::fs;
+use std::thread;
+use rand::Rng;
+use std::time::Duration;
 use zeroize::Zeroizing;
 use std::sync::Mutex;
 use std::sync::Arc;
@@ -63,6 +66,8 @@
     0x55, 0xF8, 0x08, 0x23, 0x81, 0x5F, 0xF5, 0x16, 0x20, 0x3E, 0xBE, 0xBA, 0xB7, 0xA8, 0x43, 0x92,
 ];
 
+const BACKOFF_SK_ACCESS_MS: u64 = 100;
+
 pub enum VmSecret {
     // V2 secrets are derived from 2 independently secured secrets:
     //      1. Secretkeeper protected secrets (skp secret).
@@ -118,15 +123,19 @@
             .map_err(|e| anyhow!("Failed to build a sealing_policy: {e}"))?;
         let session = SkVmSession::new(vm_service, &explicit_dice, policy)?;
         let mut skp_secret = Zeroizing::new([0u8; SECRET_SIZE]);
-        if let Some(secret) = session.get_secret(id)? {
-            *skp_secret = secret;
-            *state = VmInstanceState::PreviouslySeen;
-        } else {
-            log::warn!("No entry found in Secretkeeper for this VM instance, creating new secret.");
-            *skp_secret = rand::random();
-            session.store_secret(id, skp_secret.clone())?;
-            *state = VmInstanceState::NewlyCreated;
-        }
+        get_or_create_sk_secret(&session, id, &mut skp_secret, state).or_else(|e| {
+            // TODO(b/399304956): Secretkeeper rejects requests when overloaded with
+            // connections from multiple clients. Backoff & retry again, hoping it is
+            // less busy then. Secretkeeper changes are required for more robust solutions.
+            log::info!(
+                "get_or_create_sk_secret failed with {e:?}. Refreshing connection & retrying!"
+            );
+            let mut rng = rand::thread_rng();
+            let backoff = rng.gen_range(BACKOFF_SK_ACCESS_MS..2 * BACKOFF_SK_ACCESS_MS);
+            thread::sleep(Duration::from_millis(backoff));
+            session.refresh()?;
+            get_or_create_sk_secret(&session, id, &mut skp_secret, state)
+        })?;
         Ok(Self::V2 {
             instance_id: id,
             dice_artifacts: explicit_dice,
@@ -283,8 +292,6 @@
     sealing_policy: Vec<u8>,
 }
 
-// TODO(b/378911776): This get_secret/store_secret fails on expired session.
-// Introduce retry after refreshing the session
 impl SkVmSession {
     fn new(
         vm_service: &Strong<dyn IVirtualMachineService>,
@@ -366,3 +373,21 @@
             ))
         })?)
 }
+
+fn get_or_create_sk_secret(
+    session: &SkVmSession, // Session used for both the lookup and the store.
+    id: [u8; ID_SIZE], // Identifier passed to `get_secret` / `store_secret`.
+    skp_secret: &mut Zeroizing<[u8; SECRET_SIZE]>, // Out: receives the fetched or new secret.
+    state: &mut VmInstanceState, // Out: set to `PreviouslySeen` or `NewlyCreated` on success.
+) -> Result<()> {
+    if let Some(secret) = session.get_secret(id)? { // Lookup errors propagate to the caller.
+        **skp_secret = secret; // An entry already exists: reuse it.
+        *state = VmInstanceState::PreviouslySeen;
+    } else {
+        log::warn!("No entry found in Secretkeeper for this VM instance, creating new secret.");
+        **skp_secret = rand::random(); // No entry: generate a random secret.
+        session.store_secret(id, skp_secret.clone())?; // If this fails, `state` stays untouched.
+        *state = VmInstanceState::NewlyCreated; // Only reached once the store has succeeded.
+    }
+    Ok(())
+}