Don't start authfs until fd_server is ready

We pass a pipe fd to fd_server, which it closes when it is ready to
accept connections. pvm_exec waits for this before triggering
compilation, which causes authfs in the VM to connect back to
fd_server.
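
For reference, the handshake reduces to waiting for EOF on a pipe. A
minimal standalone sketch of the idea (names are illustrative, not the
exact code in this change):

  use std::fs::File;
  use std::io::Read;
  use std::os::unix::io::FromRawFd;

  fn wait_for_child_ready() -> anyhow::Result<()> {
      let (raw_read, raw_write) = nix::unistd::pipe2(nix::fcntl::OFlag::O_CLOEXEC)?;
      // SAFETY: We are the sole owners of these fds; they were just created.
      let mut reader = unsafe { File::from_raw_fd(raw_read) };
      let writer = unsafe { File::from_raw_fd(raw_write) };

      // ... spawn the child with writer's raw fd in its inheritable fd list ...

      // Drop our own copy of the write end, or the read below never sees EOF.
      drop(writer);

      // read() returns 0 bytes once the last write end is closed - whether
      // because the child closed it deliberately (ready) or because it exited.
      let mut buf = [0u8];
      let _ = reader.read(&mut buf)?;
      Ok(())
  }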

Also improved a bunch of error handling and error logging code, since I
managed to hit a lot of failure cases.

Fix: 201764092
Bug: 186126194
Test: atest ComposTestCase
Change-Id: I99ea5ba4fb390d47485aef3cbaaccedd046ae9fe
diff --git a/authfs/fd_server/src/main.rs b/authfs/fd_server/src/main.rs
index 12f013c..32ffc0a 100644
--- a/authfs/fd_server/src/main.rs
+++ b/authfs/fd_server/src/main.rs
@@ -32,6 +32,7 @@
 use std::convert::TryInto;
 use std::fs::File;
 use std::io;
+use std::os::raw;
 use std::os::unix::fs::FileExt;
 use std::os::unix::io::{AsRawFd, FromRawFd};
 
@@ -292,7 +293,12 @@
     Ok((fd, FdConfig::ReadWrite(file)))
 }
 
-fn parse_args() -> Result<BTreeMap<i32, FdConfig>> {
+struct Args {
+    fd_pool: BTreeMap<i32, FdConfig>,
+    ready_fd: Option<File>,
+}
+
+fn parse_args() -> Result<Args> {
     #[rustfmt::skip]
     let matches = clap::App::new("fd_server")
         .arg(clap::Arg::with_name("ro-fds")
@@ -303,6 +309,9 @@
              .long("rw-fds")
              .multiple(true)
              .number_of_values(1))
+        .arg(clap::Arg::with_name("ready-fd")
+            .long("ready-fd")
+            .takes_value(true))
         .get_matches();
 
     let mut fd_pool = BTreeMap::new();
@@ -318,8 +327,13 @@
             fd_pool.insert(fd, config);
         }
     }
-
-    Ok(fd_pool)
+    let ready_fd = if let Some(arg) = matches.value_of("ready-fd") {
+        let fd = arg.parse::<i32>()?;
+        Some(fd_to_file(fd)?)
+    } else {
+        None
+    };
+    Ok(Args { fd_pool, ready_fd })
 }
 
 fn main() -> Result<()> {
@@ -327,16 +341,22 @@
         android_logger::Config::default().with_tag("fd_server").with_min_level(log::Level::Debug),
     );
 
-    let fd_pool = parse_args()?;
+    let args = parse_args()?;
 
-    let mut service = FdService::new_binder(fd_pool).as_binder();
+    let mut service = FdService::new_binder(args.fd_pool).as_binder();
+    let mut ready_notifier = ReadyNotifier(args.ready_fd);
+
     debug!("fd_server is starting as a rpc service.");
     // SAFETY: Ownership of the service is being transferred to the server, so it won't be
     // valid to use afterward. Plus the binder objects are threadsafe.
+    // RunRpcServerCallback does not retain a reference to ready_callback, and only ever
+    // calls it with the param we provide during the lifetime of ready_notifier.
     let retval = unsafe {
-        binder_rpc_unstable_bindgen::RunRpcServer(
+        binder_rpc_unstable_bindgen::RunRpcServerCallback(
             service.as_native_mut() as *mut binder_rpc_unstable_bindgen::AIBinder,
             RPC_SERVICE_PORT,
+            Some(ReadyNotifier::ready_callback),
+            ready_notifier.as_void_ptr(),
         )
     };
     if retval {
@@ -346,3 +366,25 @@
         bail!("Premature termination of RPC server");
     }
 }
+
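+/// Closes the wrapped file (the write end of the ready pipe), if any, to let the process
+/// that spawned fd_server know that it is ready to accept connections.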
+struct ReadyNotifier(Option<File>);
+
+impl ReadyNotifier {
+    fn notify(&mut self) {
+        debug!("fd_server is ready");
+        // If we were given a ready-fd, close it to signal our readiness.
+        drop(self.0.take());
+    }
+
+    fn as_void_ptr(&mut self) -> *mut raw::c_void {
+        self as *mut _ as *mut raw::c_void
+    }
+
+    unsafe extern "C" fn ready_callback(param: *mut raw::c_void) {
+        // SAFETY: This is only ever called by RunRpcServerCallback, within the lifetime of the
+        // ReadyNotifier, with param taking the value returned by as_void_ptr (so a properly aligned
+        // non-null pointer to an initialized instance).
+        let ready_notifier = param as *mut Self;
+        ready_notifier.as_mut().unwrap().notify()
+    }
+}
diff --git a/authfs/service/src/authfs.rs b/authfs/service/src/authfs.rs
index 5601738..6d87243 100644
--- a/authfs/service/src/authfs.rs
+++ b/authfs/service/src/authfs.rs
@@ -82,8 +82,11 @@
             &config.outputFdAnnotations,
             debuggable,
         )?;
-        wait_until_authfs_ready(&mountpoint).map_err(|e| {
-            debug!("Wait for authfs: {:?}", child.wait());
+        wait_until_authfs_ready(&child, &mountpoint).map_err(|e| {
+            match child.wait() {
+                Ok(status) => debug!("Wait for authfs: {}", status),
+                Err(e) => warn!("Failed to wait for child: {}", e),
+            }
             e
         })?;
 
@@ -144,13 +147,18 @@
     SharedChild::spawn(&mut command).context("Spawn authfs")
 }
 
-fn wait_until_authfs_ready(mountpoint: &OsStr) -> Result<()> {
+fn wait_until_authfs_ready(child: &SharedChild, mountpoint: &OsStr) -> Result<()> {
     let start_time = Instant::now();
     loop {
         if is_fuse(mountpoint)? {
             break;
         }
+        if let Some(exit_status) = child.try_wait()? {
+            // If the child has exited, we will never become ready.
+            bail!("Child has exited: {}", exit_status);
+        }
         if start_time.elapsed() > AUTHFS_SETUP_TIMEOUT_SEC {
+            let _ = child.kill();
             bail!("Time out mounting authfs");
         }
         sleep(AUTHFS_SETUP_POLL_INTERVAL_MS);
diff --git a/authfs/service/src/main.rs b/authfs/service/src/main.rs
index af8c7f9..890e108 100644
--- a/authfs/service/src/main.rs
+++ b/authfs/service/src/main.rs
@@ -59,7 +59,7 @@
         create_dir(&mountpoint).map_err(|e| {
             new_binder_exception(
                 ExceptionCode::SERVICE_SPECIFIC,
-                format!("Cannot create mount directory {:?}: {}", &mountpoint, e),
+                format!("Cannot create mount directory {:?}: {:?}", &mountpoint, e),
             )
         })?;
 
@@ -109,7 +109,7 @@
     Ok(())
 }
 
-fn main() -> Result<()> {
+fn try_main() -> Result<()> {
     let debuggable = env!("TARGET_BUILD_VARIANT") != "user";
     let log_level = if debuggable { log::Level::Trace } else { log::Level::Info };
     android_logger::init_once(
@@ -128,3 +128,10 @@
     ProcessState::join_thread_pool();
     bail!("Unexpected exit after join_thread_pool")
 }
+
+fn main() {
+    if let Err(e) = try_main() {
+        error!("failed with {:?}", e);
+        std::process::exit(1);
+    }
+}
diff --git a/authfs/src/main.rs b/authfs/src/main.rs
index c85d801..ecb0e68 100644
--- a/authfs/src/main.rs
+++ b/authfs/src/main.rs
@@ -28,6 +28,7 @@
 //! e.g. /mountpoint/42.
 
 use anyhow::{bail, Context, Result};
+use log::error;
 use std::collections::BTreeMap;
 use std::convert::TryInto;
 use std::fs::File;
@@ -325,7 +326,7 @@
     Ok(file_pool)
 }
 
-fn main() -> Result<()> {
+fn try_main() -> Result<()> {
     let args = Args::from_args();
 
     let log_level = if args.debug { log::Level::Debug } else { log::Level::Info };
@@ -337,3 +338,10 @@
     fusefs::loop_forever(file_pool, &args.mount_point, &args.extra_options)?;
     bail!("Unexpected exit after the handler loop")
 }
+
+fn main() {
+    if let Err(e) = try_main() {
+        error!("failed with {:?}", e);
+        std::process::exit(1);
+    }
+}
diff --git a/compos/src/compsvc_main.rs b/compos/src/compsvc_main.rs
index 388e79b..d0c920a 100644
--- a/compos/src/compsvc_main.rs
+++ b/compos/src/compsvc_main.rs
@@ -43,7 +43,14 @@
 /// The CID representing the host VM
 const VMADDR_CID_HOST: u32 = 2;
 
-fn main() -> Result<()> {
+fn main() {
+    if let Err(e) = try_main() {
+        error!("failed with {:?}", e);
+        std::process::exit(1);
+    }
+}
+
+fn try_main() -> Result<()> {
     let args = clap::App::new("compsvc")
         .arg(clap::Arg::with_name("log_to_stderr").long("log_to_stderr"))
         .get_matches();
diff --git a/compos/src/pvm_exec.rs b/compos/src/pvm_exec.rs
index fdd9c57..8ce27db 100644
--- a/compos/src/pvm_exec.rs
+++ b/compos/src/pvm_exec.rs
@@ -32,8 +32,11 @@
 use clap::{value_t, App, Arg};
 use log::{debug, error, warn};
 use minijail::Minijail;
-use nix::fcntl::{fcntl, FcntlArg::F_GETFD};
-use std::os::unix::io::RawFd;
+use nix::fcntl::{fcntl, FcntlArg::F_GETFD, OFlag};
+use nix::unistd::pipe2;
+use std::fs::File;
+use std::io::Read;
+use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
 use std::path::Path;
 use std::process::exit;
 
@@ -69,7 +72,11 @@
     }
 }
 
-fn spawn_fd_server(fd_annotation: &FdAnnotation, debuggable: bool) -> Result<Minijail> {
+fn spawn_fd_server(
+    fd_annotation: &FdAnnotation,
+    ready_file: File,
+    debuggable: bool,
+) -> Result<Minijail> {
     let mut inheritable_fds = if debuggable {
         vec![1, 2] // inherit/redirect stdout/stderr for debugging
     } else {
@@ -87,6 +94,10 @@
         args.push(fd.to_string());
         inheritable_fds.push(*fd);
     }
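+    // Pass the write end of the ready pipe to fd_server; it closes this fd once it is
+    // accepting connections, which unblocks wait_for_fd_server_ready() in the parent.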
+    let ready_fd = ready_file.as_raw_fd();
+    args.push("--ready-fd".to_string());
+    args.push(ready_fd.to_string());
+    inheritable_fds.push(ready_fd);
 
     let jail = Minijail::new()?;
     let _pid = jail.run(Path::new(FD_SERVER_BIN), &inheritable_fds, &args)?;
@@ -153,12 +164,31 @@
     Ok(Config { args, fd_annotation: FdAnnotation { input_fds, output_fds }, cid, debuggable })
 }
 
+fn create_pipe() -> Result<(File, File)> {
+    let (raw_read, raw_write) = pipe2(OFlag::O_CLOEXEC)?;
+    // SAFETY: We are the sole owners of these fds as they were just created.
+    let read_fd = unsafe { File::from_raw_fd(raw_read) };
+    let write_fd = unsafe { File::from_raw_fd(raw_write) };
+    Ok((read_fd, write_fd))
+}
+
+fn wait_for_fd_server_ready(mut ready_fd: File) -> Result<()> {
+    let mut buffer = [0];
+    // When fd_server is ready, it closes its end of the pipe. If it exits instead, the pipe is
+    // also closed. Either way this read returns 0 bytes at that point, and there's no point in
+    // waiting any longer.
+    let _ = ready_fd.read(&mut buffer).context("Waiting for fd_server to be ready")?;
+    debug!("fd_server is ready");
+    Ok(())
+}
+
 fn try_main() -> Result<()> {
     // 1. Parse the command line arguments to collect execution data.
     let Config { args, fd_annotation, cid, debuggable } = parse_args()?;
 
     // 2. Spawn and configure a fd_server to serve remote read/write requests.
-    let fd_server_jail = spawn_fd_server(&fd_annotation, debuggable)?;
+    let (ready_read_fd, ready_write_fd) = create_pipe()?;
+    let fd_server_jail = spawn_fd_server(&fd_annotation, ready_write_fd, debuggable)?;
     let fd_server_lifetime = scopeguard::guard(fd_server_jail, |fd_server_jail| {
         if let Err(e) = fd_server_jail.kill() {
             if !matches!(e, minijail::Error::Killed(_)) {
@@ -171,10 +201,12 @@
     let result = if cid == VMADDR_CID_ANY {
         // Sentinel value that indicates we should use composd
         let composd = get_composd()?;
+        wait_for_fd_server_ready(ready_read_fd)?;
         composd.compile(&args, &fd_annotation)
     } else {
         // Call directly into the VM
         let compos_vm = get_rpc_binder(cid)?;
+        wait_for_fd_server_ready(ready_read_fd)?;
         compos_vm.compile(&args, &fd_annotation)
     };
     let result = result.context("Binder call failed")?;
@@ -190,7 +222,7 @@
     // Be explicit about the lifetime, which should last at least until the task is finished.
     drop(fd_server_lifetime);
 
-    if result.exitCode > 0 {
+    if result.exitCode != 0 {
         error!("remote execution failed with exit code {}", result.exitCode);
         exit(result.exitCode as i32);
     }