Add a timeout for all installd operations.

Bug: 216514270
Test: atest installd_utils_test
Test: atest installd_dexopt_test
Ignore-AOSP-First: Merge conflicts.
Change-Id: I6c25e1df756ca486de1c8611185dbedf60d7cd7d
diff --git a/cmds/installd/dexopt.cpp b/cmds/installd/dexopt.cpp
index c796da6..9647865 100644
--- a/cmds/installd/dexopt.cpp
+++ b/cmds/installd/dexopt.cpp
@@ -74,10 +74,19 @@
 using android::base::ReadFully;
 using android::base::StringPrintf;
 using android::base::WriteFully;
+using android::base::borrowed_fd;
 using android::base::unique_fd;
 
 namespace {
 
+// Timeout for short operations, such as merging profiles.
+constexpr int kShortTimeoutMs = 60000; // 1 minute.
+
+// Timeout for long operations, such as compilation. This should be smaller than the Package Manager
+// watchdog (PackageManagerService.WATCHDOG_TIMEOUT, 10 minutes), so that the operation will be
+// aborted before that watchdog would take down the system server.
+constexpr int kLongTimeoutMs = 570000; // 9.5 minutes.
+
 class DexOptStatus {
  public:
     // Check if dexopt is cancelled and fork if it is not cancelled.
@@ -486,6 +495,25 @@
     }
 }
 
+// Cleans up an output file specified by a file descriptor. This function should be called whenever
+// a subprocess that modifies a system-managed file crashes.
+// If the subprocess crashes while it's writing to the file, the file is likely corrupted, so we
+// should remove it.
+// If the subprocess times out and is killed while it's acquiring a flock on the file, there is
+// probably a deadlock, so it's also good to remove the file so that later operations won't
+// encounter the same problem. It's safe to do so because the process that is holding the flock will
+// still have access to the file until the file descriptor is closed.
+// Note that we can't do `clear_reference_profile` here even if the fd points to a reference profile
+// because that also requires a flock and is therefore likely to be stuck in the second case.
+static bool cleanup_output_fd(int fd) {
+    std::string path;
+    bool ret = remove_file_at_fd(fd, &path);
+    if (ret) {
+        LOG(INFO) << "Removed file at path " << path;
+    }
+    return ret;
+}
+
 static constexpr int PROFMAN_BIN_RETURN_CODE_SUCCESS = 0;
 static constexpr int PROFMAN_BIN_RETURN_CODE_COMPILE = 1;
 static constexpr int PROFMAN_BIN_RETURN_CODE_SKIP_COMPILATION_NOT_ENOUGH_DELTA = 2;
@@ -497,13 +525,14 @@
 
 class RunProfman : public ExecVHelper {
   public:
-   void SetupArgs(const std::vector<unique_fd>& profile_fds,
-                  const unique_fd& reference_profile_fd,
-                  const std::vector<unique_fd>& apk_fds,
-                  const std::vector<std::string>& dex_locations,
-                  bool copy_and_update,
-                  bool for_snapshot,
-                  bool for_boot_image) {
+    template <typename T, typename U>
+    void SetupArgs(const std::vector<T>& profile_fds,
+                   const unique_fd& reference_profile_fd,
+                   const std::vector<U>& apk_fds,
+                   const std::vector<std::string>& dex_locations,
+                   bool copy_and_update,
+                   bool for_snapshot,
+                   bool for_boot_image) {
 
         // TODO(calin): Assume for now we run in the bg compile job (which is in
         // most of the invocation). With the current data flow, is not very easy or
@@ -519,11 +548,11 @@
             AddArg("--reference-profile-file-fd=" + std::to_string(reference_profile_fd.get()));
         }
 
-        for (const unique_fd& fd : profile_fds) {
+        for (const T& fd : profile_fds) {
             AddArg("--profile-file-fd=" + std::to_string(fd.get()));
         }
 
-        for (const unique_fd& fd : apk_fds) {
+        for (const U& fd : apk_fds) {
             AddArg("--apk-fd=" + std::to_string(fd.get()));
         }
 
@@ -582,20 +611,14 @@
                   for_boot_image);
     }
 
-    void SetupCopyAndUpdate(unique_fd&& profile_fd,
-                            unique_fd&& reference_profile_fd,
-                            unique_fd&& apk_fd,
+    void SetupCopyAndUpdate(const unique_fd& profile_fd,
+                            const unique_fd& reference_profile_fd,
+                            const unique_fd& apk_fd,
                             const std::string& dex_location) {
-        // The fds need to stay open longer than the scope of the function, so put them into a local
-        // variable vector.
-        profiles_fd_.push_back(std::move(profile_fd));
-        apk_fds_.push_back(std::move(apk_fd));
-        reference_profile_fd_ = std::move(reference_profile_fd);
-        std::vector<std::string> dex_locations = {dex_location};
-        SetupArgs(profiles_fd_,
-                  reference_profile_fd_,
-                  apk_fds_,
-                  dex_locations,
+        SetupArgs(std::vector<borrowed_fd>{profile_fd},
+                  reference_profile_fd,
+                  std::vector<borrowed_fd>{apk_fd},
+                  {dex_location},
                   /*copy_and_update=*/true,
                   /*for_snapshot*/false,
                   /*for_boot_image*/false);
@@ -621,11 +644,6 @@
     void Exec() {
         ExecVHelper::Exec(DexoptReturnCodes::kProfmanExec);
     }
-
-  private:
-    unique_fd reference_profile_fd_;
-    std::vector<unique_fd> profiles_fd_;
-    std::vector<unique_fd> apk_fds_;
 };
 
 static int analyze_profiles(uid_t uid, const std::string& package_name,
@@ -657,13 +675,14 @@
         profman_merge.Exec();
     }
     /* parent */
-    int return_code = wait_child(pid);
+    int return_code = wait_child_with_timeout(pid, kShortTimeoutMs);
     bool need_to_compile = false;
     bool empty_profiles = false;
     bool should_clear_current_profiles = false;
     bool should_clear_reference_profile = false;
     if (!WIFEXITED(return_code)) {
         LOG(WARNING) << "profman failed for location " << location << ": " << return_code;
+        cleanup_output_fd(reference_profile_fd.get());
     } else {
         return_code = WEXITSTATUS(return_code);
         switch (return_code) {
@@ -797,10 +816,10 @@
         profman_dump.Exec();
     }
     /* parent */
-    int return_code = wait_child(pid);
+    int return_code = wait_child_with_timeout(pid, kShortTimeoutMs);
     if (!WIFEXITED(return_code)) {
-        LOG(WARNING) << "profman failed for package " << pkgname << ": "
-                << return_code;
+        LOG(WARNING) << "profman failed for package " << pkgname << ": " << return_code;
+        cleanup_output_fd(output_fd.get());
         return false;
     }
     return true;
@@ -871,7 +890,11 @@
         _exit(0);
     }
     /* parent */
-    int return_code = wait_child(pid);
+    int return_code = wait_child_with_timeout(pid, kShortTimeoutMs);
+    if (!WIFEXITED(return_code)) {
+        cleanup_output_fd(out_fd.get());
+        return false;
+    }
     return return_code == 0;
 }
 
@@ -1521,7 +1544,7 @@
     }
     pipe_read.reset();
 
-    int return_code = wait_child(pid);
+    int return_code = wait_child_with_timeout(pid, kShortTimeoutMs);
     if (!WIFEXITED(return_code)) {
         PLOG(ERROR) << "Error waiting for child dexoptanalyzer process";
         return false;
@@ -1695,7 +1718,7 @@
     }
 
     /* parent */
-    int result = wait_child(pid);
+    int result = wait_child_with_timeout(pid, kShortTimeoutMs);
     cancelled = dexopt_status_->check_if_killed_and_remove_dexopt_pid(pid);
     if (!WIFEXITED(result)) {
         if ((WTERMSIG(result) == SIGKILL) && cancelled) {
@@ -1954,7 +1977,7 @@
 
         runner.Exec(DexoptReturnCodes::kDex2oatExec);
     } else {
-        int res = wait_child(pid);
+        int res = wait_child_with_timeout(pid, kLongTimeoutMs);
         bool cancelled = dexopt_status_->check_if_killed_and_remove_dexopt_pid(pid);
         if (res == 0) {
             LOG(VERBOSE) << "DexInv: --- END '" << dex_path << "' (success) ---";
@@ -2143,7 +2166,7 @@
         _exit(result ? kReconcileSecondaryDexCleanedUp : kReconcileSecondaryDexAccessIOError);
     }
 
-    int return_code = wait_child(pid);
+    int return_code = wait_child_with_timeout(pid, kShortTimeoutMs);
     if (!WIFEXITED(return_code)) {
         LOG(WARNING) << "reconcile dex failed for location " << dex_path << ": " << return_code;
     } else {
@@ -2261,7 +2284,7 @@
     if (!ReadFully(pipe_read, out_secondary_dex_hash->data(), out_secondary_dex_hash->size())) {
         out_secondary_dex_hash->clear();
     }
-    return wait_child(pid) == 0;
+    return wait_child_with_timeout(pid, kShortTimeoutMs) == 0;
 }
 
 // Helper for move_ab, so that we can have common failure-case cleanup.
@@ -2591,9 +2614,10 @@
     }
 
     /* parent */
-    int return_code = wait_child(pid);
+    int return_code = wait_child_with_timeout(pid, kShortTimeoutMs);
     if (!WIFEXITED(return_code)) {
         LOG(WARNING) << "profman failed for " << package_name << ":" << profile_name;
+        cleanup_output_fd(snapshot_fd.get());
         return false;
     }
 
@@ -2700,10 +2724,11 @@
         }
 
         /* parent */
-        int return_code = wait_child(pid);
+        int return_code = wait_child_with_timeout(pid, kShortTimeoutMs);
 
         if (!WIFEXITED(return_code)) {
             PLOG(WARNING) << "profman failed for " << package_name << ":" << profile_name;
+            cleanup_output_fd(snapshot_fd.get());
             return false;
         }
 
@@ -2774,9 +2799,9 @@
     }
 
     RunProfman args;
-    args.SetupCopyAndUpdate(std::move(dex_metadata_fd),
-                            std::move(ref_profile_fd),
-                            std::move(apk_fd),
+    args.SetupCopyAndUpdate(dex_metadata_fd,
+                            ref_profile_fd,
+                            apk_fd,
                             code_path);
     pid_t pid = fork();
     if (pid == 0) {
@@ -2789,9 +2814,10 @@
     }
 
     /* parent */
-    int return_code = wait_child(pid);
+    int return_code = wait_child_with_timeout(pid, kShortTimeoutMs);
     if (!WIFEXITED(return_code)) {
         PLOG(WARNING) << "profman failed for " << package_name << ":" << profile_name;
+        cleanup_output_fd(ref_profile_fd.get());
         return false;
     }
     return true;