Merge "init: Add more diagnostics for signalfd hangs." am: fe62ca7165 am: 885dcf9661 am: 4a79453de3

Original change: https://android-review.googlesource.com/c/platform/system/core/+/2027088

Change-Id: I71ec165ab5f9ed16dfc83a73006ad88df5c0d5a7
diff --git a/init/init.cpp b/init/init.cpp
index eca7bc5..5a0b3a6 100644
--- a/init/init.cpp
+++ b/init/init.cpp
@@ -578,12 +578,29 @@
     HandlePowerctlMessage("shutdown,container");
 }
 
+static constexpr std::chrono::milliseconds kDiagnosticTimeout = 10s;
+
 static void HandleSignalFd() {
     signalfd_siginfo siginfo;
-    ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
-    if (bytes_read != sizeof(siginfo)) {
-        PLOG(ERROR) << "Failed to read siginfo from signal_fd";
-        return;
+    auto started = std::chrono::steady_clock::now();
+    for (;;) {
+        ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
+        if (bytes_read < 0 && errno == EAGAIN) {
+            auto now = std::chrono::steady_clock::now();
+            std::chrono::duration<double> waited = now - started;
+            if (waited >= kDiagnosticTimeout) {
+                LOG(ERROR) << "epoll() woke us up, but we waited with no SIGCHLD!";
+                started = now;
+            }
+
+            std::this_thread::sleep_for(100ms);
+            continue;
+        }
+        if (bytes_read != sizeof(siginfo)) {
+            PLOG(ERROR) << "Failed to read siginfo from signal_fd";
+            return;
+        }
+        break;
     }
 
     switch (siginfo.ssi_signo) {
@@ -639,7 +656,7 @@
         LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);
     }
 
-    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC);
+    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK);
     if (signal_fd == -1) {
         PLOG(FATAL) << "failed to create signalfd";
     }
@@ -938,7 +955,7 @@
     setpriority(PRIO_PROCESS, 0, 0);
     while (true) {
         // By default, sleep until something happens.
-        auto epoll_timeout = std::optional<std::chrono::milliseconds>{};
+        auto epoll_timeout = std::optional<std::chrono::milliseconds>{kDiagnosticTimeout};
 
         auto shutdown_command = shutdown_state.CheckShutdown();
         if (shutdown_command) {
@@ -978,6 +995,13 @@
             for (const auto& function : *pending_functions) {
                 (*function)();
             }
+        } else if (Service::is_exec_service_running()) {
+            std::chrono::duration<double> waited =
+                    std::chrono::steady_clock::now() - Service::exec_service_started();
+            if (waited >= kDiagnosticTimeout) {
+                LOG(ERROR) << "Exec service is hung? Waited " << waited.count()
+                           << " without SIGCHLD";
+            }
         }
         if (!IsShuttingDown()) {
             HandleControlMessages();
diff --git a/init/service.cpp b/init/service.cpp
index 8a9cc0a..2ebf87e 100644
--- a/init/service.cpp
+++ b/init/service.cpp
@@ -127,6 +127,7 @@
 
 unsigned long Service::next_start_order_ = 1;
 bool Service::is_exec_service_running_ = false;
+std::chrono::time_point<std::chrono::steady_clock> Service::exec_service_started_;
 
 Service::Service(const std::string& name, Subcontext* subcontext_for_restart_commands,
                  const std::vector<std::string>& args, bool from_apex)
@@ -388,6 +389,7 @@
 
     flags_ |= SVC_EXEC;
     is_exec_service_running_ = true;
+    exec_service_started_ = std::chrono::steady_clock::now();
 
     LOG(INFO) << "SVC_EXEC service '" << name_ << "' pid " << pid_ << " (uid " << proc_attr_.uid
               << " gid " << proc_attr_.gid << "+" << proc_attr_.supp_gids.size() << " context "
diff --git a/init/service.h b/init/service.h
index 3f12aa2..d233cbf 100644
--- a/init/service.h
+++ b/init/service.h
@@ -102,6 +102,9 @@
     size_t CheckAllCommands() const { return onrestart_.CheckAllCommands(); }
 
     static bool is_exec_service_running() { return is_exec_service_running_; }
+    static std::chrono::time_point<std::chrono::steady_clock> exec_service_started() {
+        return exec_service_started_;
+    }
 
     const std::string& name() const { return name_; }
     const std::set<std::string>& classnames() const { return classnames_; }
@@ -154,6 +157,8 @@
 
     static unsigned long next_start_order_;
     static bool is_exec_service_running_;
+    static std::chrono::time_point<std::chrono::steady_clock> exec_service_started_;
+    static pid_t exec_service_pid_;
 
     std::string name_;
     std::set<std::string> classnames_;