init: Add more diagnostics for signalfd hangs.

This adds two new diagnostics. First, signalfd reads are now non-blocking. If the read takes more than 10 seconds, we log an error.

Second, init now wakes up from epoll() every 10 seconds. If it waits on an "exec" command for more than 10 seconds, it logs an error.

This change will be reverted as soon as we get feedback.

Bug: 223076262
Test: device boots
Change-Id: I7ee98d159599217a641b3de2564a92c2435f57ef
diff --git a/init/init.cpp b/init/init.cpp
index 1df4c44..d5c79f8 100644
--- a/init/init.cpp
+++ b/init/init.cpp
@@ -580,12 +580,29 @@
     HandlePowerctlMessage("shutdown,container");
 }
 
+static constexpr std::chrono::milliseconds kDiagnosticTimeout = 10s;
+
 static void HandleSignalFd() {
     signalfd_siginfo siginfo;
-    ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
-    if (bytes_read != sizeof(siginfo)) {
-        PLOG(ERROR) << "Failed to read siginfo from signal_fd";
-        return;
+    auto started = std::chrono::steady_clock::now();
+    for (;;) {
+        ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
+        if (bytes_read < 0 && errno == EAGAIN) {
+            auto now = std::chrono::steady_clock::now();
+            std::chrono::duration<double> waited = now - started;
+            if (waited >= kDiagnosticTimeout) {
+                LOG(ERROR) << "epoll() woke us up, but we waited with no SIGCHLD!";
+                started = now;
+            }
+
+            std::this_thread::sleep_for(100ms);
+            continue;
+        }
+        if (bytes_read != sizeof(siginfo)) {
+            PLOG(ERROR) << "Failed to read siginfo from signal_fd";
+            return;
+        }
+        break;
     }
 
     switch (siginfo.ssi_signo) {
@@ -641,7 +658,7 @@
         LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);
     }
 
-    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC);
+    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK);
     if (signal_fd == -1) {
         PLOG(FATAL) << "failed to create signalfd";
     }
@@ -1020,7 +1037,7 @@
     setpriority(PRIO_PROCESS, 0, 0);
     while (true) {
         // By default, sleep until something happens.
-        auto epoll_timeout = std::optional<std::chrono::milliseconds>{};
+        auto epoll_timeout = std::optional<std::chrono::milliseconds>{kDiagnosticTimeout};
 
         auto shutdown_command = shutdown_state.CheckShutdown();
         if (shutdown_command) {
@@ -1060,6 +1077,13 @@
             for (const auto& function : *pending_functions) {
                 (*function)();
             }
+        } else if (Service::is_exec_service_running()) {
+            std::chrono::duration<double> waited =
+                    std::chrono::steady_clock::now() - Service::exec_service_started();
+            if (waited >= kDiagnosticTimeout) {
+                LOG(ERROR) << "Exec service is hung? Waited " << waited.count()
+                           << " without SIGCHLD";
+            }
         }
         if (!IsShuttingDown()) {
             HandleControlMessages();
diff --git a/init/service.cpp b/init/service.cpp
index f6dd9b9..19e32b8 100644
--- a/init/service.cpp
+++ b/init/service.cpp
@@ -127,6 +127,7 @@
 
 unsigned long Service::next_start_order_ = 1;
 bool Service::is_exec_service_running_ = false;
+std::chrono::time_point<std::chrono::steady_clock> Service::exec_service_started_;
 
 Service::Service(const std::string& name, Subcontext* subcontext_for_restart_commands,
                  const std::vector<std::string>& args, bool from_apex)
@@ -388,6 +389,7 @@
 
     flags_ |= SVC_EXEC;
     is_exec_service_running_ = true;
+    exec_service_started_ = std::chrono::steady_clock::now();
 
     LOG(INFO) << "SVC_EXEC service '" << name_ << "' pid " << pid_ << " (uid " << proc_attr_.uid
               << " gid " << proc_attr_.gid << "+" << proc_attr_.supp_gids.size() << " context "
diff --git a/init/service.h b/init/service.h
index 3289f54..a9dc966 100644
--- a/init/service.h
+++ b/init/service.h
@@ -102,6 +102,9 @@
     size_t CheckAllCommands() const { return onrestart_.CheckAllCommands(); }
 
     static bool is_exec_service_running() { return is_exec_service_running_; }
+    static std::chrono::time_point<std::chrono::steady_clock> exec_service_started() {
+        return exec_service_started_;
+    }
 
     const std::string& name() const { return name_; }
     const std::set<std::string>& classnames() const { return classnames_; }
@@ -148,6 +151,8 @@
 
     static unsigned long next_start_order_;
     static bool is_exec_service_running_;
+    static std::chrono::time_point<std::chrono::steady_clock> exec_service_started_;
+    static pid_t exec_service_pid_;
 
     std::string name_;
     std::set<std::string> classnames_;