Merge "init: Add more diagnostics for signalfd hangs." am: fe62ca7165 am: 885dcf9661 am: 4a79453de3
Original change: https://android-review.googlesource.com/c/platform/system/core/+/2027088
Change-Id: I71ec165ab5f9ed16dfc83a73006ad88df5c0d5a7
diff --git a/init/init.cpp b/init/init.cpp
index eca7bc5..5a0b3a6 100644
--- a/init/init.cpp
+++ b/init/init.cpp
@@ -578,12 +578,29 @@
HandlePowerctlMessage("shutdown,container");
}
+static constexpr std::chrono::milliseconds kDiagnosticTimeout = 10s;
+
static void HandleSignalFd() {
signalfd_siginfo siginfo;
- ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
- if (bytes_read != sizeof(siginfo)) {
- PLOG(ERROR) << "Failed to read siginfo from signal_fd";
- return;
+ auto started = std::chrono::steady_clock::now();
+ for (;;) {
+ ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
+ if (bytes_read < 0 && errno == EAGAIN) {
+ auto now = std::chrono::steady_clock::now();
+ std::chrono::duration<double> waited = now - started;
+ if (waited >= kDiagnosticTimeout) {
+ LOG(ERROR) << "epoll() woke us up, but we waited with no SIGCHLD!";
+ started = now;
+ }
+
+ std::this_thread::sleep_for(100ms);
+ continue;
+ }
+ if (bytes_read != sizeof(siginfo)) {
+ PLOG(ERROR) << "Failed to read siginfo from signal_fd";
+ return;
+ }
+ break;
}
switch (siginfo.ssi_signo) {
@@ -639,7 +656,7 @@
LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);
}
- signal_fd = signalfd(-1, &mask, SFD_CLOEXEC);
+ signal_fd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK);
if (signal_fd == -1) {
PLOG(FATAL) << "failed to create signalfd";
}
@@ -938,7 +955,7 @@
setpriority(PRIO_PROCESS, 0, 0);
while (true) {
// By default, sleep until something happens.
- auto epoll_timeout = std::optional<std::chrono::milliseconds>{};
+ auto epoll_timeout = std::optional<std::chrono::milliseconds>{kDiagnosticTimeout};
auto shutdown_command = shutdown_state.CheckShutdown();
if (shutdown_command) {
@@ -978,6 +995,13 @@
for (const auto& function : *pending_functions) {
(*function)();
}
+ } else if (Service::is_exec_service_running()) {
+ std::chrono::duration<double> waited =
+ std::chrono::steady_clock::now() - Service::exec_service_started();
+ if (waited >= kDiagnosticTimeout) {
+ LOG(ERROR) << "Exec service is hung? Waited " << waited.count()
+ << " without SIGCHLD";
+ }
}
if (!IsShuttingDown()) {
HandleControlMessages();
diff --git a/init/service.cpp b/init/service.cpp
index 8a9cc0a..2ebf87e 100644
--- a/init/service.cpp
+++ b/init/service.cpp
@@ -127,6 +127,7 @@
unsigned long Service::next_start_order_ = 1;
bool Service::is_exec_service_running_ = false;
+std::chrono::time_point<std::chrono::steady_clock> Service::exec_service_started_;
Service::Service(const std::string& name, Subcontext* subcontext_for_restart_commands,
const std::vector<std::string>& args, bool from_apex)
@@ -388,6 +389,7 @@
flags_ |= SVC_EXEC;
is_exec_service_running_ = true;
+ exec_service_started_ = std::chrono::steady_clock::now();
LOG(INFO) << "SVC_EXEC service '" << name_ << "' pid " << pid_ << " (uid " << proc_attr_.uid
<< " gid " << proc_attr_.gid << "+" << proc_attr_.supp_gids.size() << " context "
diff --git a/init/service.h b/init/service.h
index 3f12aa2..d233cbf 100644
--- a/init/service.h
+++ b/init/service.h
@@ -102,6 +102,9 @@
size_t CheckAllCommands() const { return onrestart_.CheckAllCommands(); }
static bool is_exec_service_running() { return is_exec_service_running_; }
+ static std::chrono::time_point<std::chrono::steady_clock> exec_service_started() {
+ return exec_service_started_;
+ }
const std::string& name() const { return name_; }
const std::set<std::string>& classnames() const { return classnames_; }
@@ -154,6 +157,8 @@
static unsigned long next_start_order_;
static bool is_exec_service_running_;
+ static std::chrono::time_point<std::chrono::steady_clock> exec_service_started_;
+ static pid_t exec_service_pid_;
std::string name_;
std::set<std::string> classnames_;