init: Add more diagnostics for b/223076262.
This adds three more diagnostics for stuck exec services:
1. The open file descriptors listed in /proc/<pid>/fd are dumped
2. /proc/<pid>/status is dumped
3. HandleSignalFd is called once to see if a SIGCHLD got stuck somewhere
Bug: 223076262
Test: while (1) in linkerconfig
Ignore-AOSP-First: diagnostics
Change-Id: Ida601d86e18be9d49b143fb88b418cbc171ecac6
diff --git a/init/init.cpp b/init/init.cpp
index 5a0b3a6..513d27d 100644
--- a/init/init.cpp
+++ b/init/init.cpp
@@ -33,7 +33,10 @@
#define _REALLY_INCLUDE_SYS__SYSTEM_PROPERTIES_H_
#include <sys/_system_properties.h>
+#include <filesystem>
+#include <fstream>
#include <functional>
+#include <iostream>
#include <map>
#include <memory>
#include <mutex>
@@ -580,10 +583,10 @@
static constexpr std::chrono::milliseconds kDiagnosticTimeout = 10s;
-static void HandleSignalFd() {
+static void HandleSignalFd(bool one_off) {
signalfd_siginfo siginfo;
auto started = std::chrono::steady_clock::now();
- for (;;) {
+ do {
ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
if (bytes_read < 0 && errno == EAGAIN) {
auto now = std::chrono::steady_clock::now();
@@ -601,7 +604,7 @@
return;
}
break;
- }
+ } while (!one_off);
switch (siginfo.ssi_signo) {
case SIGCHLD:
@@ -662,7 +665,8 @@
}
constexpr int flags = EPOLLIN | EPOLLPRI;
- if (auto result = epoll->RegisterHandler(signal_fd, HandleSignalFd, flags); !result.ok()) {
+ auto handler = std::bind(HandleSignalFd, false);
+ if (auto result = epoll->RegisterHandler(signal_fd, handler, flags); !result.ok()) {
LOG(FATAL) << result.error();
}
}
@@ -791,6 +795,32 @@
return {};
}
+// Logs, at ERROR severity and prefixed with |prefix|, one line per entry in /proc/<pid>/fd:
+// the symlink target when it can be read, otherwise the path of the entry itself.
+//
+// The std::error_code overloads of directory_iterator are used for both construction and
+// increment, so a directory that cannot be opened or read (for example because the process
+// has already exited) is reported in the log rather than through a std::filesystem exception.
+static void DumpPidFds(const std::string& prefix, pid_t pid) {
+    const std::string proc_dir = "/proc/" + std::to_string(pid) + "/fd";
+
+    std::error_code ec;
+    std::filesystem::directory_iterator it(proc_dir, ec);
+    if (ec) {
+        LOG(ERROR) << "Could not open " << proc_dir << ": " << ec.message();
+        return;
+    }
+
+    const std::filesystem::directory_iterator end{};
+    // Stop as soon as an increment fails; |ec| is reported after the loop.
+    for (; !ec && it != end; it.increment(ec)) {
+        std::string target;
+        if (android::base::Readlink(it->path(), &target)) {
+            LOG(ERROR) << prefix << target;
+        } else {
+            LOG(ERROR) << prefix << it->path();
+        }
+    }
+    if (ec) {
+        LOG(ERROR) << "Could not finish reading " << proc_dir << ": " << ec.message();
+    }
+}
+
+// Writes every line of |file| to the log at ERROR severity, each one preceded by |prefix|.
+// If |file| cannot be opened, logs a single "Could not open" error and does nothing else.
+static void DumpFile(const std::string& prefix, const std::string& file) {
+    std::ifstream stream(file);
+    if (stream) {
+        // The line buffer lives only as long as the loop that fills it.
+        for (std::string text; std::getline(stream, text);) {
+            LOG(ERROR) << prefix << text;
+        }
+        return;
+    }
+
+    LOG(ERROR) << "Could not open " << file;
+}
+
int SecondStageMain(int argc, char** argv) {
if (REBOOT_BOOTLOADER_ON_PANIC) {
InstallRebootSignalHandlers();
@@ -996,11 +1026,23 @@
(*function)();
}
} else if (Service::is_exec_service_running()) {
+ static bool dumped_diagnostics = false;
std::chrono::duration<double> waited =
std::chrono::steady_clock::now() - Service::exec_service_started();
if (waited >= kDiagnosticTimeout) {
LOG(ERROR) << "Exec service is hung? Waited " << waited.count()
<< " without SIGCHLD";
+ if (!dumped_diagnostics) {
+ DumpPidFds("exec service opened: ", Service::exec_service_pid());
+
+ std::string status_file =
+ "/proc/" + std::to_string(Service::exec_service_pid()) + "/status";
+ DumpFile("exec service: ", status_file);
+ dumped_diagnostics = true;
+
+ LOG(INFO) << "Attempting to handle any stuck SIGCHLDs...";
+ HandleSignalFd(true);
+ }
}
}
if (!IsShuttingDown()) {
diff --git a/init/service.cpp b/init/service.cpp
index 2ebf87e..48688f5 100644
--- a/init/service.cpp
+++ b/init/service.cpp
@@ -127,6 +127,7 @@
unsigned long Service::next_start_order_ = 1;
bool Service::is_exec_service_running_ = false;
+pid_t Service::exec_service_pid_ = -1;
std::chrono::time_point<std::chrono::steady_clock> Service::exec_service_started_;
Service::Service(const std::string& name, Subcontext* subcontext_for_restart_commands,
@@ -389,6 +390,7 @@
flags_ |= SVC_EXEC;
is_exec_service_running_ = true;
+ exec_service_pid_ = pid_;
exec_service_started_ = std::chrono::steady_clock::now();
LOG(INFO) << "SVC_EXEC service '" << name_ << "' pid " << pid_ << " (uid " << proc_attr_.uid
diff --git a/init/service.h b/init/service.h
index d233cbf..c314aa1 100644
--- a/init/service.h
+++ b/init/service.h
@@ -102,6 +102,7 @@
size_t CheckAllCommands() const { return onrestart_.CheckAllCommands(); }
static bool is_exec_service_running() { return is_exec_service_running_; }
+ static pid_t exec_service_pid() { return exec_service_pid_; }
static std::chrono::time_point<std::chrono::steady_clock> exec_service_started() {
return exec_service_started_;
}