dumpstate: Kill vendor dumpstate HAL and grab partial dump in timeout
dumpstate_board should not block forever, but it is also a valuable
source for debugging system problems. On timeout, we should kill the
dumpstate HAL in case it is hung (e.g. on a cat from a pipe), so that
subsequent calls can be served by the HAL service.
Bug: 77489941
Test: simulate delay in dumpstate HAL and get BR, see partial content
from dumpstate_board.txt and below log from dumpstate_log.txt
dumpstateBoard timed out after 10s, killing dumpstate vendor HAL
dumpstateBoard failed: Status(EX_TRANSACTION_FAILED): 'DEAD_OBJECT: '
Change-Id: Iadfd0c234c26714b6a2f1aa2941a85d4e01d2cbc
diff --git a/cmds/dumpstate/dumpstate.cpp b/cmds/dumpstate/dumpstate.cpp
index 19bf216..5ee543c 100644
--- a/cmds/dumpstate/dumpstate.cpp
+++ b/cmds/dumpstate/dumpstate.cpp
@@ -1236,7 +1236,6 @@
}
static void DumpHals() {
- using android::sp;
using android::hidl::manager::V1_0::IServiceManager;
using android::hardware::defaultServiceManager;
@@ -1560,69 +1559,80 @@
paths[i])));
}
+ sp<IDumpstateDevice> dumpstate_device(IDumpstateDevice::getService());
+ if (dumpstate_device == nullptr) {
+ MYLOGE("No IDumpstateDevice implementation\n");
+ return;
+ }
+
+ using ScopedNativeHandle =
+ std::unique_ptr<native_handle_t, std::function<void(native_handle_t*)>>;
+ ScopedNativeHandle handle(native_handle_create(static_cast<int>(paths.size()), 0),
+ [](native_handle_t* handle) {
+ native_handle_close(handle);
+ native_handle_delete(handle);
+ });
+ if (handle == nullptr) {
+ MYLOGE("Could not create native_handle\n");
+ return;
+ }
+
+ for (size_t i = 0; i < paths.size(); i++) {
+ MYLOGI("Calling IDumpstateDevice implementation using path %s\n", paths[i].c_str());
+
+ android::base::unique_fd fd(TEMP_FAILURE_RETRY(
+ open(paths[i].c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC | O_NOFOLLOW,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)));
+ if (fd < 0) {
+ MYLOGE("Could not open file %s: %s\n", paths[i].c_str(), strerror(errno));
+ return;
+ }
+ handle.get()->data[i] = fd.release();
+ }
+
// Given that bugreport is required to diagnose failures, it's better to
- // drop the result of IDumpstateDevice than to block the rest of bugreport
- // for an arbitrary amount of time.
- std::packaged_task<std::unique_ptr<ssize_t[]>()>
- dumpstate_task([paths]() -> std::unique_ptr<ssize_t[]> {
- ::android::sp<IDumpstateDevice> dumpstate_device(IDumpstateDevice::getService());
- if (dumpstate_device == nullptr) {
- MYLOGE("No IDumpstateDevice implementation\n");
- return nullptr;
- }
-
- using ScopedNativeHandle =
- std::unique_ptr<native_handle_t, std::function<void(native_handle_t*)>>;
- ScopedNativeHandle handle(native_handle_create(static_cast<int>(paths.size()), 0),
- [](native_handle_t* handle) {
- native_handle_close(handle);
- native_handle_delete(handle);
- });
- if (handle == nullptr) {
- MYLOGE("Could not create native_handle\n");
- return nullptr;
- }
-
- for (size_t i = 0; i < paths.size(); i++) {
- MYLOGI("Calling IDumpstateDevice implementation using path %s\n", paths[i].c_str());
-
- android::base::unique_fd fd(TEMP_FAILURE_RETRY(
- open(paths[i].c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC | O_NOFOLLOW,
- S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)));
- if (fd < 0) {
- MYLOGE("Could not open file %s: %s\n", paths[i].c_str(), strerror(errno));
- return nullptr;
- }
- handle.get()->data[i] = fd.release();
- }
-
+ // set an arbitrary timeout for IDumpstateDevice than to block the
+ // rest of bugreport. On timeout, we kill the dumpstate board HAL
+ // and grab whatever was dumped so far.
+ std::packaged_task<bool()>
+ dumpstate_task([paths, dumpstate_device, &handle]() -> bool {
android::hardware::Return<void> status = dumpstate_device->dumpstateBoard(handle.get());
if (!status.isOk()) {
MYLOGE("dumpstateBoard failed: %s\n", status.description().c_str());
- return nullptr;
+ return false;
}
- auto file_sizes = std::make_unique<ssize_t[]>(paths.size());
- for (size_t i = 0; i < paths.size(); i++) {
- struct stat s;
- if (fstat(handle.get()->data[i], &s) == -1) {
- MYLOGE("Failed to fstat %s: %s\n", kDumpstateBoardFiles[i].c_str(),
- strerror(errno));
- file_sizes[i] = -1;
- continue;
- }
- file_sizes[i] = s.st_size;
- }
- return file_sizes;
+ return true;
});
+
auto result = dumpstate_task.get_future();
std::thread(std::move(dumpstate_task)).detach();
- if (result.wait_for(30s) != std::future_status::ready) {
- MYLOGE("dumpstateBoard timed out after 30s\n");
- return;
+
+ constexpr size_t timeout_sec = 30;
+ if (result.wait_for(std::chrono::seconds(timeout_sec)) != std::future_status::ready) {
+ MYLOGE("dumpstateBoard timed out after %zus, killing dumpstate vendor HAL\n", timeout_sec);
+ if (!android::base::SetProperty("ctl.interface_restart",
+ android::base::StringPrintf("%s/default",
+ IDumpstateDevice::descriptor))) {
+ MYLOGE("Couldn't restart dumpstate HAL\n");
+ }
}
- std::unique_ptr<ssize_t[]> file_sizes = result.get();
- if (file_sizes == nullptr) {
- return;
+ // Wait some time for init to kill dumpstate vendor HAL
+ constexpr size_t killing_timeout_sec = 10;
+ if (result.wait_for(std::chrono::seconds(killing_timeout_sec)) != std::future_status::ready) {
+ MYLOGE("killing dumpstateBoard timed out after %zus, continue and "
+ "there might be racing in content\n", killing_timeout_sec);
+ }
+
+ auto file_sizes = std::make_unique<ssize_t[]>(paths.size());
+ for (size_t i = 0; i < paths.size(); i++) {
+ struct stat s;
+ if (fstat(handle.get()->data[i], &s) == -1) {
+ MYLOGE("Failed to fstat %s: %s\n", kDumpstateBoardFiles[i].c_str(),
+ strerror(errno));
+ file_sizes[i] = -1;
+ continue;
+ }
+ file_sizes[i] = s.st_size;
}
for (size_t i = 0; i < paths.size(); i++) {