init: Add reboot timeout handler
In order to prevent device stuck at reboot, we try to create shutdownt
monitor thread with a timeout (default 30s). It will dump init process
and blocked tasks call trace in last kmsg then trigger kernel panic to
reboot device.
Test: reboot device
bug: 128561401
Change-Id: Ieb400ab9fbd983544b61241a4f4b8aa2f4baa863
diff --git a/init/reboot.cpp b/init/reboot.cpp
index 5b90969..0966b6c 100644
--- a/init/reboot.cpp
+++ b/init/reboot.cpp
@@ -19,13 +19,14 @@
 #include <dirent.h>
 #include <fcntl.h>
 #include <linux/fs.h>
-#include <mntent.h>
 #include <linux/loop.h>
+#include <mntent.h>
+#include <semaphore.h>
 #include <sys/cdefs.h>
 #include <sys/ioctl.h>
 #include <sys/mount.h>
-#include <sys/swap.h>
 #include <sys/stat.h>
+#include <sys/swap.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -57,11 +58,14 @@
 #include "service.h"
 #include "sigchld_handler.h"
 
+#define PROC_SYSRQ "/proc/sysrq-trigger"
+
 using android::base::GetBoolProperty;
 using android::base::Split;
 using android::base::StringPrintf;
 using android::base::Timer;
 using android::base::unique_fd;
+using android::base::WriteStringToFile;
 
 namespace android {
 namespace init {
@@ -207,8 +211,8 @@
     }
     FindPartitionsToUmount(nullptr, nullptr, true);
     // dump current CPU stack traces and uninterruptible tasks
-    android::base::WriteStringToFile("l", "/proc/sysrq-trigger");
-    android::base::WriteStringToFile("w", "/proc/sysrq-trigger");
+    WriteStringToFile("l", PROC_SYSRQ);
+    WriteStringToFile("w", PROC_SYSRQ);
 }
 
 static UmountStat UmountPartitions(std::chrono::milliseconds timeout) {
@@ -248,7 +252,91 @@
     }
 }
 
-static void KillAllProcesses() { android::base::WriteStringToFile("i", "/proc/sysrq-trigger"); }
+static void KillAllProcesses() {
+    WriteStringToFile("i", PROC_SYSRQ);
+}
+
+// Create reboot/shutdwon monitor thread
+void RebootMonitorThread(unsigned int cmd, const std::string& rebootTarget, sem_t* reboot_semaphore,
+                         std::chrono::milliseconds shutdown_timeout, bool* reboot_monitor_run) {
+    unsigned int remaining_shutdown_time = 0;
+
+    // 30 seconds more than the timeout passed to the thread as there is a final Umount pass
+    // after the timeout is reached.
+    constexpr unsigned int shutdown_watchdog_timeout_default = 30;
+    auto shutdown_watchdog_timeout = android::base::GetUintProperty(
+            "ro.build.shutdown.watchdog.timeout", shutdown_watchdog_timeout_default);
+    remaining_shutdown_time = shutdown_watchdog_timeout + shutdown_timeout.count() / 1000;
+
+    while (*reboot_monitor_run == true) {
+        if (TEMP_FAILURE_RETRY(sem_wait(reboot_semaphore)) == -1) {
+            LOG(ERROR) << "sem_wait failed and exit RebootMonitorThread()";
+            return;
+        }
+
+        timespec shutdown_timeout_timespec;
+        if (clock_gettime(CLOCK_MONOTONIC, &shutdown_timeout_timespec) == -1) {
+            LOG(ERROR) << "clock_gettime() fail! exit RebootMonitorThread()";
+            return;
+        }
+
+        // If there are some remaining shutdown time left from previous round, we use
+        // remaining time here.
+        shutdown_timeout_timespec.tv_sec += remaining_shutdown_time;
+
+        LOG(INFO) << "shutdown_timeout_timespec.tv_sec: " << shutdown_timeout_timespec.tv_sec;
+
+        int sem_return = 0;
+        while ((sem_return = sem_timedwait_monotonic_np(reboot_semaphore,
+                                                        &shutdown_timeout_timespec)) == -1 &&
+               errno == EINTR) {
+        }
+
+        if (sem_return == -1) {
+            LOG(ERROR) << "Reboot thread timed out";
+
+            if (android::base::GetBoolProperty("ro.debuggable", false) == true) {
+                LOG(INFO) << "Try to dump init process call trace:";
+                const char* vdc_argv[] = {"/system/bin/debuggerd", "-b", "1"};
+                int status;
+                android_fork_execvp_ext(arraysize(vdc_argv), (char**)vdc_argv, &status, true,
+                                        LOG_KLOG, true, nullptr, nullptr, 0);
+
+                LOG(INFO) << "Show stack for all active CPU:";
+                WriteStringToFile("l", PROC_SYSRQ);
+
+                LOG(INFO) << "Show tasks that are in disk sleep(uninterruptable sleep), which are "
+                             "like "
+                             "blocked in mutex or hardware register access:";
+                WriteStringToFile("w", PROC_SYSRQ);
+            }
+
+            // In shutdown case,notify kernel to sync and umount fs to read-only before shutdown.
+            if (cmd == ANDROID_RB_POWEROFF || cmd == ANDROID_RB_THERMOFF) {
+                WriteStringToFile("s", PROC_SYSRQ);
+
+                WriteStringToFile("u", PROC_SYSRQ);
+
+                RebootSystem(cmd, rebootTarget);
+            }
+
+            LOG(ERROR) << "Trigger crash at last!";
+            WriteStringToFile("c", PROC_SYSRQ);
+        } else {
+            timespec current_time_timespec;
+
+            if (clock_gettime(CLOCK_MONOTONIC, ¤t_time_timespec) == -1) {
+                LOG(ERROR) << "clock_gettime() fail! exit RebootMonitorThread()";
+                return;
+            }
+
+            remaining_shutdown_time =
+                    shutdown_timeout_timespec.tv_sec - current_time_timespec.tv_sec;
+
+            LOG(INFO) << "remaining_shutdown_time: " << remaining_shutdown_time;
+        }
+    }
+}
 
 /* Try umounting all emulated file systems R/W block device cfile systems.
  * This will just try umount and give it up if it fails.
@@ -259,7 +347,8 @@
  *
  * return true when umount was successful. false when timed out.
  */
-static UmountStat TryUmountAndFsck(bool runFsck, std::chrono::milliseconds timeout) {
+static UmountStat TryUmountAndFsck(unsigned int cmd, const std::string& rebootTarget, bool runFsck,
+                                   std::chrono::milliseconds timeout, sem_t* reboot_semaphore) {
     Timer t;
     std::vector<MountEntry> block_devices;
     std::vector<MountEntry> emulated_devices;
@@ -279,11 +368,17 @@
     }
 
     if (stat == UMOUNT_STAT_SUCCESS && runFsck) {
+        LOG(INFO) << "Pause reboot monitor thread before fsck";
+        sem_post(reboot_semaphore);
+
         // fsck part is excluded from timeout check. It only runs for user initiated shutdown
         // and should not affect reboot time.
         for (auto& entry : block_devices) {
             entry.DoFsck();
         }
+
+        LOG(INFO) << "Resume reboot monitor thread after fsck";
+        sem_post(reboot_semaphore);
     }
     return stat;
 }
@@ -311,7 +406,7 @@
     }
     LOG(INFO) << "swapoff() took " << swap_timer;;
 
-    if (!android::base::WriteStringToFile("1", ZRAM_RESET)) {
+    if (!WriteStringToFile("1", ZRAM_RESET)) {
         LOG(ERROR) << "zram_backing_dev: reset (" << backing_dev << ")" << " failed";
         return;
     }
@@ -369,6 +464,23 @@
     }
     LOG(INFO) << "Shutdown timeout: " << shutdown_timeout.count() << " ms";
 
+    sem_t reboot_semaphore;
+    if (sem_init(&reboot_semaphore, false, 0) == -1) {
+        // These should never fail, but if they do, skip the graceful reboot and reboot immediately.
+        LOG(ERROR) << "sem_init() fail and RebootSystem() return!";
+        RebootSystem(cmd, rebootTarget);
+    }
+
+    // Start a thread to monitor init shutdown process
+    LOG(INFO) << "Create reboot monitor thread.";
+    bool reboot_monitor_run = true;
+    std::thread reboot_monitor_thread(&RebootMonitorThread, cmd, rebootTarget, &reboot_semaphore,
+                                      shutdown_timeout, &reboot_monitor_run);
+    reboot_monitor_thread.detach();
+
+    // Start reboot monitor thread
+    sem_post(&reboot_semaphore);
+
     // keep debugging tools until non critical ones are all gone.
     const std::set<std::string> kill_after_apps{"tombstoned", "logd", "adbd"};
     // watchdogd is a vendor specific component but should be alive to complete shutdown safely.
@@ -497,7 +609,8 @@
     // 5. drop caches and disable zram backing device, if exist
     KillZramBackingDevice();
 
-    UmountStat stat = TryUmountAndFsck(runFsck, shutdown_timeout - t.duration());
+    UmountStat stat = TryUmountAndFsck(cmd, rebootTarget, runFsck, shutdown_timeout - t.duration(),
+                                       &reboot_semaphore);
     // Follow what linux shutdown is doing: one more sync with little bit delay
     {
         Timer sync_timer;
@@ -507,6 +620,11 @@
     }
     if (!is_thermal_shutdown) std::this_thread::sleep_for(100ms);
     LogShutdownTime(stat, &t);
+
+    // Send signal to terminate reboot monitor thread.
+    reboot_monitor_run = false;
+    sem_post(&reboot_semaphore);
+
     // Reboot regardless of umount status. If umount fails, fsck after reboot will fix it.
     RebootSystem(cmd, rebootTarget);
     abort();