bootstat: introduce sys.boot.reason

Adding functionality to bootstat --record_boot_complete and
--record_boot_reason to initialize sys.boot.reason, the canonical
system boot reason.

Filter out ro.boot.bootreason oem noise into sys.boot.reason.  Add
heuristics to determine what the boot reason is, when otherwise would
be defaulting to the blunt and relatively devoid of detail catch-all
reboot reasons ("reboot", "shutdown", "cold", "warm", "hard").

boot_reason_test.sh is also a compliance test.

Test: boot_reason_test.sh all
Bug: 63736262
Change-Id: Ic9a42cccbcfc89a5c0e081ba66d577a97c8c8c76
diff --git a/bootstat/bootstat.cpp b/bootstat/bootstat.cpp
index bd611f0..418c711 100644
--- a/bootstat/bootstat.cpp
+++ b/bootstat/bootstat.cpp
@@ -20,6 +20,7 @@
 
 #include <getopt.h>
 #include <unistd.h>
+#include <sys/klog.h>
 
 #include <chrono>
 #include <cmath>
@@ -32,11 +33,14 @@
 #include <vector>
 
 #include <android-base/chrono_utils.h>
+#include <android-base/file.h>
 #include <android-base/logging.h>
 #include <android-base/parseint.h>
 #include <android-base/strings.h>
 #include <android/log.h>
+#include <cutils/android_reboot.h>
 #include <cutils/properties.h>
+#include <log/logcat.h>
 #include <metricslogger/metrics_logger.h>
 
 #include "boot_event_record_store.h"
@@ -118,6 +122,14 @@
   return std::string(&temp[0], len);
 }
 
+void SetProperty(const char* key, const std::string& val) {
+  property_set(key, val.c_str());
+}
+
+void SetProperty(const char* key, const char* val) {
+  property_set(key, val);
+}
+
 constexpr int32_t kUnknownBootReason = 1;
 
 // A mapping from boot reason string, as read from the ro.boot.bootreason
@@ -175,7 +187,21 @@
   {"reboot,cold", 48},
   {"reboot,recovery", 49},
   {"thermal_shutdown", 50},
-  {"s3_wakeup", 51}
+  {"s3_wakeup", 51},
+  {"kernel_panic,sysrq", 52},
+  {"kernel_panic,NULL", 53},
+  {"kernel_panic,BUG", 54},
+  {"bootloader", 55},
+  {"cold", 56},
+  {"hard", 57},
+  {"warm", 58},
+  {"recovery", 59},
+  {"thermal-shutdown", 60},
+  {"shutdown,thermal", 61},
+  {"shutdown,battery", 62},
+  {"reboot,ota", 63},
+  {"reboot,factory_reset", 64},
+  {"reboot,", 65},
 };
 
 // Converts a string value representing the reason the system booted to an
@@ -191,6 +217,323 @@
   return kUnknownBootReason;
 }
 
+// Canonical list of supported primary reboot reasons.
+const std::vector<const std::string> knownReasons = {
+  // kernel
+  "watchdog",
+  "kernel_panic",
+  // strong
+  "recovery",      // Should not happen from ro.boot.bootreason
+  "bootloader",    // Should not happen from ro.boot.bootreason
+  // blunt
+  "cold",
+  "hard",
+  "warm",
+  "shutdown",      // Can not happen from ro.boot.bootreason
+  "reboot",        // Default catch-all for anything unknown
+};
+
+// Returns true if the supplied reason prefix is considered detailed enough.
+bool isStrongRebootReason(const std::string& r) {
+  for (auto &s : knownReasons) {
+    if (s == "cold") break;
+    // Prefix defined as terminated by a nul or comma (,).
+    if (android::base::StartsWith(r, s.c_str()) &&
+        ((r.length() == s.length()) || (r[s.length()] == ','))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if the supplied reason prefix is associated with the kernel.
+bool isKernelRebootReason(const std::string& r) {
+  for (auto &s : knownReasons) {
+    if (s == "recovery") break;
+    // Prefix defined as terminated by a nul or comma (,).
+    if (android::base::StartsWith(r, s.c_str()) &&
+        ((r.length() == s.length()) || (r[s.length()] == ','))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if the supplied reason prefix is considered known.
+bool isKnownRebootReason(const std::string& r) {
+  for (auto &s : knownReasons) {
+    // Prefix defined as terminated by a nul or comma (,).
+    if (android::base::StartsWith(r, s.c_str()) &&
+        ((r.length() == s.length()) || (r[s.length()] == ','))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// If the reboot reason should be improved, report true if is too blunt.
+bool isBluntRebootReason(const std::string& r) {
+  if (isStrongRebootReason(r)) return false;
+
+  if (!isKnownRebootReason(r)) return true; // Can not support unknown as detail
+
+  size_t pos = 0;
+  while ((pos = r.find(',', pos)) != std::string::npos) {
+    ++pos;
+    std::string next(r.substr(pos));
+    if (next.length() == 0) break;
+    if (next[0] == ',') continue;
+    if (!isKnownRebootReason(next)) return false; // Unknown subreason is good.
+    if (isStrongRebootReason(next)) return false; // eg: reboot,reboot
+  }
+  return true;
+}
+
+// std::transform Helper callback functions:
+// Converts a string value representing the reason the system booted to a
+// string complying with Android system standard reason.
+char tounderline(char c) { return ::isblank(c) ? '_' : c; }
+char toprintable(char c) { return ::isprint(c) ? c : '?'; }
+
+const char system_reboot_reason_property[] = "sys.boot.reason";
+const char last_reboot_reason_property[] = LAST_REBOOT_REASON_PROPERTY;
+const char bootloader_reboot_reason_property[] = "ro.boot.bootreason";
+
+// Scrub, Sanitize, Standardize and Enhance the boot reason string supplied.
+std::string BootReasonStrToReason(const std::string& boot_reason) {
+  std::string ret(GetProperty(system_reboot_reason_property));
+  std::string reason(boot_reason);
+  // If sys.boot.reason == ro.boot.bootreason, let's re-evaluate
+  if (reason == ret) ret = "";
+
+  // Cleanup boot_reason regarding acceptable character set
+  std::transform(reason.begin(), reason.end(), reason.begin(), ::tolower);
+  std::transform(reason.begin(), reason.end(), reason.begin(), tounderline);
+  std::transform(reason.begin(), reason.end(), reason.begin(), toprintable);
+
+  // Is the current system boot reason sys.boot.reason valid?
+  if (!isKnownRebootReason(ret)) ret = "";
+
+  if (ret == "") {
+    // Is the bootloader boot reason ro.boot.bootreason known?
+    std::vector<std::string> words(android::base::Split(reason, ",_-"));
+    for (auto &s : knownReasons) {
+      std::string blunt;
+      for (auto &r : words) {
+        if (r == s) {
+          if (isBluntRebootReason(s)) {
+            blunt = s;
+          } else {
+            ret = s;
+            break;
+          }
+        }
+      }
+      if (ret == "") ret = blunt;
+      if (ret != "") break;
+    }
+  }
+
+  if (ret == "") {
+    // A series of checks to take some officially unsupported reasons
+    // reported by the bootloader and find some logical and canonical
+    // sense.  In an ideal world, we would require those bootloaders
+    // to behave and follow our standards.
+    static const std::vector<std::pair<const std::string, const std::string>> aliasReasons = {
+      {"watchdog", "wdog"},
+      {"cold,powerkey", "powerkey"},
+      {"kernel_panic", "panic"},
+      {"shutdown,thermal", "thermal"},
+      {"warm,s3_wakeup", "s3_wakeup"},
+      {"hard,hw_reset", "hw_reset"},
+      {"bootloader", ""},
+    };
+
+    // Either the primary or alias is found _somewhere_ in the reason string.
+    for (auto &s : aliasReasons) {
+      if (reason.find(s.first) != std::string::npos) {
+        ret = s.first;
+        break;
+      }
+      if (s.second.size() && (reason.find(s.second) != std::string::npos)) {
+        ret = s.first;
+        break;
+      }
+    }
+  }
+
+  // If watchdog is the reason, see if there is a security angle?
+  if (ret == "watchdog") {
+    if (reason.find("sec") != std::string::npos) {
+      ret += ",security";
+    }
+  }
+
+  // Check the other reason resources if the reason is still blunt.
+  if (isBluntRebootReason(ret)) {
+    // Check to see if last klog has some refinement hints.
+    std::string content;
+    if (!android::base::ReadFileToString("/sys/fs/pstore/console-ramoops-0",
+                                         &content)) {
+        android::base::ReadFileToString("/sys/fs/pstore/console-ramoops",
+                                         &content);
+    }
+
+    // The toybox reboot command used directly (unlikely)? But also
+    // catches init's response to the Android's more controlled reboot command.
+    if (content.rfind("reboot: Power down") != std::string::npos) {
+      ret = "shutdown"; // Still too blunt, but more accurate.
+      // ToDo: init should record the shutdown reason to kernel messages ala:
+      //           init: shutdown system with command 'last_reboot_reason'
+      //       so that if pstore has persistence we can get some details
+      //       that could be missing in last_reboot_reason_property.
+    }
+
+    static const char cmd[] = "reboot: Restarting system with command '";
+    size_t pos = content.rfind(cmd);
+    if (pos != std::string::npos) {
+      pos += strlen(cmd);
+      std::string subReason(content.substr(pos));
+      pos = subReason.find('\'');
+      if (pos != std::string::npos) subReason.erase(pos);
+      if (subReason != "") { // Will not land "reboot" as that is too blunt.
+        if (isKernelRebootReason(subReason)) {
+          ret = "reboot," + subReason; // User space can't talk kernel reasons.
+        } else {
+          ret = subReason;
+        }
+      }
+    }
+
+    // Check for kernel panics, but allowed to override reboot command.
+    if (content.rfind("sysrq: SysRq : Trigger a crash") != std::string::npos) {
+      // Can not happen, except on userdebug, during testing/debugging.
+      ret = "kernel_panic,sysrq";
+    } else if (content.rfind(
+        "Unable to handle kernel NULL pointer dereference at virtual address")
+               != std::string::npos) {
+      ret = "kernel_panic,NULL";
+    } else if (content.rfind("Kernel BUG at ") != std::string::npos) {
+      ret = "kernel_panic,BUG";
+    } else if ((content.rfind("Power held for ") != std::string::npos) ||
+        (content.rfind("charger: [") != std::string::npos)) {
+      ret = "cold";
+    }
+
+    // The following battery test should migrate to a default system health HAL
+
+    // Let us not worry if the reboot command was issued, for the cases of
+    // reboot -p, reboot <no reason>, reboot cold, reboot warm and reboot hard.
+    // Same for bootloader and ro.boot.bootreasons of this set, but a dead
+    // battery could conceivably lead to these, so worthy of override.
+    if (isBluntRebootReason(ret)) {
+      // Heuristic to determine if shutdown possibly because of a dead battery?
+      // Really a hail-mary pass to find it in last klog content ...
+      static const int battery_dead_threshold = 2; // percent
+      static const char battery[] = "healthd: battery l=";
+      pos = content.rfind(battery); // last one
+      if (pos != std::string::npos) {
+        int level = atoi(content.substr(pos + strlen(battery)).c_str());
+        LOG(INFO) << "Battery level at shutdown " << level << "%";
+        if (level <= battery_dead_threshold) {
+          ret = "shutdown,battery";
+        }
+      } else { // Most likely
+        // Content buffer no longer will have console data. Beware if more
+        // checks added below, that depend on parsing console content.
+        content = "";
+
+        LOG(DEBUG) << "Can not find last low battery in last console messages";
+        android_logcat_context ctx = create_android_logcat();
+        FILE *fp = android_logcat_popen(&ctx, "logcat -b kernel -v brief -d");
+        if (fp != nullptr) {
+          android::base::ReadFdToString(fileno(fp), &content);
+        }
+        android_logcat_pclose(&ctx, fp);
+        android_logcat_destroy(&ctx);
+        static const char logcat_battery[] = "W/healthd (    0): battery l=";
+        const char* match = logcat_battery;
+
+        if (content == "") {
+          // Service logd.klog not running, go to smaller buffer in the kernel.
+          int rc = klogctl(KLOG_SIZE_BUFFER, nullptr, 0);
+          if (rc > 0) {
+            ssize_t len = rc + 1024; // 1K Margin should it grow between calls.
+            std::unique_ptr<char[]> buf(new char[len]);
+            rc = klogctl(KLOG_READ_ALL, buf.get(), len);
+            if (rc < len) {
+              len = rc + 1;
+            }
+            buf[--len] = '\0';
+            content = buf.get();
+          }
+          match = battery;
+        }
+
+        pos = content.find(match); // The first one it finds.
+        if (pos != std::string::npos) {
+          pos += strlen(match);
+          int level = atoi(content.substr(pos).c_str());
+          LOG(INFO) << "Battery level at startup " << level << "%";
+          if (level <= battery_dead_threshold) {
+            ret = "shutdown,battery";
+          }
+        } else {
+          LOG(DEBUG) << "Can not find first battery level in dmesg or logcat";
+        }
+      }
+    }
+
+    // Is there a controlled shutdown hint in last_reboot_reason_property?
+    if (isBluntRebootReason(ret)) {
+      // Content buffer no longer will have console data. Beware if more
+      // checks added below, that depend on parsing console content.
+      content = GetProperty(last_reboot_reason_property);
+
+      // String is either "reboot,<reason>" or "shutdown,<reason>".
+      // We will set if default reasons, only override with detail if thermal.
+      if (!isBluntRebootReason(content)) {
+        // Ok, we want it, let's squash it if secondReason is known.
+        pos = content.find(',');
+        if (pos != std::string::npos) {
+          ++pos;
+          std::string secondReason(content.substr(pos));
+          ret = isKnownRebootReason(secondReason) ? secondReason : content;
+        } else {
+          ret = content;
+        }
+      }
+    }
+
+    // Other System Health HAL reasons?
+
+    // ToDo: /proc/sys/kernel/boot_reason needs a HAL interface to
+    //       possibly offer hardware-specific clues from the PMIC.
+  }
+
+  // If unknown left over from above, make it "reboot,<boot_reason>"
+  if (ret == "") {
+    ret = "reboot";
+    if (android::base::StartsWith(reason, "reboot")) {
+      reason = reason.substr(strlen("reboot"));
+      while (reason[0] == ',') {
+        reason = reason.substr(1);
+      }
+    }
+    if (reason != "") {
+      ret += ",";
+      ret += reason;
+    }
+  }
+
+  LOG(INFO) << "Canonical boot reason: " << ret;
+  if (isKernelRebootReason(ret) && (GetProperty(last_reboot_reason_property) != "")) {
+    // Rewrite as it must be old news, kernel reasons trump user space.
+    SetProperty(last_reboot_reason_property, ret);
+  }
+  return ret;
+}
+
 // Returns the appropriate metric key prefix for the boot_complete metric such
 // that boot metrics after a system update are labeled as ota_boot_complete;
 // otherwise, they are labeled as boot_complete.  This method encapsulates the
@@ -212,9 +555,19 @@
   if (!boot_event_store.GetBootEvent(kBuildDateKey, &record)) {
     boot_complete_prefix = "factory_reset_" + boot_complete_prefix;
     boot_event_store.AddBootEventWithValue(kBuildDateKey, build_date);
+    LOG(INFO) << "Canonical boot reason: " << "reboot,factory_reset";
+    SetProperty(system_reboot_reason_property, "reboot,factory_reset");
+    if (GetProperty(bootloader_reboot_reason_property) == "") {
+      SetProperty(bootloader_reboot_reason_property, "reboot,factory_reset");
+    }
   } else if (build_date != record.second) {
     boot_complete_prefix = "ota_" + boot_complete_prefix;
     boot_event_store.AddBootEventWithValue(kBuildDateKey, build_date);
+    LOG(INFO) << "Canonical boot reason: " << "reboot,ota";
+    SetProperty(system_reboot_reason_property, "reboot,ota");
+    if (GetProperty(bootloader_reboot_reason_property) == "") {
+      SetProperty(bootloader_reboot_reason_property, "reboot,ota");
+    }
   }
 
   return boot_complete_prefix;
@@ -358,9 +711,23 @@
 // Records the boot_reason metric by querying the ro.boot.bootreason system
 // property.
 void RecordBootReason() {
-  int32_t boot_reason = BootReasonStrToEnum(GetProperty("ro.boot.bootreason"));
+  const std::string reason(GetProperty(bootloader_reboot_reason_property));
+
+  // Log the raw bootloader_boot_reason property value.
+  int32_t boot_reason = BootReasonStrToEnum(reason);
   BootEventRecordStore boot_event_store;
   boot_event_store.AddBootEventWithValue("boot_reason", boot_reason);
+
+  // Log the scrubbed system_boot_reason.
+  const std::string system_reason(BootReasonStrToReason(reason));
+  int32_t system_boot_reason = BootReasonStrToEnum(system_reason);
+  boot_event_store.AddBootEventWithValue("system_boot_reason", system_boot_reason);
+
+  // Record the scrubbed system_boot_reason to the property
+  SetProperty(system_reboot_reason_property, system_reason);
+  if (reason == "") {
+    SetProperty(bootloader_reboot_reason_property, system_reason);
+  }
 }
 
 // Records two metrics related to the user resetting a device: the time at