Support monitoring the subprocess in edit monitor. The daemon manager will keep monitoring the memory/CPU usage of the daemon process and kill it if the process consumes too many resources (as specified by a threshold). Test: atest daemon_manager_test bug: 365617369 Change-Id: Ic9f8eb5a338de4e9cf7c8aba381ad752cf6aeba0

commit: dc2840dafc7d188ee7a4c0fb717fd3d1b2791a99 [log] [tgz]
author: Zhuoyao Zhang <zhuoyao@google.com> Thu Sep 19 23:29:27 2024 +0000
committer: Zhuoyao Zhang <zhuoyao@google.com> Wed Sep 25 19:05:34 2024 +0000
tree: 3ee7df6681ced3ef3ad4e62f5554c69cc44e009c
parent: 480c30410ca67f0ef65216163fe5419e3e29e49e [diff] [blame]
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 8ec2588..79831a7 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py

@@ -25,6 +25,9 @@
 
 
 DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
+DEFAULT_MONITOR_INTERVAL_SECONDS = 5
+DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
+DEFAULT_CPU_USAGE_THRESHOLD = 10
 
 
 def default_daemon_target():
@@ -48,6 +51,9 @@
     self.pid = os.getpid()
     self.daemon_process = None
 
+    self.max_memory_usage = 0
+    self.max_cpu_usage = 0
+
     pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
     pid_file_dir.mkdir(parents=True, exist_ok=True)
     self.pid_file_path = self._get_pid_file_path(pid_file_dir)
@@ -61,6 +67,50 @@
     except Exception as e:
       logging.exception("Failed to start daemon manager with error %s", e)
 
+  def monitor_daemon(
+      self,
+      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
+      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
+      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
+  ):
+    """Monitors the daemon process status.
+
+    Periodically samples the CPU/memory usage of the daemon process for as long
+    as the process is alive, and terminates it once the highest usage observed
+    so far reaches either of the given thresholds.
+
+    Args:
+      interval: Seconds to wait between two consecutive samples.
+      memory_threshold: Memory limit, compared against the value returned by
+        _get_process_memory_percent.
+      cpu_threshold: CPU limit, compared against the value returned by
+        _get_process_cpu_percent.
+    """
+    logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
+
+    while self.daemon_process.is_alive():
+      try:
+        memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
+        self.max_memory_usage = max(self.max_memory_usage, memory_usage)
+
+        cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
+        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
+      except Exception as e:
+        # Sampling is best effort: log the error and keep monitoring.
+        logging.warning("Failed to monitor daemon process with error: %s", e)
+
+      # Check the thresholds right after sampling so that an over-consuming
+      # daemon is not left running for one more full interval.
+      if (
+          self.max_memory_usage >= memory_threshold
+          or self.max_cpu_usage >= cpu_threshold
+      ):
+        logging.error(
+            "Daemon process is consuming too much resource, killing..."
+        )
+        self._terminate_process(self.daemon_process.pid)
+        # Re-evaluate liveness immediately instead of sleeping first.
+        continue
+
+      # Sleep outside of the try block so that a failing sample cannot turn
+      # this loop into a busy loop.
+      time.sleep(interval)
+
+    logging.info(
+        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
+        " usage: %f.",
+        self.daemon_process.pid,
+        self.max_memory_usage,
+        self.max_cpu_usage,
+    )
+
   def stop(self):
     """Stops the daemon process and removes the pidfile."""
 
@@ -180,3 +230,45 @@
     logging.info("pid_file_path: %s", pid_file_path)
 
     return pid_file_path
+
+  def _get_process_memory_percent(self, pid: int) -> float:
+    """Returns the resident memory of the given process in MB.
+
+    Despite its name, the result is an absolute size, not a percentage.
+
+    Args:
+      pid: Id of the process to inspect.
+
+    Returns:
+      The resident set size read from /proc/<pid>/stat, converted to MB.
+
+    Raises:
+      OSError: If the stat file cannot be read.
+      IndexError: If the stat file has fewer fields than expected.
+      ValueError: If the RSS field is not an integer.
+    """
+    try:
+      with open(f"/proc/{pid}/stat", "r") as f:
+        stat_data = f.read()
+      # The second field (comm) is wrapped in parentheses and may itself
+      # contain whitespace, so only split what follows the last ")". The
+      # remaining fields start at field 3, hence field N is at index N - 3.
+      fields = stat_data.rsplit(")", 1)[-1].split()
+      # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
+      rss_pages = int(fields[21])
+      # Use the actual page size instead of assuming 4 KB pages.
+      page_size = os.sysconf("SC_PAGE_SIZE")
+      return rss_pages * page_size / (1024 * 1024)  # Convert to MB
+    except (OSError, IndexError, ValueError):
+      logging.exception("Failed to get memory usage.")
+      raise
+
+  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
+    """Returns the CPU usage of the given process over a sampling window.
+
+    The CPU time consumed by the process is sampled twice, `interval` seconds
+    apart, and divided by the system uptime elapsed between the two samples.
+
+    Args:
+      pid: Id of the process to inspect.
+      interval: Seconds to sleep between the two samples.
+
+    Returns:
+      The consumed CPU time as a percentage of the elapsed time, or 0.0 when
+      no time elapsed between the two samples.
+
+    Raises:
+      OSError: If a /proc file cannot be read.
+      IndexError: If a /proc file has fewer fields than expected.
+      ValueError: If a field cannot be parsed as a number.
+    """
+    try:
+      total_start_time = self._get_total_cpu_time(pid)
+      with open("/proc/uptime", "r") as f:
+        uptime_start = float(f.readline().split()[0])
+
+      time.sleep(interval)
+
+      total_end_time = self._get_total_cpu_time(pid)
+      with open("/proc/uptime", "r") as f:
+        uptime_end = float(f.readline().split()[0])
+
+      elapsed = uptime_end - uptime_start
+      if elapsed <= 0:
+        # No measurable time elapsed (e.g. interval is 0); avoid dividing by
+        # zero and report no usage for this sample.
+        return 0.0
+
+      return (total_end_time - total_start_time) / elapsed * 100
+    except (OSError, IndexError, ValueError):
+      logging.exception("Failed to get CPU usage.")
+      raise
+
+  def _get_total_cpu_time(self, pid: int) -> float:
+    """Returns the total CPU time consumed by the given process in seconds.
+
+    Args:
+      pid: Id of the process to inspect.
+
+    Returns:
+      The sum of utime and stime read from /proc/<pid>/stat, converted from
+      clock ticks to seconds.
+    """
+    with open(f"/proc/{pid}/stat", "r") as f:
+      stat_data = f.read()
+    # The second field (comm) is wrapped in parentheses and may itself contain
+    # whitespace, so only split what follows the last ")". The remaining
+    # fields start at field 3, hence field N is at index N - 3.
+    stats = stat_data.rsplit(")", 1)[-1].split()
+    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
+    utime = int(stats[11])
+    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
+    stime = int(stats[12])
+    return (utime + stime) / os.sysconf("SC_CLK_TCK")
+
commit	dc2840dafc7d188ee7a4c0fb717fd3d1b2791a99	[log] [tgz]
author	Zhuoyao Zhang <zhuoyao@google.com>	Thu Sep 19 23:29:27 2024 +0000
committer	Zhuoyao Zhang <zhuoyao@google.com>	Wed Sep 25 19:05:34 2024 +0000
tree	3ee7df6681ced3ef3ad4e62f5554c69cc44e009c
parent	480c30410ca67f0ef65216163fe5419e3e29e49e [diff] [blame]