Support monitoring the subprocess in edit monitor
The daemon manager keeps monitoring the memory/CPU usage of the daemon process and kills it if the process consumes too many resources, i.e. if its usage reaches a configurable threshold.
Test: atest daemon_manager_test
bug: 365617369
Change-Id: Ic9f8eb5a338de4e9cf7c8aba381ad752cf6aeba0
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 8ec2588..79831a7 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py
@@ -25,6 +25,9 @@
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
+# Seconds to wait between two consecutive resource usage checks.
+DEFAULT_MONITOR_INTERVAL_SECONDS = 5
+# Memory limit for the daemon process; compared against the value computed by
+# _get_process_memory_percent, which is expressed in MB.
+DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
+# CPU limit for the daemon process; compared against the percentage computed
+# by _get_process_cpu_percent.
+DEFAULT_CPU_USAGE_THRESHOLD = 10
def default_daemon_target():
@@ -48,6 +51,9 @@
self.pid = os.getpid()
self.daemon_process = None
+ self.max_memory_usage = 0
+ self.max_cpu_usage = 0
+
pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
pid_file_dir.mkdir(parents=True, exist_ok=True)
self.pid_file_path = self._get_pid_file_path(pid_file_dir)
@@ -61,6 +67,50 @@
except Exception as e:
logging.exception("Failed to start daemon manager with error %s", e)
+  def monitor_daemon(
+      self,
+      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
+      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
+      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
+  ):
+    """Monitors the daemon process status.
+
+    Periodically checks the CPU/memory usage of the daemon process for as long
+    as the process is still running, and kills the process once the highest
+    usage observed so far reaches one of the given thresholds.
+
+    Args:
+      interval: Seconds to wait between two consecutive usage samples.
+      memory_threshold: Limit for the value reported by
+        `_get_process_memory_percent`; reaching it gets the daemon killed.
+      cpu_threshold: Limit for the value reported by
+        `_get_process_cpu_percent`; reaching it gets the daemon killed.
+    """
+    logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
+
+    while self.daemon_process.is_alive():
+      try:
+        memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
+        self.max_memory_usage = max(self.max_memory_usage, memory_usage)
+
+        cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
+        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
+      except Exception as e:
+        # Sampling is best effort: log the error and keep monitoring.
+        logging.warning("Failed to monitor daemon process with error: %s", e)
+
+      # Sleep outside of the try block so that a failed sample is paced as
+      # well; otherwise a persistent failure would spin this loop and flood
+      # the log.
+      time.sleep(interval)
+
+      if (
+          self.max_memory_usage >= memory_threshold
+          or self.max_cpu_usage >= cpu_threshold
+      ):
+        logging.error(
+            "Daemon process is consuming too much resource, killing..."
+        )
+        self._terminate_process(self.daemon_process.pid)
+
+    logging.info(
+        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
+        " usage: %f.",
+        self.daemon_process.pid,
+        self.max_memory_usage,
+        self.max_cpu_usage,
+    )
+
def stop(self):
"""Stops the daemon process and removes the pidfile."""
@@ -180,3 +230,45 @@
logging.info("pid_file_path: %s", pid_file_path)
return pid_file_path
+
+  def _get_process_memory_percent(self, pid: int) -> float:
+    """Returns the resident memory of a process in MB.
+
+    Despite the method name, the value is an absolute size in MB and not a
+    percentage.
+
+    Args:
+      pid: Id of the process to inspect.
+
+    Returns:
+      The resident set size (RSS) read from /proc/<pid>/stat, converted from
+      pages to MB.
+
+    Raises:
+      FileNotFoundError: If /proc/<pid>/stat does not exist.
+      IOError: If the stat file or the page size cannot be read.
+      IndexError: If the stat data has fewer fields than expected.
+      ValueError: If the RSS field is not an integer.
+    """
+    try:
+      with open(f"/proc/{pid}/stat", "r") as f:
+        stat_line = f.read()
+      # The second field (comm) is the executable name wrapped in parentheses
+      # and may itself contain spaces or parentheses, so only split what
+      # follows the last ")". That remainder starts at the third field.
+      fields = stat_line.rsplit(")", 1)[1].split()
+      # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
+      rss_pages = int(fields[21])
+      # Ask the system for the page size instead of assuming 4 KiB pages.
+      page_size = os.sysconf("SC_PAGE_SIZE")
+      return rss_pages * page_size / (1024 * 1024)  # Convert to MB
+    except (FileNotFoundError, IndexError, ValueError, IOError):
+      logging.exception("Failed to get memory usage.")
+      raise
+
+  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
+    """Returns the CPU usage of a process, in percent, over a short window.
+
+    Samples the CPU time consumed by the process (via `_get_total_cpu_time`)
+    and the system uptime, sleeps for `interval` seconds, samples both again
+    and returns the consumed CPU time relative to the elapsed uptime.
+
+    Args:
+      pid: Id of the process to inspect.
+      interval: Seconds to sleep between the two samples.
+
+    Returns:
+      The CPU time used during the window divided by the elapsed uptime, times
+      100. Returns 0.0 when no uptime elapsed between the two samples (e.g.
+      when `interval` is 0) instead of dividing by zero.
+
+    Raises:
+      FileNotFoundError: If a /proc file that is read does not exist.
+      IOError: If a /proc file cannot be read.
+      IndexError: If the data read from /proc has fewer fields than expected.
+      ValueError: If a field read from /proc is not a number.
+    """
+
+    def read_uptime() -> float:
+      # The first field of /proc/uptime is the system uptime in seconds.
+      with open("/proc/uptime", "r") as f:
+        return float(f.readline().split()[0])
+
+    try:
+      total_start_time = self._get_total_cpu_time(pid)
+      uptime_start = read_uptime()
+
+      time.sleep(interval)
+
+      total_end_time = self._get_total_cpu_time(pid)
+      uptime_end = read_uptime()
+
+      elapsed = uptime_end - uptime_start
+      if elapsed <= 0:
+        # Nothing to relate the CPU time to; report no usage rather than
+        # failing with a ZeroDivisionError.
+        return 0.0
+      return (total_end_time - total_start_time) / elapsed * 100
+    except (FileNotFoundError, IndexError, ValueError, IOError):
+      logging.exception("Failed to get CPU usage.")
+      raise
+
+  def _get_total_cpu_time(self, pid: int) -> float:
+    """Returns the total CPU time consumed by a process, in seconds.
+
+    Args:
+      pid: Id of the process to inspect.
+
+    Returns:
+      The sum of the utime and stime fields of /proc/<pid>/stat, converted
+      from clock ticks to seconds.
+
+    Raises:
+      OSError: If /proc/<pid>/stat cannot be read.
+      IndexError: If the stat data has fewer fields than expected.
+      ValueError: If utime or stime is not an integer.
+    """
+    with open(f"/proc/{pid}/stat", "r") as f:
+      stat_line = f.read()
+    # The second field (comm) is the executable name wrapped in parentheses
+    # and may itself contain spaces or parentheses, so only split what follows
+    # the last ")". That remainder starts at the third field.
+    fields = stat_line.rsplit(")", 1)[1].split()
+    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
+    utime = int(fields[11])
+    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
+    stime = int(fields[12])
+    return (utime + stime) / os.sysconf("SC_CLK_TCK")
+