Support monitoring the subprocess in edit monitor
The daemon manager keeps monitoring the memory/CPU usage of the daemon process and kills it if the process consumes too many resources (as specified by configurable thresholds).
Test: atest daemon_manager_test
bug: 365617369
Change-Id: Ic9f8eb5a338de4e9cf7c8aba381ad752cf6aeba0
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 8ec2588..79831a7 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py
@@ -25,6 +25,9 @@
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
+# Seconds to wait between two consecutive resource usage samples.
+DEFAULT_MONITOR_INTERVAL_SECONDS = 5
+# Memory usage limit for the daemon process; compared against the resident set
+# size in MB reported by DaemonManager._get_process_memory_percent.
+DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
+# CPU usage limit for the daemon process, in percent.
+DEFAULT_CPU_USAGE_THRESHOLD = 10
def default_daemon_target():
@@ -48,6 +51,9 @@
self.pid = os.getpid()
self.daemon_process = None
+ self.max_memory_usage = 0
+ self.max_cpu_usage = 0
+
pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
pid_file_dir.mkdir(parents=True, exist_ok=True)
self.pid_file_path = self._get_pid_file_path(pid_file_dir)
@@ -61,6 +67,50 @@
except Exception as e:
logging.exception("Failed to start daemon manager with error %s", e)
+  def monitor_daemon(
+      self,
+      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
+      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
+      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
+  ):
+    """Monitors the daemon process and kills it on excessive resource usage.
+
+    Periodically samples the memory and CPU usage of the daemon process for as
+    long as it is alive, records the peak values in `self.max_memory_usage` and
+    `self.max_cpu_usage`, and terminates the process once either peak reaches
+    its threshold.
+
+    Args:
+      interval: Seconds to wait between two consecutive samples.
+      memory_threshold: Memory usage limit, in the unit returned by
+        `_get_process_memory_percent` (MB, despite the method name).
+      cpu_threshold: CPU usage limit in percent.
+    """
+    if self.daemon_process is None:
+      logging.warning("No daemon process to monitor.")
+      return
+
+    logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
+
+    while self.daemon_process.is_alive():
+      try:
+        memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
+        self.max_memory_usage = max(self.max_memory_usage, memory_usage)
+
+        cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
+        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
+      except Exception as e:
+        # Best effort: a failed sample must not stop the monitoring loop.
+        logging.warning("Failed to monitor daemon process with error: %s", e)
+
+      if (
+          self.max_memory_usage >= memory_threshold
+          or self.max_cpu_usage >= cpu_threshold
+      ):
+        logging.error(
+            "Daemon process is consuming too much resource, killing..."
+        )
+        self._terminate_process(self.daemon_process.pid)
+      else:
+        # Sleep outside the try block so that a persistently failing sample
+        # cannot turn this loop into a busy loop that floods the log.
+        time.sleep(interval)
+
+    logging.info(
+        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
+        " usage: %f.",
+        self.daemon_process.pid,
+        self.max_memory_usage,
+        self.max_cpu_usage,
+    )
+
def stop(self):
"""Stops the daemon process and removes the pidfile."""
@@ -180,3 +230,45 @@
logging.info("pid_file_path: %s", pid_file_path)
return pid_file_path
+
+  def _get_process_memory_percent(self, pid: int) -> float:
+    """Returns the resident set size (RSS) of the given process in MB.
+
+    Despite its name, the returned value is an absolute size in MB, not a
+    percentage.
+
+    Args:
+      pid: Id of the process to inspect.
+
+    Returns:
+      The RSS of the process in MB, as read from /proc/<pid>/stat.
+
+    Raises:
+      OSError: If /proc/<pid>/stat cannot be read, e.g. the process has exited.
+      IndexError: If the stat line has fewer fields than expected.
+      ValueError: If the stat line cannot be parsed.
+    """
+    try:
+      with open(f"/proc/{pid}/stat", "r") as f:
+        stat_line = f.readline()
+      # The comm field (2nd) is wrapped in parentheses and may itself contain
+      # spaces or parentheses, so only split what follows the last ")".
+      _, separator, remainder = stat_line.rpartition(")")
+      if not separator:
+        raise ValueError(f"Unexpected format of /proc/{pid}/stat")
+      fields = remainder.split()
+      # RSS is the 24th field in /proc/[pid]/stat; `fields` starts at the 3rd.
+      rss_pages = int(fields[21])
+      # Ask the system for the page size instead of assuming 4 KB pages.
+      page_size_bytes = os.sysconf("SC_PAGE_SIZE")
+      return rss_pages * page_size_bytes / (1024 * 1024)  # Convert to MB
+    except (OSError, IndexError, ValueError):
+      logging.exception("Failed to get memory usage.")
+      raise
+
+  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
+    """Returns the CPU usage of the given process in percent.
+
+    Samples the CPU time consumed by the process (utime + stime) twice,
+    `interval` seconds apart, and relates the difference to the system uptime
+    that elapsed between the two samples.
+
+    Args:
+      pid: Id of the process to inspect.
+      interval: Seconds to sleep between the two samples.
+
+    Returns:
+      CPU time consumed during the sampling window as a percentage of the
+      elapsed time, or 0.0 if no time elapsed between the two samples.
+
+    Raises:
+      OSError: If a /proc file cannot be read, e.g. the process has exited.
+      IndexError: If a /proc file has fewer fields than expected.
+      ValueError: If a /proc file cannot be parsed.
+    """
+
+    def read_uptime_seconds() -> float:
+      # The first field of /proc/uptime is the system uptime in seconds.
+      with open("/proc/uptime", "r") as f:
+        return float(f.readline().split()[0])
+
+    try:
+      cpu_time_start = self._get_total_cpu_time(pid)
+      uptime_start = read_uptime_seconds()
+
+      time.sleep(interval)
+
+      cpu_time_end = self._get_total_cpu_time(pid)
+      uptime_end = read_uptime_seconds()
+    except (OSError, IndexError, ValueError):
+      logging.exception("Failed to get CPU usage.")
+      raise
+
+    elapsed_seconds = uptime_end - uptime_start
+    if elapsed_seconds <= 0:
+      # /proc/uptime is reported with limited resolution, so a very short
+      # interval can yield two identical readings; avoid dividing by zero.
+      return 0.0
+    return (cpu_time_end - cpu_time_start) / elapsed_seconds * 100
+
+  def _get_total_cpu_time(self, pid: int) -> float:
+    """Returns the total CPU time (user + system) of a process in seconds.
+
+    Args:
+      pid: Id of the process to inspect.
+
+    Returns:
+      The sum of utime and stime from /proc/<pid>/stat, converted from clock
+      ticks to seconds.
+
+    Raises:
+      OSError: If /proc/<pid>/stat cannot be read, e.g. the process has exited.
+      IndexError: If the stat line has fewer fields than expected.
+      ValueError: If the stat line cannot be parsed.
+    """
+    with open(f"/proc/{pid}/stat", "r") as f:
+      stat_line = f.readline()
+    # The comm field (2nd) is wrapped in parentheses and may itself contain
+    # spaces or parentheses, so only split what follows the last ")".
+    _, separator, remainder = stat_line.rpartition(")")
+    if not separator:
+      raise ValueError(f"Unexpected format of /proc/{pid}/stat")
+    fields = remainder.split()
+    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks;
+    # `fields` starts at the 3rd field.
+    utime = int(fields[11])
+    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
+    stime = int(fields[12])
+    return (utime + stime) / os.sysconf("SC_CLK_TCK")
+
diff --git a/tools/edit_monitor/daemon_manager_test.py b/tools/edit_monitor/daemon_manager_test.py
index 214b038..0c9e04b 100644
--- a/tools/edit_monitor/daemon_manager_test.py
+++ b/tools/edit_monitor/daemon_manager_test.py
@@ -43,6 +43,25 @@
time.sleep(1)
+def memory_consume_daemon_target(size_mb):
+  """Allocates `size_mb` MB of memory and holds on to it for 10 seconds."""
+  try:
+    # Keep a reference so that the buffer stays allocated while sleeping.
+    held_buffer = bytearray(size_mb * 1024 * 1024)
+    time.sleep(10)
+  except MemoryError:
+    print(f'Process failed to allocate {size_mb} MB of memory.')
+
+
+def cpu_consume_daemon_target(target_usage_percent):
+  """Alternates between busy-looping and sleeping, forever.
+
+  Each cycle spins for `target_usage_percent / 100` seconds and then sleeps
+  for the remainder of one second.
+  """
+  busy_seconds = target_usage_percent / 100
+  idle_seconds = 1 - busy_seconds
+  while True:
+    cycle_start = time.time()
+    # Spin without yielding to keep the CPU busy for this part of the cycle.
+    while time.time() - cycle_start < busy_seconds:
+      pass
+
+    # Stay idle for the rest of the cycle to cap the overall usage.
+    time.sleep(idle_seconds)
+
+
class DaemonManagerTest(unittest.TestCase):
@classmethod
@@ -102,7 +121,7 @@
def test_start_success_with_existing_instance_from_different_binary(self):
# First start an instance based on "some_binary_path"
existing_dm = daemon_manager.DaemonManager(
- "some_binary_path",
+ 'some_binary_path',
daemon_target=long_running_daemon,
)
existing_dm.start()
@@ -149,6 +168,35 @@
# Verifies no daemon process is started.
self.assertIsNone(dm.daemon_process)
+  def test_monitor_daemon_subprocess_killed_high_memory_usage(self):
+    # The daemon allocates 2 MB, which is enough to reach the 2 MB threshold
+    # passed to monitor_daemon.
+    dm = daemon_manager.DaemonManager(
+        TEST_BINARY_FILE,
+        daemon_target=memory_consume_daemon_target,
+        daemon_args=(2,),
+    )
+    dm.start()
+    dm.monitor_daemon(interval=1, memory_threshold=2)
+
+    # assertGreaterEqual reports both values on failure, unlike assertTrue.
+    self.assertGreaterEqual(dm.max_memory_usage, 2)
+    self.assert_no_subprocess_running()
+
+  def test_monitor_daemon_subprocess_killed_high_cpu_usage(self):
+    # The daemon target is asked for 20 percent CPU usage, matching the
+    # 20 percent threshold passed to monitor_daemon.
+    dm = daemon_manager.DaemonManager(
+        TEST_BINARY_FILE,
+        daemon_target=cpu_consume_daemon_target,
+        daemon_args=(20,),
+    )
+    dm.start()
+    dm.monitor_daemon(interval=1, cpu_threshold=20)
+
+    # assertGreaterEqual reports both values on failure, unlike assertTrue.
+    self.assertGreaterEqual(dm.max_cpu_usage, 20)
+    self.assert_no_subprocess_running()
+
+  def test_monitor_daemon_failed_does_not_matter(self):
+    # NOTE(review): the monitoring code visible in this change reads /proc
+    # directly; confirm that patching subprocess.check_output still triggers
+    # a monitoring failure.
+    with mock.patch('subprocess.check_output') as mock_output:
+      mock_output.side_effect = OSError('Unknown OSError')
+      self.assert_run_simple_daemon_success()
+
def test_stop_success(self):
dm = daemon_manager.DaemonManager(
TEST_BINARY_FILE, daemon_target=long_running_daemon
@@ -194,7 +242,7 @@
daemon_args=(damone_output_file.name,),
)
dm.start()
- dm.daemon_process.join()
+ dm.monitor_daemon(interval=1)
# Verifies the expected pid file is created.
expected_pid_file_path = pathlib.Path(self.working_dir.name).joinpath(