Support monitoring the subprocess in edit monitor

The daemon manager keeps monitoring the memory/CPU usage of the daemon process and kills it if the process consumes too many resources (i.e. exceeds the configured thresholds).

Test: atest daemon_manager_test
bug: 365617369
Change-Id: Ic9f8eb5a338de4e9cf7c8aba381ad752cf6aeba0
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 8ec2588..79831a7 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py
@@ -25,6 +25,9 @@
 
 
 DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
+DEFAULT_MONITOR_INTERVAL_SECONDS = 5
+DEFAULT_MEMORY_USAGE_THRESHOLD = 2000  # In MB.
+DEFAULT_CPU_USAGE_THRESHOLD = 10  # In percent.
 
 
 def default_daemon_target():
@@ -48,6 +51,9 @@
     self.pid = os.getpid()
     self.daemon_process = None
 
+    self.max_memory_usage = 0
+    self.max_cpu_usage = 0
+
     pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
     pid_file_dir.mkdir(parents=True, exist_ok=True)
     self.pid_file_path = self._get_pid_file_path(pid_file_dir)
@@ -61,6 +67,50 @@
     except Exception as e:
       logging.exception("Failed to start daemon manager with error %s", e)
 
+  def monitor_daemon(
+      self,
+      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
+      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
+      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
+  ):
+    """Monitors the daemon process and kills it on excessive resource usage.
+
+    Samples memory (MB) and CPU (percent) usage about every interval seconds
+    while the daemon is alive and terminates it once a peak hits a threshold.
+    """
+    if self.daemon_process is None:
+      return
+    pid = self.daemon_process.pid
+    logging.info("start monitoring daemon process %d.", pid)
+    while self.daemon_process.is_alive():
+      try:
+        memory_usage = self._get_process_memory_percent(pid)
+        self.max_memory_usage = max(self.max_memory_usage, memory_usage)
+        cpu_usage = self._get_process_cpu_percent(pid)
+        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
+      except Exception as e:
+        # Best effort: a failed sample must not abort the monitoring loop.
+        logging.warning("Failed to monitor daemon process with error: %s", e)
+      if (
+          self.max_memory_usage >= memory_threshold
+          or self.max_cpu_usage >= cpu_threshold
+      ):
+        logging.error(
+            "Daemon process is consuming too much resource, killing..."
+        )
+        self._terminate_process(pid)
+      else:
+        # Sleeping outside the try keeps a failing sample from busy-looping.
+        time.sleep(interval)
+
+    logging.info(
+        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
+        " usage: %f.",
+        pid,
+        self.max_memory_usage,
+        self.max_cpu_usage,
+    )
+
   def stop(self):
     """Stops the daemon process and removes the pidfile."""
 
@@ -180,3 +230,45 @@
     logging.info("pid_file_path: %s", pid_file_path)
 
     return pid_file_path
+
+  def _get_process_memory_percent(self, pid: int) -> float:
+    """Returns the resident set size of `pid` in MB (not a percentage)."""
+    try:
+      with open(f"/proc/{pid}/stat", "r") as f:
+        # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
+        rss_pages = int(f.readline().split()[23])
+      return rss_pages * os.sysconf("SC_PAGE_SIZE") / (1024 * 1024)  # To MB
+    except (FileNotFoundError, IndexError, ValueError, IOError):
+      logging.exception("Failed to get memory usage.")
+      raise
+
+  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
+    """Returns the CPU usage (percent) of `pid` sampled over `interval` secs."""
+    try:
+      cpu_time_before = self._get_total_cpu_time(pid)
+      with open("/proc/uptime", "r") as f:
+        uptime_before = float(f.readline().split()[0])
+
+      time.sleep(interval)
+
+      cpu_time_after = self._get_total_cpu_time(pid)
+      with open("/proc/uptime", "r") as f:
+        uptime_after = float(f.readline().split()[0])
+
+      # Share of the elapsed uptime the process spent on CPU (user + system).
+      cpu_seconds = cpu_time_after - cpu_time_before
+      elapsed_seconds = uptime_after - uptime_before
+      return cpu_seconds / elapsed_seconds * 100
+    except (FileNotFoundError, IndexError, ValueError, IOError) as e:
+      logging.exception("Failed to get CPU usage.")
+      raise e
+
+  def _get_total_cpu_time(self, pid: int) -> float:
+    """Returns the user + system CPU time consumed by `pid`, in seconds."""
+    with open(f"/proc/{pid}/stat", "r") as stat_file:
+      fields = stat_file.readline().split()
+    # utime and stime are the 14th and 15th fields of /proc/[pid]/stat,
+    # both measured in clock ticks.
+    ticks = int(fields[13]) + int(fields[14])
+    return ticks / os.sysconf(os.sysconf_names["SC_CLK_TCK"])
+
diff --git a/tools/edit_monitor/daemon_manager_test.py b/tools/edit_monitor/daemon_manager_test.py
index 214b038..0c9e04b 100644
--- a/tools/edit_monitor/daemon_manager_test.py
+++ b/tools/edit_monitor/daemon_manager_test.py
@@ -43,6 +43,25 @@
     time.sleep(1)
 
 
+def memory_consume_daemon_target(size_mb):
+  """Holds a size_mb MB buffer for 10 seconds, then returns."""
+  try:
+    held_buffer = bytearray(size_mb * 1024 * 1024)
+    time.sleep(10)
+  except MemoryError:
+    print(f'Process failed to allocate {size_mb} MB of memory.')
+
+
+def cpu_consume_daemon_target(target_usage_percent):
+  """Burns CPU for target_usage_percent% of every second, forever."""
+  busy_seconds = target_usage_percent / 100
+  while True:
+    cycle_start = time.time()
+    while time.time() - cycle_start < busy_seconds:
+      pass  # Busy loop to consume CPU
+    time.sleep(1 - busy_seconds)  # Idle for the rest of the second.
+
+
 class DaemonManagerTest(unittest.TestCase):
 
   @classmethod
@@ -102,7 +121,7 @@
   def test_start_success_with_existing_instance_from_different_binary(self):
     # First start an instance based on "some_binary_path"
     existing_dm = daemon_manager.DaemonManager(
-        "some_binary_path",
+        'some_binary_path',
         daemon_target=long_running_daemon,
     )
     existing_dm.start()
@@ -149,6 +168,35 @@
     # Verifies no daemon process is started.
     self.assertIsNone(dm.daemon_process)
 
+  def test_monitor_daemon_subprocess_killed_high_memory_usage(self):
+    dm = daemon_manager.DaemonManager(
+        TEST_BINARY_FILE,
+        daemon_target=memory_consume_daemon_target,
+        daemon_args=(2,),
+    )
+    dm.start()
+    dm.monitor_daemon(interval=1, memory_threshold=2)
+    # The recorded peak must have reached the threshold that triggers the kill.
+    self.assertGreaterEqual(dm.max_memory_usage, 2)
+    self.assert_no_subprocess_running()
+
+  def test_monitor_daemon_subprocess_killed_high_cpu_usage(self):
+    dm = daemon_manager.DaemonManager(
+        TEST_BINARY_FILE,
+        daemon_target=cpu_consume_daemon_target,
+        daemon_args=(20,),
+    )
+    dm.start()
+    dm.monitor_daemon(interval=1, cpu_threshold=20)
+    # The recorded peak must have reached the threshold that triggers the kill.
+    self.assertGreaterEqual(dm.max_cpu_usage, 20)
+    self.assert_no_subprocess_running()
+
+  @mock.patch('subprocess.check_output')  # NOTE(review): confirm this is hit.
+  def test_monitor_daemon_failed_does_not_matter(self, mock_output):
+    mock_output.side_effect = OSError('Unknown OSError')
+    self.assert_run_simple_daemon_success()
+
   def test_stop_success(self):
     dm = daemon_manager.DaemonManager(
         TEST_BINARY_FILE, daemon_target=long_running_daemon
@@ -194,7 +242,7 @@
         daemon_args=(damone_output_file.name,),
     )
     dm.start()
-    dm.daemon_process.join()
+    dm.monitor_daemon(interval=1)
 
     # Verifies the expected pid file is created.
     expected_pid_file_path = pathlib.Path(self.working_dir.name).joinpath(