Support cleaning up all existing edit monitor instances. Add a cleanup method to the daemon manager that stops all existing edit monitor instances and places a block sign to prevent any edit monitor from starting. This method is only used in an emergency, when something goes wrong with the edit monitor and immediate cleanup is needed to prevent damage to the system. Test: atest daemon_manager_test bug: 365617369 Change-Id: I8bff4f82a8ce272ccafb4ff8b076f05f56609426

commit: d28da5cfe3e9362f2c83241e106505703f9f4d73 [log] [tgz]
author: Zhuoyao Zhang <zhuoyao@google.com> Tue Sep 24 19:46:12 2024 +0000
committer: Zhuoyao Zhang <zhuoyao@google.com> Tue Oct 01 18:22:36 2024 +0000
tree: a579455dff11e3f0c1919401311029ab081be10c
parent: ae332aea96352698a8323b74c15a56891521d949 [diff]
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 1876451..445d849 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py

@@ -30,6 +30,7 @@
 DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
 DEFAULT_CPU_USAGE_THRESHOLD = 200
 DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
+BLOCK_SIGN_FILE = "edit_monitor_block_sign"
 
 
 def default_daemon_target():
@@ -59,15 +60,19 @@
     pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
     pid_file_dir.mkdir(parents=True, exist_ok=True)
     self.pid_file_path = self._get_pid_file_path(pid_file_dir)
+    self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath(
+        BLOCK_SIGN_FILE
+    )
 
   def start(self):
     """Writes the pidfile and starts the daemon proces."""
-    try:
-      self._stop_any_existing_instance()
-      self._write_pid_to_pidfile()
-      self._start_daemon_process()
-    except Exception as e:
-      logging.exception("Failed to start daemon manager with error %s", e)
+    if self.block_sign.exists():
+      logging.warning("Block sign found, exiting...")
+      return
+
+    self._stop_any_existing_instance()
+    self._write_pid_to_pidfile()
+    self._start_daemon_process()
 
   def monitor_daemon(
       self,
@@ -82,6 +87,9 @@
     process is still running and kill the process if the resource usage is above
     given thresholds.
     """
+    if not self.daemon_process:
+      return
+
     logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
     reboot_time = time.time() + reboot_timeout
     while self.daemon_process.is_alive():
@@ -150,6 +158,33 @@
       logging.exception("Failed to reboot process with error: %s.", e)
       sys.exit(1)  # Indicate an error occurred
 
+  def cleanup(self):
+    """Wipes out all edit monitor instances in the system.
+
+    Stops all the existing edit monitor instances and places a block sign
+    to prevent any edit monitor process from starting. This method is only
+    used in an emergency, when something goes wrong with the edit monitor
+    that requires immediate cleanup to prevent damage to the system.
+    """
+    logging.debug("Start cleaning up all existing instances.")
+
+    try:
+      # First places a block sign to prevent any edit monitor process to start.
+      self.block_sign.touch()
+    except (FileNotFoundError, PermissionError, OSError):
+      logging.exception("Failed to place the block sign")
+
+    # Finds and kills all the existing instances of edit monitor.
+    existing_instances_pids = self._find_all_instances_pids()
+    for pid in existing_instances_pids:
+      logging.info(
+          "Found existing edit monitor instance with pid %d, killing...", pid
+      )
+      try:
+        self._terminate_process(pid)
+      except Exception:
+        logging.exception("Failed to terminate process %d", pid)
+
   def _stop_any_existing_instance(self):
     if not self.pid_file_path.exists():
       logging.debug("No existing instances.")
@@ -300,3 +335,15 @@
       stime = int(stats[14])
       return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"])
 
+  def _find_all_instances_pids(self) -> list[int]:
+    pids = []
+
+    for file in os.listdir(self.pid_file_path.parent):
+      if file.endswith(".lock"):
+        try:
+          with open(self.pid_file_path.parent.joinpath(file), "r") as f:
+            pids.append(int(f.read().strip()))
+        except (FileNotFoundError, IOError, ValueError, TypeError):
+          logging.exception("Failed to get pid from file path: %s", file)
+
+    return pids
\ No newline at end of file

diff --git a/tools/edit_monitor/daemon_manager_test.py b/tools/edit_monitor/daemon_manager_test.py
index bcfa850..d62eade 100644
--- a/tools/edit_monitor/daemon_manager_test.py
+++ b/tools/edit_monitor/daemon_manager_test.py

@@ -27,6 +27,7 @@
 from unittest import mock
 from edit_monitor import daemon_manager
 
+
 TEST_BINARY_FILE = '/path/to/test_binary'
 TEST_PID_FILE_PATH = (
     '587239c2d1050afdf54512e2d799f3b929f86b43575eb3c7b4bab105dd9bd25e.lock'
@@ -92,20 +93,10 @@
     self.assert_run_simple_daemon_success()
 
   def test_start_success_with_existing_instance_running(self):
-    # Create a long running subprocess
-    p = multiprocessing.Process(target=long_running_daemon)
-    p.start()
-
-    # Create a pidfile with the subprocess pid
-    pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath(
-        'edit_monitor'
-    )
-    pid_file_path_dir.mkdir(parents=True, exist_ok=True)
-    with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f:
-      f.write(str(p.pid))
+    # Create a running daemon subprocess
+    p = self._create_fake_deamon_process()
 
     self.assert_run_simple_daemon_success()
-    p.terminate()
 
   def test_start_success_with_existing_instance_already_dead(self):
     # Create a pidfile with pid that does not exist.
@@ -129,6 +120,17 @@
     self.assert_run_simple_daemon_success()
     existing_dm.stop()
 
+  def test_start_return_directly_if_block_sign_exists(self):
+    # Creates the block sign.
+    pathlib.Path(self.working_dir.name).joinpath(
+        daemon_manager.BLOCK_SIGN_FILE
+    ).touch()
+
+    dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
+    dm.start()
+    # Verify no daemon process is started.
+    self.assertIsNone(dm.daemon_process)
+
   @mock.patch('os.kill')
   def test_start_failed_to_kill_existing_instance(self, mock_kill):
     mock_kill.side_effect = OSError('Unknown OSError')
@@ -139,11 +141,9 @@
     with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f:
       f.write('123456')
 
-    dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
-    dm.start()
-
-    # Verify no daemon process is started.
-    self.assertIsNone(dm.daemon_process)
+    with self.assertRaises(OSError) as error:
+      dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
+      dm.start()
 
   def test_start_failed_to_write_pidfile(self):
     pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath(
@@ -153,20 +153,16 @@
     # Makes the directory read-only so write pidfile will fail.
     os.chmod(pid_file_path_dir, 0o555)
 
-    dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
-    dm.start()
-
-    # Verifies no daemon process is started.
-    self.assertIsNone(dm.daemon_process)
+    with self.assertRaises(PermissionError) as error:
+      dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
+      dm.start()
 
   def test_start_failed_to_start_daemon_process(self):
-    dm = daemon_manager.DaemonManager(
-        TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1)
-    )
-    dm.start()
-
-    # Verifies no daemon process is started.
-    self.assertIsNone(dm.daemon_process)
+    with self.assertRaises(TypeError) as error:
+      dm = daemon_manager.DaemonManager(
+          TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1)
+      )
+      dm.start()
 
   def test_monitor_daemon_subprocess_killed_high_memory_usage(self):
     dm = daemon_manager.DaemonManager(
@@ -321,7 +317,7 @@
           self._is_process_alive(child_pid), f'process {child_pid} still alive'
       )
 
-  def _get_child_processes(self, parent_pid):
+  def _get_child_processes(self, parent_pid: int) -> list[int]:
     try:
       output = subprocess.check_output(
           ['ps', '-o', 'pid,ppid', '--no-headers'], text=True
@@ -336,7 +332,7 @@
     except subprocess.CalledProcessError as e:
       self.fail(f'failed to get child process, error: {e}')
 
-  def _is_process_alive(self, pid):
+  def _is_process_alive(self, pid: int) -> bool:
     try:
       output = subprocess.check_output(
           ['ps', '-p', str(pid), '-o', 'state='], text=True
@@ -355,6 +351,22 @@
         # process already terminated
         pass
 
+  def _create_fake_deamon_process(
+      self, name: str = ''
+  ) -> multiprocessing.Process:
+    # Create a long running subprocess
+    p = multiprocessing.Process(target=long_running_daemon)
+    p.start()
+
+    # Create the pidfile with the subprocess pid
+    pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath(
+        'edit_monitor'
+    )
+    pid_file_path_dir.mkdir(parents=True, exist_ok=True)
+    with open(pid_file_path_dir.joinpath(name + 'pid.lock'), 'w') as f:
+      f.write(str(p.pid))
+    return p
+
 
 if __name__ == '__main__':
   unittest.main()
commit	d28da5cfe3e9362f2c83241e106505703f9f4d73	[log] [tgz]
author	Zhuoyao Zhang <zhuoyao@google.com>	Tue Sep 24 19:46:12 2024 +0000
committer	Zhuoyao Zhang <zhuoyao@google.com>	Tue Oct 01 18:22:36 2024 +0000
tree	a579455dff11e3f0c1919401311029ab081be10c
parent	ae332aea96352698a8323b74c15a56891521d949 [diff]