Fix a race condition in edit monitor When there are multiple edit monitor starting around the same time, there will be cases that the pidfile is deleted or overridden as multiple processes are trying to remove and create the pidfile around the same time (see details in b/382135550). This cl fix it by checking the pid in the pidfile and only remove the pidfile if the pid stored there is the expected one. Bug: 382135550 Test: atest edit_monitor_integration_test Change-Id: Ifdb4c29c8b36052fea83ce59de1be9d99db3a852

commit: a7107913d9ed38b23d140fe29eef1ff374663902 [log] [tgz]
author: Zhuoyao Zhang <zhuoyao@google.com> Wed Dec 04 17:39:54 2024 +0000
committer: Zhuoyao Zhang <zhuoyao@google.com> Wed Dec 04 21:09:44 2024 +0000
tree: 603c7aa29ff5ce6a91989a8a7875d873bb19bf65
parent: 3b2cf40ee8595d3cc88b66a86dede83e56228a8d [diff]
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 0c31ab8..a352e30 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py

@@ -149,15 +149,15 @@
             edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE
         )
         logging.error(
-            "Daemon process is consuming too much memory, rebooting...")
+            "Daemon process is consuming too much memory, rebooting..."
+        )
         self.reboot()
 
       if self.max_cpu_usage >= cpu_threshold:
         self._send_error_event_to_clearcut(
             edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE
         )
-        logging.error(
-            "Daemon process is consuming too much cpu, killing...")
+        logging.error("Daemon process is consuming too much cpu, killing...")
         self._terminate_process(self.daemon_process.pid)
 
     logging.info(
@@ -179,7 +179,7 @@
         self._wait_for_process_terminate(self.daemon_process.pid, 1)
         if self.daemon_process.is_alive():
           self._terminate_process(self.daemon_process.pid)
-      self._remove_pidfile()
+      self._remove_pidfile(self.pid)
       logging.info("Successfully stopped daemon manager.")
     except Exception as e:
       logging.exception("Failed to stop daemon manager with error %s", e)
@@ -253,11 +253,15 @@
     if ex_pid:
       logging.info("Found another instance with pid %d.", ex_pid)
       self._terminate_process(ex_pid)
-      self._remove_pidfile()
+      self._remove_pidfile(ex_pid)
 
-  def _read_pid_from_pidfile(self):
-    with open(self.pid_file_path, "r") as f:
-      return int(f.read().strip())
+  def _read_pid_from_pidfile(self) -> int | None:
+    try:
+      with open(self.pid_file_path, "r") as f:
+        return int(f.read().strip())
+    except FileNotFoundError as e:
+      logging.warning("pidfile %s does not exist.", self.pid_file_path)
+      return None
 
   def _write_pid_to_pidfile(self):
     """Creates a pidfile and writes the current pid to the file.
@@ -333,7 +337,23 @@
       )
       return True
 
-  def _remove_pidfile(self):
+  def _remove_pidfile(self, expected_pid: int):
+    recorded_pid = self._read_pid_from_pidfile()
+
+    if recorded_pid is None:
+      logging.info("pid file %s already removed.", self.pid_file_path)
+      return
+
+    if recorded_pid != expected_pid:
+      logging.warning(
+          "pid file contains pid from a different process, expected pid: %d,"
+          " actual pid: %d.",
+          expected_pid,
+          recorded_pid,
+      )
+      return
+
+    logging.debug("removing pidfile written by process %s", expected_pid)
     try:
       os.remove(self.pid_file_path)
     except FileNotFoundError:
@@ -378,9 +398,7 @@
       uptime_end = float(f.readline().split()[0])
 
     return (
-        (total_end_time - total_start_time)
-        / (uptime_end - uptime_start)
-        * 100
+        (total_end_time - total_start_time) / (uptime_end - uptime_start) * 100
     )
 
   def _get_total_cpu_time(self, pid: int) -> float:

diff --git a/tools/edit_monitor/edit_monitor_integration_test.py b/tools/edit_monitor/edit_monitor_integration_test.py
index 3d28274..f39b936 100644
--- a/tools/edit_monitor/edit_monitor_integration_test.py
+++ b/tools/edit_monitor/edit_monitor_integration_test.py

@@ -15,6 +15,7 @@
 """Integration tests for Edit Monitor."""
 
 import glob
+from importlib import resources
 import logging
 import os
 import pathlib
@@ -25,8 +26,6 @@
 import tempfile
 import time
 import unittest
-
-from importlib import resources
 from unittest import mock
 
 
@@ -49,7 +48,8 @@
     self.root_monitoring_path.mkdir()
     self.edit_monitor_binary_path = self._import_executable("edit_monitor")
     self.patch = mock.patch.dict(
-        os.environ, {'ENABLE_ANDROID_EDIT_MONITOR': 'true'})
+        os.environ, {"ENABLE_ANDROID_EDIT_MONITOR": "true"}
+    )
     self.patch.start()
 
   def tearDown(self):
@@ -83,7 +83,21 @@
 
     self.assertEqual(self._get_logged_events_num(), 4)
 
-  def _start_edit_monitor_process(self):
+  def test_start_multiple_edit_monitor_only_one_started(self):
+    p1 = self._start_edit_monitor_process(wait_for_observer_start=False)
+    p2 = self._start_edit_monitor_process(wait_for_observer_start=False)
+    p3 = self._start_edit_monitor_process(wait_for_observer_start=False)
+
+    live_processes = self._get_live_processes([p1, p2, p3])
+
+    # Cleanup all live processes.
+    for p in live_processes:
+      os.kill(p.pid, signal.SIGINT)
+      p.communicate()
+
+    self.assertEqual(len(live_processes), 1)
+
+  def _start_edit_monitor_process(self, wait_for_observer_start=True):
     command = f"""
     export TMPDIR="{self.working_dir.name}"
     {self.edit_monitor_binary_path} --path={self.root_monitoring_path} --dry_run"""
@@ -94,7 +108,9 @@
         start_new_session=True,
         executable="/bin/bash",
     )
-    self._wait_for_observer_start(time_out=5)
+    if wait_for_observer_start:
+      self._wait_for_observer_start(time_out=5)
+
     return p
 
   def _wait_for_observer_start(self, time_out):
@@ -125,6 +141,18 @@
 
     return 0
 
+  def _get_live_processes(self, processes):
+    live_processes = []
+    for p in processes:
+      try:
+        p.wait(timeout=5)
+      except subprocess.TimeoutExpired as e:
+        live_processes.append(p)
+        logging.info("process: %d still alive.", p.pid)
+      else:
+        logging.info("process: %d stopped.", p.pid)
+    return live_processes
+
   def _import_executable(self, executable_name: str) -> pathlib.Path:
     binary_dir = pathlib.Path(self.working_dir.name).joinpath("binary")
     binary_dir.mkdir()

diff --git a/tools/edit_monitor/main.py b/tools/edit_monitor/main.py
index 49385f1..7ca0daa 100644
--- a/tools/edit_monitor/main.py
+++ b/tools/edit_monitor/main.py

@@ -72,7 +72,8 @@
   root_logging_dir = tempfile.mkdtemp(prefix='edit_monitor_')
   _, log_path = tempfile.mkstemp(dir=root_logging_dir, suffix='.log')
 
-  log_fmt = '%(asctime)s %(filename)s:%(lineno)s:%(levelname)s: %(message)s'
+
+  log_fmt = '%(asctime)s.%(msecs)03d %(filename)s:%(lineno)s:%(levelname)s: %(message)s'
   date_fmt = '%Y-%m-%d %H:%M:%S'
   log_level = logging.DEBUG if verbose else logging.INFO
commit	a7107913d9ed38b23d140fe29eef1ff374663902	[log] [tgz]
author	Zhuoyao Zhang <zhuoyao@google.com>	Wed Dec 04 17:39:54 2024 +0000
committer	Zhuoyao Zhang <zhuoyao@google.com>	Wed Dec 04 21:09:44 2024 +0000
tree	603c7aa29ff5ce6a91989a8a7875d873bb19bf65
parent	3b2cf40ee8595d3cc88b66a86dede83e56228a8d [diff]