Fix a race condition in edit monitor
When there are multiple edit monitor starting around the same time,
there will be cases that the pidfile is deleted or overridden as
multiple processes are trying to remove and create the pidfile around
the same time (see details in b/382135550).
This cl fix it by checking the pid in the pidfile and
only remove the pidfile if the pid stored there is the expected one.
Bug: 382135550
Test: atest edit_monitor_integration_test
Change-Id: Ifdb4c29c8b36052fea83ce59de1be9d99db3a852
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 0c31ab8..a352e30 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py
@@ -149,15 +149,15 @@
edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE
)
logging.error(
- "Daemon process is consuming too much memory, rebooting...")
+ "Daemon process is consuming too much memory, rebooting..."
+ )
self.reboot()
if self.max_cpu_usage >= cpu_threshold:
self._send_error_event_to_clearcut(
edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE
)
- logging.error(
- "Daemon process is consuming too much cpu, killing...")
+ logging.error("Daemon process is consuming too much cpu, killing...")
self._terminate_process(self.daemon_process.pid)
logging.info(
@@ -179,7 +179,7 @@
self._wait_for_process_terminate(self.daemon_process.pid, 1)
if self.daemon_process.is_alive():
self._terminate_process(self.daemon_process.pid)
- self._remove_pidfile()
+ self._remove_pidfile(self.pid)
logging.info("Successfully stopped daemon manager.")
except Exception as e:
logging.exception("Failed to stop daemon manager with error %s", e)
@@ -253,11 +253,15 @@
if ex_pid:
logging.info("Found another instance with pid %d.", ex_pid)
self._terminate_process(ex_pid)
- self._remove_pidfile()
+ self._remove_pidfile(ex_pid)
- def _read_pid_from_pidfile(self):
- with open(self.pid_file_path, "r") as f:
- return int(f.read().strip())
+ def _read_pid_from_pidfile(self) -> int | None:
+ try:
+ with open(self.pid_file_path, "r") as f:
+ return int(f.read().strip())
+ except FileNotFoundError as e:
+ logging.warning("pidfile %s does not exist.", self.pid_file_path)
+ return None
def _write_pid_to_pidfile(self):
"""Creates a pidfile and writes the current pid to the file.
@@ -333,7 +337,23 @@
)
return True
- def _remove_pidfile(self):
+ def _remove_pidfile(self, expected_pid: int):
+ recorded_pid = self._read_pid_from_pidfile()
+
+ if recorded_pid is None:
+ logging.info("pid file %s already removed.", self.pid_file_path)
+ return
+
+ if recorded_pid != expected_pid:
+ logging.warning(
+ "pid file contains pid from a different process, expected pid: %d,"
+ " actual pid: %d.",
+ expected_pid,
+ recorded_pid,
+ )
+ return
+
+ logging.debug("removing pidfile written by process %s", expected_pid)
try:
os.remove(self.pid_file_path)
except FileNotFoundError:
@@ -378,9 +398,7 @@
uptime_end = float(f.readline().split()[0])
return (
- (total_end_time - total_start_time)
- / (uptime_end - uptime_start)
- * 100
+ (total_end_time - total_start_time) / (uptime_end - uptime_start) * 100
)
def _get_total_cpu_time(self, pid: int) -> float:
diff --git a/tools/edit_monitor/edit_monitor_integration_test.py b/tools/edit_monitor/edit_monitor_integration_test.py
index 3d28274..f39b936 100644
--- a/tools/edit_monitor/edit_monitor_integration_test.py
+++ b/tools/edit_monitor/edit_monitor_integration_test.py
@@ -15,6 +15,7 @@
"""Integration tests for Edit Monitor."""
import glob
+from importlib import resources
import logging
import os
import pathlib
@@ -25,8 +26,6 @@
import tempfile
import time
import unittest
-
-from importlib import resources
from unittest import mock
@@ -49,7 +48,8 @@
self.root_monitoring_path.mkdir()
self.edit_monitor_binary_path = self._import_executable("edit_monitor")
self.patch = mock.patch.dict(
- os.environ, {'ENABLE_ANDROID_EDIT_MONITOR': 'true'})
+ os.environ, {"ENABLE_ANDROID_EDIT_MONITOR": "true"}
+ )
self.patch.start()
def tearDown(self):
@@ -83,7 +83,21 @@
self.assertEqual(self._get_logged_events_num(), 4)
- def _start_edit_monitor_process(self):
+ def test_start_multiple_edit_monitor_only_one_started(self):
+ p1 = self._start_edit_monitor_process(wait_for_observer_start=False)
+ p2 = self._start_edit_monitor_process(wait_for_observer_start=False)
+ p3 = self._start_edit_monitor_process(wait_for_observer_start=False)
+
+ live_processes = self._get_live_processes([p1, p2, p3])
+
+ # Cleanup all live processes.
+ for p in live_processes:
+ os.kill(p.pid, signal.SIGINT)
+ p.communicate()
+
+ self.assertEqual(len(live_processes), 1)
+
+ def _start_edit_monitor_process(self, wait_for_observer_start=True):
command = f"""
export TMPDIR="{self.working_dir.name}"
{self.edit_monitor_binary_path} --path={self.root_monitoring_path} --dry_run"""
@@ -94,7 +108,9 @@
start_new_session=True,
executable="/bin/bash",
)
- self._wait_for_observer_start(time_out=5)
+ if wait_for_observer_start:
+ self._wait_for_observer_start(time_out=5)
+
return p
def _wait_for_observer_start(self, time_out):
@@ -125,6 +141,18 @@
return 0
+ def _get_live_processes(self, processes):
+ live_processes = []
+ for p in processes:
+ try:
+ p.wait(timeout=5)
+ except subprocess.TimeoutExpired as e:
+ live_processes.append(p)
+ logging.info("process: %d still alive.", p.pid)
+ else:
+ logging.info("process: %d stopped.", p.pid)
+ return live_processes
+
def _import_executable(self, executable_name: str) -> pathlib.Path:
binary_dir = pathlib.Path(self.working_dir.name).joinpath("binary")
binary_dir.mkdir()
diff --git a/tools/edit_monitor/main.py b/tools/edit_monitor/main.py
index 49385f1..7ca0daa 100644
--- a/tools/edit_monitor/main.py
+++ b/tools/edit_monitor/main.py
@@ -72,7 +72,8 @@
root_logging_dir = tempfile.mkdtemp(prefix='edit_monitor_')
_, log_path = tempfile.mkstemp(dir=root_logging_dir, suffix='.log')
- log_fmt = '%(asctime)s %(filename)s:%(lineno)s:%(levelname)s: %(message)s'
+
+ log_fmt = '%(asctime)s.%(msecs)03d %(filename)s:%(lineno)s:%(levelname)s: %(message)s'
date_fmt = '%Y-%m-%d %H:%M:%S'
log_level = logging.DEBUG if verbose else logging.INFO