Report edit monitor error to clearcut
Report unexpected errors to clearcut when the daemon manager detect
anything wrong when start/monitor/stop/reboot the edit monitor process.
Test: atest daemon_manager_test
Bug: 365617369
Change-Id: I2027229c387ce2525d1f730d06483032662ffc4b
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 4ff4ec8..29746e4 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py
@@ -13,17 +13,22 @@
# limitations under the License.
+import getpass
import hashlib
import logging
import multiprocessing
import os
import pathlib
+import platform
import signal
import subprocess
import sys
import tempfile
import time
+from atest.metrics import clearcut_client
+from atest.proto import clientanalytics_pb2
+from proto import edit_event_pb2
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
@@ -31,6 +36,9 @@
DEFAULT_CPU_USAGE_THRESHOLD = 200
DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
BLOCK_SIGN_FILE = "edit_monitor_block_sign"
+# Enum of the Clearcut log source defined under
+# /google3/wireless/android/play/playlog/proto/log_source_enum.proto
+LOG_SOURCE = 2524
def default_daemon_target():
@@ -46,11 +54,16 @@
binary_path: str,
daemon_target: callable = default_daemon_target,
daemon_args: tuple = (),
+ cclient: clearcut_client.Clearcut | None = None,
):
self.binary_path = binary_path
self.daemon_target = daemon_target
self.daemon_args = daemon_args
+ self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE)
+ self.user_name = getpass.getuser()
+ self.host_name = platform.node()
+ self.source_root = os.environ.get("ANDROID_BUILD_TOP", "")
self.pid = os.getpid()
self.daemon_process = None
@@ -70,13 +83,20 @@
logging.warning("Block sign found, exiting...")
return
- if self.binary_path.startswith('/google/cog/'):
+ if self.binary_path.startswith("/google/cog/"):
logging.warning("Edit monitor for cog is not supported, exiting...")
return
- self._stop_any_existing_instance()
- self._write_pid_to_pidfile()
- self._start_daemon_process()
+ try:
+ self._stop_any_existing_instance()
+ self._write_pid_to_pidfile()
+ self._start_daemon_process()
+ except Exception as e:
+ logging.exception("Failed to start daemon manager with error %s", e)
+ self._send_error_event_to_clearcut(
+ edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
+ )
+ raise e
def monitor_daemon(
self,
@@ -118,6 +138,9 @@
logging.error(
"Daemon process is consuming too much resource, killing..."
),
+ self._send_error_event_to_clearcut(
+ edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE
+ )
self._terminate_process(self.daemon_process.pid)
logging.info(
@@ -143,6 +166,12 @@
logging.debug("Successfully stopped daemon manager.")
except Exception as e:
logging.exception("Failed to stop daemon manager with error %s", e)
+ self._send_error_event_to_clearcut(
+ edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
+ )
+ sys.exit(1)
+ finally:
+ self.cclient.flush_events()
def reboot(self):
"""Reboots the current process.
@@ -164,6 +193,9 @@
os.execv(self.binary_path, sys.argv)
except OSError as e:
logging.exception("Failed to reboot process with error: %s.", e)
+ self._send_error_event_to_clearcut(
+ edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
+ )
sys.exit(1) # Indicate an error occurred
def cleanup(self):
@@ -175,6 +207,7 @@
that requires immediate cleanup to prevent damanger to the system.
"""
logging.debug("Start cleaning up all existing instances.")
+ self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP)
try:
# First places a block sign to prevent any edit monitor process to start.
@@ -355,4 +388,19 @@
except (FileNotFoundError, IOError, ValueError, TypeError):
logging.exception("Failed to get pid from file path: %s", file)
- return pids
\ No newline at end of file
+ return pids
+
+ def _send_error_event_to_clearcut(self, error_type):
+ edit_monitor_error_event_proto = edit_event_pb2.EditEvent(
+ user_name=self.user_name,
+ host_name=self.host_name,
+ source_root=self.source_root,
+ )
+ edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom(
+ edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type)
+ )
+ log_event = clientanalytics_pb2.LogEvent(
+ event_time_ms=int(time.time() * 1000),
+ source_extension=edit_monitor_error_event_proto.SerializeToString(),
+ )
+ self.cclient.log(log_event)
diff --git a/tools/edit_monitor/daemon_manager_test.py b/tools/edit_monitor/daemon_manager_test.py
index 72442c6..e132000 100644
--- a/tools/edit_monitor/daemon_manager_test.py
+++ b/tools/edit_monitor/daemon_manager_test.py
@@ -26,6 +26,7 @@
import unittest
from unittest import mock
from edit_monitor import daemon_manager
+from proto import edit_event_pb2
TEST_BINARY_FILE = '/path/to/test_binary'
@@ -133,7 +134,8 @@
def test_start_return_directly_if_in_cog_env(self):
dm = daemon_manager.DaemonManager(
- '/google/cog/cloud/user/workspace/edit_monitor')
+ '/google/cog/cloud/user/workspace/edit_monitor'
+ )
dm.start()
# Verify no daemon process is started.
self.assertIsNone(dm.daemon_process)
@@ -148,9 +150,13 @@
with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f:
f.write('123456')
- with self.assertRaises(OSError) as error:
- dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
+ fake_cclient = FakeClearcutClient()
+ with self.assertRaises(OSError):
+ dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient)
dm.start()
+ self._assert_error_event_logged(
+ fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
+ )
def test_start_failed_to_write_pidfile(self):
pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath(
@@ -160,40 +166,63 @@
# Makes the directory read-only so write pidfile will fail.
os.chmod(pid_file_path_dir, 0o555)
- with self.assertRaises(PermissionError) as error:
- dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
+ fake_cclient = FakeClearcutClient()
+ with self.assertRaises(PermissionError):
+ dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient)
dm.start()
+ self._assert_error_event_logged(
+ fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
+ )
def test_start_failed_to_start_daemon_process(self):
- with self.assertRaises(TypeError) as error:
+ fake_cclient = FakeClearcutClient()
+ with self.assertRaises(TypeError):
dm = daemon_manager.DaemonManager(
- TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1)
+ TEST_BINARY_FILE,
+ daemon_target='wrong_target',
+ daemon_args=(1),
+ cclient=fake_cclient,
)
dm.start()
+ self._assert_error_event_logged(
+ fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
+ )
def test_monitor_daemon_subprocess_killed_high_memory_usage(self):
+ fake_cclient = FakeClearcutClient()
dm = daemon_manager.DaemonManager(
TEST_BINARY_FILE,
daemon_target=memory_consume_daemon_target,
daemon_args=(2,),
+ cclient=fake_cclient,
)
dm.start()
dm.monitor_daemon(interval=1, memory_threshold=2)
self.assertTrue(dm.max_memory_usage >= 2)
self.assert_no_subprocess_running()
+ self._assert_error_event_logged(
+ fake_cclient,
+ edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE,
+ )
def test_monitor_daemon_subprocess_killed_high_cpu_usage(self):
+ fake_cclient = FakeClearcutClient()
dm = daemon_manager.DaemonManager(
TEST_BINARY_FILE,
daemon_target=cpu_consume_daemon_target,
daemon_args=(20,),
+ cclient=fake_cclient,
)
dm.start()
dm.monitor_daemon(interval=1, cpu_threshold=20)
self.assertTrue(dm.max_cpu_usage >= 20)
self.assert_no_subprocess_running()
+ self._assert_error_event_logged(
+ fake_cclient,
+ edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE,
+ )
@mock.patch('subprocess.check_output')
def test_monitor_daemon_failed_does_not_matter(self, mock_output):
@@ -207,7 +236,8 @@
)
dm = daemon_manager.DaemonManager(
- binary_file.name, daemon_target=long_running_daemon
+ binary_file.name,
+ daemon_target=long_running_daemon,
)
dm.start()
dm.monitor_daemon(reboot_timeout=0.5)
@@ -226,27 +256,42 @@
@mock.patch('os.kill')
def test_stop_failed_to_kill_daemon_process(self, mock_kill):
mock_kill.side_effect = OSError('Unknown OSError')
+ fake_cclient = FakeClearcutClient()
dm = daemon_manager.DaemonManager(
- TEST_BINARY_FILE, daemon_target=long_running_daemon
+ TEST_BINARY_FILE,
+ daemon_target=long_running_daemon,
+ cclient=fake_cclient,
)
- dm.start()
- dm.stop()
- self.assertTrue(dm.daemon_process.is_alive())
- self.assertTrue(dm.pid_file_path.exists())
+ with self.assertRaises(SystemExit):
+ dm.start()
+ dm.stop()
+ self.assertTrue(dm.daemon_process.is_alive())
+ self.assertTrue(dm.pid_file_path.exists())
+ self._assert_error_event_logged(
+ fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
+ )
@mock.patch('os.remove')
def test_stop_failed_to_remove_pidfile(self, mock_remove):
mock_remove.side_effect = OSError('Unknown OSError')
+ fake_cclient = FakeClearcutClient()
dm = daemon_manager.DaemonManager(
- TEST_BINARY_FILE, daemon_target=long_running_daemon
+ TEST_BINARY_FILE,
+ daemon_target=long_running_daemon,
+ cclient=fake_cclient,
)
- dm.start()
- dm.stop()
- self.assert_no_subprocess_running()
- self.assertTrue(dm.pid_file_path.exists())
+ with self.assertRaises(SystemExit):
+ dm.start()
+ dm.stop()
+ self.assert_no_subprocess_running()
+ self.assertTrue(dm.pid_file_path.exists())
+
+ self._assert_error_event_logged(
+ fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
+ )
@mock.patch('os.execv')
def test_reboot_success(self, mock_execv):
@@ -273,7 +318,7 @@
)
dm.start()
- with self.assertRaises(SystemExit) as cm:
+ with self.assertRaises(SystemExit):
dm.reboot()
mock_execv.assert_not_called()
self.assertEqual(cm.exception.code, 0)
@@ -281,18 +326,24 @@
@mock.patch('os.execv')
def test_reboot_failed(self, mock_execv):
mock_execv.side_effect = OSError('Unknown OSError')
+ fake_cclient = FakeClearcutClient()
binary_file = tempfile.NamedTemporaryFile(
dir=self.working_dir.name, delete=False
)
dm = daemon_manager.DaemonManager(
- binary_file.name, daemon_target=long_running_daemon
+ binary_file.name,
+ daemon_target=long_running_daemon,
+ cclient=fake_cclient,
)
dm.start()
- with self.assertRaises(SystemExit) as cm:
+ with self.assertRaises(SystemExit):
dm.reboot()
self.assertEqual(cm.exception.code, 1)
+ self._assert_error_event_logged(
+ fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
+ )
def assert_run_simple_daemon_success(self):
damone_output_file = tempfile.NamedTemporaryFile(
@@ -374,6 +425,33 @@
f.write(str(p.pid))
return p
+ def _assert_error_event_logged(self, fake_cclient, error_type):
+ error_events = fake_cclient.get_sent_events()
+ self.assertEquals(len(error_events), 1)
+ self.assertEquals(
+ edit_event_pb2.EditEvent.FromString(
+ error_events[0].source_extension
+ ).edit_monitor_error_event.error_type,
+ error_type,
+ )
+
+
+class FakeClearcutClient:
+
+ def __init__(self):
+ self.pending_log_events = []
+ self.sent_log_event = []
+
+ def log(self, log_event):
+ self.pending_log_events.append(log_event)
+
+ def flush_events(self):
+ self.sent_log_event.extend(self.pending_log_events)
+ self.pending_log_events.clear()
+
+ def get_sent_events(self):
+ return self.sent_log_event + self.pending_log_events
+
if __name__ == '__main__':
unittest.main()