Report edit monitor error to clearcut

Report unexpected errors to clearcut when the daemon manager detect
anything wrong when start/monitor/stop/reboot the edit monitor process.

Test: atest daemon_manager_test
Bug: 365617369
Change-Id: I2027229c387ce2525d1f730d06483032662ffc4b
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 4ff4ec8..29746e4 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py
@@ -13,17 +13,22 @@
 # limitations under the License.
 
 
+import getpass
 import hashlib
 import logging
 import multiprocessing
 import os
 import pathlib
+import platform
 import signal
 import subprocess
 import sys
 import tempfile
 import time
 
+from atest.metrics import clearcut_client
+from atest.proto import clientanalytics_pb2
+from proto import edit_event_pb2
 
 DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
 DEFAULT_MONITOR_INTERVAL_SECONDS = 5
@@ -31,6 +36,9 @@
 DEFAULT_CPU_USAGE_THRESHOLD = 200
 DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
 BLOCK_SIGN_FILE = "edit_monitor_block_sign"
+# Enum of the Clearcut log source defined under
+# /google3/wireless/android/play/playlog/proto/log_source_enum.proto
+LOG_SOURCE = 2524
 
 
 def default_daemon_target():
@@ -46,11 +54,16 @@
       binary_path: str,
       daemon_target: callable = default_daemon_target,
       daemon_args: tuple = (),
+      cclient: clearcut_client.Clearcut | None = None,
   ):
     self.binary_path = binary_path
     self.daemon_target = daemon_target
     self.daemon_args = daemon_args
+    self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE)
 
+    self.user_name = getpass.getuser()
+    self.host_name = platform.node()
+    self.source_root = os.environ.get("ANDROID_BUILD_TOP", "")
     self.pid = os.getpid()
     self.daemon_process = None
 
@@ -70,13 +83,20 @@
       logging.warning("Block sign found, exiting...")
       return
 
-    if self.binary_path.startswith('/google/cog/'):
+    if self.binary_path.startswith("/google/cog/"):
       logging.warning("Edit monitor for cog is not supported, exiting...")
       return
 
-    self._stop_any_existing_instance()
-    self._write_pid_to_pidfile()
-    self._start_daemon_process()
+    try:
+      self._stop_any_existing_instance()
+      self._write_pid_to_pidfile()
+      self._start_daemon_process()
+    except Exception as e:
+      logging.exception("Failed to start daemon manager with error %s", e)
+      self._send_error_event_to_clearcut(
+          edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
+      )
+      raise e
 
   def monitor_daemon(
       self,
@@ -118,6 +138,9 @@
         logging.error(
             "Daemon process is consuming too much resource, killing..."
         ),
+        self._send_error_event_to_clearcut(
+            edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE
+        )
         self._terminate_process(self.daemon_process.pid)
 
     logging.info(
@@ -143,6 +166,12 @@
       logging.debug("Successfully stopped daemon manager.")
     except Exception as e:
       logging.exception("Failed to stop daemon manager with error %s", e)
+      self._send_error_event_to_clearcut(
+          edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
+      )
+      sys.exit(1)
+    finally:
+      self.cclient.flush_events()
 
   def reboot(self):
     """Reboots the current process.
@@ -164,6 +193,9 @@
       os.execv(self.binary_path, sys.argv)
     except OSError as e:
       logging.exception("Failed to reboot process with error: %s.", e)
+      self._send_error_event_to_clearcut(
+          edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
+      )
       sys.exit(1)  # Indicate an error occurred
 
   def cleanup(self):
@@ -175,6 +207,7 @@
     that requires immediate cleanup to prevent damanger to the system.
     """
     logging.debug("Start cleaning up all existing instances.")
+    self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP)
 
     try:
       # First places a block sign to prevent any edit monitor process to start.
@@ -355,4 +388,19 @@
         except (FileNotFoundError, IOError, ValueError, TypeError):
           logging.exception("Failed to get pid from file path: %s", file)
 
-    return pids
\ No newline at end of file
+    return pids
+
+  def _send_error_event_to_clearcut(self, error_type):
+    edit_monitor_error_event_proto = edit_event_pb2.EditEvent(
+        user_name=self.user_name,
+        host_name=self.host_name,
+        source_root=self.source_root,
+    )
+    edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom(
+        edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type)
+    )
+    log_event = clientanalytics_pb2.LogEvent(
+        event_time_ms=int(time.time() * 1000),
+        source_extension=edit_monitor_error_event_proto.SerializeToString(),
+    )
+    self.cclient.log(log_event)
diff --git a/tools/edit_monitor/daemon_manager_test.py b/tools/edit_monitor/daemon_manager_test.py
index 72442c6..e132000 100644
--- a/tools/edit_monitor/daemon_manager_test.py
+++ b/tools/edit_monitor/daemon_manager_test.py
@@ -26,6 +26,7 @@
 import unittest
 from unittest import mock
 from edit_monitor import daemon_manager
+from proto import edit_event_pb2
 
 
 TEST_BINARY_FILE = '/path/to/test_binary'
@@ -133,7 +134,8 @@
 
   def test_start_return_directly_if_in_cog_env(self):
     dm = daemon_manager.DaemonManager(
-        '/google/cog/cloud/user/workspace/edit_monitor')
+        '/google/cog/cloud/user/workspace/edit_monitor'
+    )
     dm.start()
     # Verify no daemon process is started.
     self.assertIsNone(dm.daemon_process)
@@ -148,9 +150,13 @@
     with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f:
       f.write('123456')
 
-    with self.assertRaises(OSError) as error:
-      dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
+    fake_cclient = FakeClearcutClient()
+    with self.assertRaises(OSError):
+      dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient)
       dm.start()
+    self._assert_error_event_logged(
+        fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
+    )
 
   def test_start_failed_to_write_pidfile(self):
     pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath(
@@ -160,40 +166,63 @@
     # Makes the directory read-only so write pidfile will fail.
     os.chmod(pid_file_path_dir, 0o555)
 
-    with self.assertRaises(PermissionError) as error:
-      dm = daemon_manager.DaemonManager(TEST_BINARY_FILE)
+    fake_cclient = FakeClearcutClient()
+    with self.assertRaises(PermissionError):
+      dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient)
       dm.start()
+    self._assert_error_event_logged(
+        fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
+    )
 
   def test_start_failed_to_start_daemon_process(self):
-    with self.assertRaises(TypeError) as error:
+    fake_cclient = FakeClearcutClient()
+    with self.assertRaises(TypeError):
       dm = daemon_manager.DaemonManager(
-          TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1)
+          TEST_BINARY_FILE,
+          daemon_target='wrong_target',
+          daemon_args=(1),
+          cclient=fake_cclient,
       )
       dm.start()
+    self._assert_error_event_logged(
+        fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
+    )
 
   def test_monitor_daemon_subprocess_killed_high_memory_usage(self):
+    fake_cclient = FakeClearcutClient()
     dm = daemon_manager.DaemonManager(
         TEST_BINARY_FILE,
         daemon_target=memory_consume_daemon_target,
         daemon_args=(2,),
+        cclient=fake_cclient,
     )
     dm.start()
     dm.monitor_daemon(interval=1, memory_threshold=2)
 
     self.assertTrue(dm.max_memory_usage >= 2)
     self.assert_no_subprocess_running()
+    self._assert_error_event_logged(
+        fake_cclient,
+        edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE,
+    )
 
   def test_monitor_daemon_subprocess_killed_high_cpu_usage(self):
+    fake_cclient = FakeClearcutClient()
     dm = daemon_manager.DaemonManager(
         TEST_BINARY_FILE,
         daemon_target=cpu_consume_daemon_target,
         daemon_args=(20,),
+        cclient=fake_cclient,
     )
     dm.start()
     dm.monitor_daemon(interval=1, cpu_threshold=20)
 
     self.assertTrue(dm.max_cpu_usage >= 20)
     self.assert_no_subprocess_running()
+    self._assert_error_event_logged(
+        fake_cclient,
+        edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE,
+    )
 
   @mock.patch('subprocess.check_output')
   def test_monitor_daemon_failed_does_not_matter(self, mock_output):
@@ -207,7 +236,8 @@
     )
 
     dm = daemon_manager.DaemonManager(
-        binary_file.name, daemon_target=long_running_daemon
+        binary_file.name,
+        daemon_target=long_running_daemon,
     )
     dm.start()
     dm.monitor_daemon(reboot_timeout=0.5)
@@ -226,27 +256,42 @@
   @mock.patch('os.kill')
   def test_stop_failed_to_kill_daemon_process(self, mock_kill):
     mock_kill.side_effect = OSError('Unknown OSError')
+    fake_cclient = FakeClearcutClient()
     dm = daemon_manager.DaemonManager(
-        TEST_BINARY_FILE, daemon_target=long_running_daemon
+        TEST_BINARY_FILE,
+        daemon_target=long_running_daemon,
+        cclient=fake_cclient,
     )
-    dm.start()
-    dm.stop()
 
-    self.assertTrue(dm.daemon_process.is_alive())
-    self.assertTrue(dm.pid_file_path.exists())
+    with self.assertRaises(SystemExit):
+      dm.start()
+      dm.stop()
+      self.assertTrue(dm.daemon_process.is_alive())
+      self.assertTrue(dm.pid_file_path.exists())
+    self._assert_error_event_logged(
+        fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
+    )
 
   @mock.patch('os.remove')
   def test_stop_failed_to_remove_pidfile(self, mock_remove):
     mock_remove.side_effect = OSError('Unknown OSError')
 
+    fake_cclient = FakeClearcutClient()
     dm = daemon_manager.DaemonManager(
-        TEST_BINARY_FILE, daemon_target=long_running_daemon
+        TEST_BINARY_FILE,
+        daemon_target=long_running_daemon,
+        cclient=fake_cclient,
     )
-    dm.start()
-    dm.stop()
 
-    self.assert_no_subprocess_running()
-    self.assertTrue(dm.pid_file_path.exists())
+    with self.assertRaises(SystemExit):
+      dm.start()
+      dm.stop()
+      self.assert_no_subprocess_running()
+      self.assertTrue(dm.pid_file_path.exists())
+
+    self._assert_error_event_logged(
+        fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
+    )
 
   @mock.patch('os.execv')
   def test_reboot_success(self, mock_execv):
@@ -273,7 +318,7 @@
     )
     dm.start()
 
-    with self.assertRaises(SystemExit) as cm:
+    with self.assertRaises(SystemExit):
       dm.reboot()
       mock_execv.assert_not_called()
       self.assertEqual(cm.exception.code, 0)
@@ -281,18 +326,24 @@
   @mock.patch('os.execv')
   def test_reboot_failed(self, mock_execv):
     mock_execv.side_effect = OSError('Unknown OSError')
+    fake_cclient = FakeClearcutClient()
     binary_file = tempfile.NamedTemporaryFile(
         dir=self.working_dir.name, delete=False
     )
 
     dm = daemon_manager.DaemonManager(
-        binary_file.name, daemon_target=long_running_daemon
+        binary_file.name,
+        daemon_target=long_running_daemon,
+        cclient=fake_cclient,
     )
     dm.start()
 
-    with self.assertRaises(SystemExit) as cm:
+    with self.assertRaises(SystemExit):
       dm.reboot()
       self.assertEqual(cm.exception.code, 1)
+    self._assert_error_event_logged(
+        fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
+    )
 
   def assert_run_simple_daemon_success(self):
     damone_output_file = tempfile.NamedTemporaryFile(
@@ -374,6 +425,33 @@
       f.write(str(p.pid))
     return p
 
+  def _assert_error_event_logged(self, fake_cclient, error_type):
+    error_events = fake_cclient.get_sent_events()
+    self.assertEquals(len(error_events), 1)
+    self.assertEquals(
+        edit_event_pb2.EditEvent.FromString(
+            error_events[0].source_extension
+        ).edit_monitor_error_event.error_type,
+        error_type,
+    )
+
+
+class FakeClearcutClient:
+
+  def __init__(self):
+    self.pending_log_events = []
+    self.sent_log_event = []
+
+  def log(self, log_event):
+    self.pending_log_events.append(log_event)
+
+  def flush_events(self):
+    self.sent_log_event.extend(self.pending_log_events)
+    self.pending_log_events.clear()
+
+  def get_sent_events(self):
+    return self.sent_log_event + self.pending_log_events
+
 
 if __name__ == '__main__':
   unittest.main()