Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 1 | # Copyright 2024, The Android Open Source Project |
| 2 | # |
| 3 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | # you may not use this file except in compliance with the License. |
| 5 | # You may obtain a copy of the License at |
| 6 | # |
| 7 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | # |
| 9 | # Unless required by applicable law or agreed to in writing, software |
| 10 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | # See the License for the specific language governing permissions and |
| 13 | # limitations under the License. |
| 14 | |
| 15 | |
Zhuoyao Zhang | ec657ce | 2024-12-06 19:17:46 +0000 | [diff] [blame] | 16 | import errno |
| 17 | import fcntl |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 18 | import getpass |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 19 | import hashlib |
| 20 | import logging |
| 21 | import multiprocessing |
| 22 | import os |
| 23 | import pathlib |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 24 | import platform |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 25 | import signal |
| 26 | import subprocess |
Zhuoyao Zhang | 205a2fc | 2024-09-20 18:19:59 +0000 | [diff] [blame] | 27 | import sys |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 28 | import tempfile |
| 29 | import time |
| 30 | |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 31 | from atest.metrics import clearcut_client |
| 32 | from atest.proto import clientanalytics_pb2 |
Zhuoyao Zhang | 3ca7cef | 2024-10-31 22:07:31 +0000 | [diff] [blame] | 33 | from edit_monitor import utils |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 34 | from proto import edit_event_pb2 |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 35 | |
Zhuoyao Zhang | 5d16222 | 2024-10-24 23:10:04 +0000 | [diff] [blame] | 36 | DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 5 |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 37 | DEFAULT_MONITOR_INTERVAL_SECONDS = 5 |
Zhuoyao Zhang | 6988272 | 2024-11-15 18:32:18 +0000 | [diff] [blame] | 38 | DEFAULT_MEMORY_USAGE_THRESHOLD = 0.02 # 2% of total memory |
Zhuoyao Zhang | 205a2fc | 2024-09-20 18:19:59 +0000 | [diff] [blame] | 39 | DEFAULT_CPU_USAGE_THRESHOLD = 200 |
| 40 | DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24 |
Zhuoyao Zhang | d28da5c | 2024-09-24 19:46:12 +0000 | [diff] [blame] | 41 | BLOCK_SIGN_FILE = "edit_monitor_block_sign" |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 42 | # Enum of the Clearcut log source defined under |
| 43 | # /google3/wireless/android/play/playlog/proto/log_source_enum.proto |
| 44 | LOG_SOURCE = 2524 |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 45 | |
| 46 | |
| 47 | def default_daemon_target(): |
| 48 | """Place holder for the default daemon target.""" |
| 49 | print("default daemon target") |
| 50 | |
| 51 | |
| 52 | class DaemonManager: |
| 53 | """Class to manage and monitor the daemon run as a subprocess.""" |
| 54 | |
| 55 | def __init__( |
| 56 | self, |
| 57 | binary_path: str, |
| 58 | daemon_target: callable = default_daemon_target, |
| 59 | daemon_args: tuple = (), |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 60 | cclient: clearcut_client.Clearcut | None = None, |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 61 | ): |
| 62 | self.binary_path = binary_path |
| 63 | self.daemon_target = daemon_target |
| 64 | self.daemon_args = daemon_args |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 65 | self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE) |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 66 | |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 67 | self.user_name = getpass.getuser() |
| 68 | self.host_name = platform.node() |
| 69 | self.source_root = os.environ.get("ANDROID_BUILD_TOP", "") |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 70 | self.pid = os.getpid() |
| 71 | self.daemon_process = None |
| 72 | |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 73 | self.max_memory_usage = 0 |
| 74 | self.max_cpu_usage = 0 |
Zhuoyao Zhang | 6988272 | 2024-11-15 18:32:18 +0000 | [diff] [blame] | 75 | self.total_memory_size = os.sysconf("SC_PAGE_SIZE") * os.sysconf( |
| 76 | "SC_PHYS_PAGES" |
| 77 | ) |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 78 | |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 79 | pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor") |
| 80 | pid_file_dir.mkdir(parents=True, exist_ok=True) |
| 81 | self.pid_file_path = self._get_pid_file_path(pid_file_dir) |
Zhuoyao Zhang | d28da5c | 2024-09-24 19:46:12 +0000 | [diff] [blame] | 82 | self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath( |
| 83 | BLOCK_SIGN_FILE |
| 84 | ) |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 85 | |
| 86 | def start(self): |
| 87 | """Writes the pidfile and starts the daemon proces.""" |
Zhuoyao Zhang | 3ca7cef | 2024-10-31 22:07:31 +0000 | [diff] [blame] | 88 | if not utils.is_feature_enabled( |
| 89 | "edit_monitor", |
| 90 | self.user_name, |
Zhuoyao Zhang | d1c4a8b | 2024-11-06 21:48:45 +0000 | [diff] [blame] | 91 | "ENABLE_ANDROID_EDIT_MONITOR", |
Zhuoyao Zhang | bedd13b | 2024-11-21 19:42:48 +0000 | [diff] [blame] | 92 | 100, |
Zhuoyao Zhang | 3ca7cef | 2024-10-31 22:07:31 +0000 | [diff] [blame] | 93 | ): |
| 94 | logging.warning("Edit monitor is disabled, exiting...") |
| 95 | return |
| 96 | |
Zhuoyao Zhang | d28da5c | 2024-09-24 19:46:12 +0000 | [diff] [blame] | 97 | if self.block_sign.exists(): |
| 98 | logging.warning("Block sign found, exiting...") |
| 99 | return |
| 100 | |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 101 | if self.binary_path.startswith("/google/cog/"): |
Zhuoyao Zhang | 05e28fa | 2024-10-04 21:58:39 +0000 | [diff] [blame] | 102 | logging.warning("Edit monitor for cog is not supported, exiting...") |
| 103 | return |
| 104 | |
Zhuoyao Zhang | ec657ce | 2024-12-06 19:17:46 +0000 | [diff] [blame] | 105 | setup_lock_file = pathlib.Path(tempfile.gettempdir()).joinpath( |
| 106 | self.pid_file_path.name + ".setup" |
| 107 | ) |
| 108 | logging.info("setup lock file: %s", setup_lock_file) |
| 109 | with open(setup_lock_file, "w") as f: |
| 110 | try: |
| 111 | # Acquire an exclusive lock |
| 112 | fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) |
| 113 | self._stop_any_existing_instance() |
| 114 | self._write_pid_to_pidfile() |
| 115 | self._start_daemon_process() |
| 116 | except Exception as e: |
| 117 | if ( |
| 118 | isinstance(e, IOError) and e.errno == errno.EAGAIN |
| 119 | ): # Failed to acquire the file lock. |
| 120 | logging.warning("Another edit monitor is starting, exitinng...") |
| 121 | return |
| 122 | else: |
| 123 | logging.exception("Failed to start daemon manager with error %s", e) |
| 124 | self._send_error_event_to_clearcut( |
| 125 | edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR |
| 126 | ) |
| 127 | raise e |
| 128 | finally: |
| 129 | # Release the lock |
| 130 | fcntl.flock(f, fcntl.LOCK_UN) |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 131 | |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 132 | def monitor_daemon( |
| 133 | self, |
| 134 | interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS, |
| 135 | memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD, |
| 136 | cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD, |
Zhuoyao Zhang | 205a2fc | 2024-09-20 18:19:59 +0000 | [diff] [blame] | 137 | reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS, |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 138 | ): |
| 139 | """Monits the daemon process status. |
| 140 | |
| 141 | Periodically check the CPU/Memory usage of the daemon process as long as the |
| 142 | process is still running and kill the process if the resource usage is above |
| 143 | given thresholds. |
| 144 | """ |
Zhuoyao Zhang | d28da5c | 2024-09-24 19:46:12 +0000 | [diff] [blame] | 145 | if not self.daemon_process: |
| 146 | return |
| 147 | |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 148 | logging.info("start monitoring daemon process %d.", self.daemon_process.pid) |
Zhuoyao Zhang | 205a2fc | 2024-09-20 18:19:59 +0000 | [diff] [blame] | 149 | reboot_time = time.time() + reboot_timeout |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 150 | while self.daemon_process.is_alive(): |
Zhuoyao Zhang | 205a2fc | 2024-09-20 18:19:59 +0000 | [diff] [blame] | 151 | if time.time() > reboot_time: |
| 152 | self.reboot() |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 153 | try: |
| 154 | memory_usage = self._get_process_memory_percent(self.daemon_process.pid) |
| 155 | self.max_memory_usage = max(self.max_memory_usage, memory_usage) |
| 156 | |
| 157 | cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid) |
| 158 | self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage) |
| 159 | |
| 160 | time.sleep(interval) |
| 161 | except Exception as e: |
| 162 | # Logging the error and continue. |
| 163 | logging.warning("Failed to monitor daemon process with error: %s", e) |
| 164 | |
Zhuoyao Zhang | 585b434 | 2024-11-12 22:46:43 +0000 | [diff] [blame] | 165 | if self.max_memory_usage >= memory_threshold: |
Zhuoyao Zhang | 78fd076 | 2024-11-18 22:15:42 +0000 | [diff] [blame] | 166 | self._send_error_event_to_clearcut( |
| 167 | edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE |
| 168 | ) |
| 169 | logging.error( |
Zhuoyao Zhang | a710791 | 2024-12-04 17:39:54 +0000 | [diff] [blame] | 170 | "Daemon process is consuming too much memory, rebooting..." |
| 171 | ) |
Zhuoyao Zhang | 78fd076 | 2024-11-18 22:15:42 +0000 | [diff] [blame] | 172 | self.reboot() |
Zhuoyao Zhang | 585b434 | 2024-11-12 22:46:43 +0000 | [diff] [blame] | 173 | |
| 174 | if self.max_cpu_usage >= cpu_threshold: |
Zhuoyao Zhang | 78fd076 | 2024-11-18 22:15:42 +0000 | [diff] [blame] | 175 | self._send_error_event_to_clearcut( |
| 176 | edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE |
| 177 | ) |
Zhuoyao Zhang | a710791 | 2024-12-04 17:39:54 +0000 | [diff] [blame] | 178 | logging.error("Daemon process is consuming too much cpu, killing...") |
Zhuoyao Zhang | 78fd076 | 2024-11-18 22:15:42 +0000 | [diff] [blame] | 179 | self._terminate_process(self.daemon_process.pid) |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 180 | |
| 181 | logging.info( |
| 182 | "Daemon process %d terminated. Max memory usage: %f, Max cpu" |
| 183 | " usage: %f.", |
| 184 | self.daemon_process.pid, |
| 185 | self.max_memory_usage, |
| 186 | self.max_cpu_usage, |
| 187 | ) |
| 188 | |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 189 | def stop(self): |
| 190 | """Stops the daemon process and removes the pidfile.""" |
| 191 | |
Zhuoyao Zhang | 64ad75f | 2024-10-25 00:21:45 +0000 | [diff] [blame] | 192 | logging.info("in daemon manager cleanup.") |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 193 | try: |
Zhuoyao Zhang | 8a22579 | 2024-10-09 18:04:39 +0000 | [diff] [blame] | 194 | if self.daemon_process: |
| 195 | # The daemon process might already in termination process, |
| 196 | # wait some time before kill it explicitly. |
| 197 | self._wait_for_process_terminate(self.daemon_process.pid, 1) |
| 198 | if self.daemon_process.is_alive(): |
| 199 | self._terminate_process(self.daemon_process.pid) |
Zhuoyao Zhang | a710791 | 2024-12-04 17:39:54 +0000 | [diff] [blame] | 200 | self._remove_pidfile(self.pid) |
Zhuoyao Zhang | 64ad75f | 2024-10-25 00:21:45 +0000 | [diff] [blame] | 201 | logging.info("Successfully stopped daemon manager.") |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 202 | except Exception as e: |
| 203 | logging.exception("Failed to stop daemon manager with error %s", e) |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 204 | self._send_error_event_to_clearcut( |
| 205 | edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR |
| 206 | ) |
| 207 | sys.exit(1) |
| 208 | finally: |
| 209 | self.cclient.flush_events() |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 210 | |
Zhuoyao Zhang | 205a2fc | 2024-09-20 18:19:59 +0000 | [diff] [blame] | 211 | def reboot(self): |
| 212 | """Reboots the current process. |
| 213 | |
| 214 | Stops the current daemon manager and reboots the entire process based on |
| 215 | the binary file. Exits directly If the binary file no longer exists. |
| 216 | """ |
Zhuoyao Zhang | 64ad75f | 2024-10-25 00:21:45 +0000 | [diff] [blame] | 217 | logging.info("Rebooting process based on binary %s.", self.binary_path) |
Zhuoyao Zhang | 205a2fc | 2024-09-20 18:19:59 +0000 | [diff] [blame] | 218 | |
| 219 | # Stop the current daemon manager first. |
| 220 | self.stop() |
| 221 | |
| 222 | # If the binary no longer exists, exit directly. |
| 223 | if not os.path.exists(self.binary_path): |
| 224 | logging.info("binary %s no longer exists, exiting.", self.binary_path) |
| 225 | sys.exit(0) |
| 226 | |
| 227 | try: |
| 228 | os.execv(self.binary_path, sys.argv) |
| 229 | except OSError as e: |
| 230 | logging.exception("Failed to reboot process with error: %s.", e) |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 231 | self._send_error_event_to_clearcut( |
| 232 | edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR |
| 233 | ) |
Zhuoyao Zhang | 205a2fc | 2024-09-20 18:19:59 +0000 | [diff] [blame] | 234 | sys.exit(1) # Indicate an error occurred |
| 235 | |
Zhuoyao Zhang | d28da5c | 2024-09-24 19:46:12 +0000 | [diff] [blame] | 236 | def cleanup(self): |
| 237 | """Wipes out all edit monitor instances in the system. |
| 238 | |
| 239 | Stops all the existing edit monitor instances and place a block sign |
| 240 | to prevent any edit monitor process to start. This method is only used |
| 241 | in emergency case when there's something goes wrong with the edit monitor |
| 242 | that requires immediate cleanup to prevent damanger to the system. |
| 243 | """ |
| 244 | logging.debug("Start cleaning up all existing instances.") |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 245 | self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP) |
Zhuoyao Zhang | d28da5c | 2024-09-24 19:46:12 +0000 | [diff] [blame] | 246 | |
| 247 | try: |
| 248 | # First places a block sign to prevent any edit monitor process to start. |
| 249 | self.block_sign.touch() |
| 250 | except (FileNotFoundError, PermissionError, OSError): |
| 251 | logging.exception("Failed to place the block sign") |
| 252 | |
| 253 | # Finds and kills all the existing instances of edit monitor. |
| 254 | existing_instances_pids = self._find_all_instances_pids() |
| 255 | for pid in existing_instances_pids: |
| 256 | logging.info( |
| 257 | "Found existing edit monitor instance with pid %d, killing...", pid |
| 258 | ) |
| 259 | try: |
| 260 | self._terminate_process(pid) |
| 261 | except Exception: |
| 262 | logging.exception("Failed to terminate process %d", pid) |
| 263 | |
Zhuoyao Zhang | 4d48559 | 2024-09-17 21:14:38 +0000 | [diff] [blame] | 264 | def _stop_any_existing_instance(self): |
| 265 | if not self.pid_file_path.exists(): |
| 266 | logging.debug("No existing instances.") |
| 267 | return |
| 268 | |
| 269 | ex_pid = self._read_pid_from_pidfile() |
| 270 | |
| 271 | if ex_pid: |
| 272 | logging.info("Found another instance with pid %d.", ex_pid) |
| 273 | self._terminate_process(ex_pid) |
Zhuoyao Zhang | a710791 | 2024-12-04 17:39:54 +0000 | [diff] [blame] | 274 | self._remove_pidfile(ex_pid) |
Zhuoyao Zhang | 4d48559 | 2024-09-17 21:14:38 +0000 | [diff] [blame] | 275 | |
Zhuoyao Zhang | a710791 | 2024-12-04 17:39:54 +0000 | [diff] [blame] | 276 | def _read_pid_from_pidfile(self) -> int | None: |
| 277 | try: |
| 278 | with open(self.pid_file_path, "r") as f: |
| 279 | return int(f.read().strip()) |
| 280 | except FileNotFoundError as e: |
| 281 | logging.warning("pidfile %s does not exist.", self.pid_file_path) |
| 282 | return None |
Zhuoyao Zhang | 4d48559 | 2024-09-17 21:14:38 +0000 | [diff] [blame] | 283 | |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 284 | def _write_pid_to_pidfile(self): |
| 285 | """Creates a pidfile and writes the current pid to the file. |
| 286 | |
| 287 | Raise FileExistsError if the pidfile already exists. |
| 288 | """ |
| 289 | try: |
| 290 | # Use the 'x' mode to open the file for exclusive creation |
| 291 | with open(self.pid_file_path, "x") as f: |
| 292 | f.write(f"{self.pid}") |
| 293 | except FileExistsError as e: |
| 294 | # This could be caused due to race condition that a user is trying |
| 295 | # to start two edit monitors at the same time. Or because there is |
| 296 | # already an existing edit monitor running and we can not kill it |
| 297 | # for some reason. |
| 298 | logging.exception("pidfile %s already exists.", self.pid_file_path) |
| 299 | raise e |
| 300 | |
| 301 | def _start_daemon_process(self): |
| 302 | """Starts a subprocess to run the daemon.""" |
| 303 | p = multiprocessing.Process( |
| 304 | target=self.daemon_target, args=self.daemon_args |
| 305 | ) |
Zhuoyao Zhang | 8a22579 | 2024-10-09 18:04:39 +0000 | [diff] [blame] | 306 | p.daemon = True |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 307 | p.start() |
| 308 | |
| 309 | logging.info("Start subprocess with PID %d", p.pid) |
| 310 | self.daemon_process = p |
| 311 | |
| 312 | def _terminate_process( |
| 313 | self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS |
| 314 | ): |
| 315 | """Terminates a process with given pid. |
| 316 | |
| 317 | It first sends a SIGTERM to the process to allow it for proper |
| 318 | termination with a timeout. If the process is not terminated within |
| 319 | the timeout, kills it forcefully. |
| 320 | """ |
| 321 | try: |
| 322 | os.kill(pid, signal.SIGTERM) |
| 323 | if not self._wait_for_process_terminate(pid, timeout): |
| 324 | logging.warning( |
| 325 | "Process %d not terminated within timeout, try force kill", pid |
| 326 | ) |
| 327 | os.kill(pid, signal.SIGKILL) |
| 328 | except ProcessLookupError: |
| 329 | logging.info("Process with PID %d not found (already terminated)", pid) |
| 330 | |
| 331 | def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool: |
| 332 | start_time = time.time() |
| 333 | |
| 334 | while time.time() < start_time + timeout: |
| 335 | if not self._is_process_alive(pid): |
| 336 | return True |
| 337 | time.sleep(1) |
| 338 | |
| 339 | logging.error("Process %d not terminated within %d seconds.", pid, timeout) |
| 340 | return False |
| 341 | |
| 342 | def _is_process_alive(self, pid: int) -> bool: |
| 343 | try: |
| 344 | output = subprocess.check_output( |
| 345 | ["ps", "-p", str(pid), "-o", "state="], text=True |
| 346 | ).strip() |
| 347 | state = output.split()[0] |
| 348 | return state != "Z" # Check if the state is not 'Z' (zombie) |
| 349 | except subprocess.CalledProcessError: |
| 350 | # Process not found (already dead). |
| 351 | return False |
| 352 | except (FileNotFoundError, OSError, ValueError) as e: |
| 353 | logging.warning( |
| 354 | "Unable to check the status for process %d with error: %s.", pid, e |
| 355 | ) |
| 356 | return True |
| 357 | |
Zhuoyao Zhang | a710791 | 2024-12-04 17:39:54 +0000 | [diff] [blame] | 358 | def _remove_pidfile(self, expected_pid: int): |
| 359 | recorded_pid = self._read_pid_from_pidfile() |
| 360 | |
| 361 | if recorded_pid is None: |
| 362 | logging.info("pid file %s already removed.", self.pid_file_path) |
| 363 | return |
| 364 | |
| 365 | if recorded_pid != expected_pid: |
| 366 | logging.warning( |
| 367 | "pid file contains pid from a different process, expected pid: %d," |
| 368 | " actual pid: %d.", |
| 369 | expected_pid, |
| 370 | recorded_pid, |
| 371 | ) |
| 372 | return |
| 373 | |
| 374 | logging.debug("removing pidfile written by process %s", expected_pid) |
Zhuoyao Zhang | 5335955 | 2024-09-16 23:58:11 +0000 | [diff] [blame] | 375 | try: |
| 376 | os.remove(self.pid_file_path) |
| 377 | except FileNotFoundError: |
| 378 | logging.info("pid file %s already removed.", self.pid_file_path) |
| 379 | |
| 380 | def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path: |
| 381 | """Generates the path to store the pidfile. |
| 382 | |
| 383 | The file path should have the format of "/tmp/edit_monitor/xxxx.lock" |
| 384 | where xxxx is a hashed value based on the binary path that starts the |
| 385 | process. |
| 386 | """ |
| 387 | hash_object = hashlib.sha256() |
| 388 | hash_object.update(self.binary_path.encode("utf-8")) |
| 389 | pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock") |
| 390 | logging.info("pid_file_path: %s", pid_file_path) |
| 391 | |
| 392 | return pid_file_path |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 393 | |
| 394 | def _get_process_memory_percent(self, pid: int) -> float: |
Zhuoyao Zhang | 5d16222 | 2024-10-24 23:10:04 +0000 | [diff] [blame] | 395 | with open(f"/proc/{pid}/stat", "r") as f: |
| 396 | stat_data = f.readline().split() |
| 397 | # RSS is the 24th field in /proc/[pid]/stat |
| 398 | rss_pages = int(stat_data[23]) |
Zhuoyao Zhang | 6988272 | 2024-11-15 18:32:18 +0000 | [diff] [blame] | 399 | process_memory = rss_pages * 4 * 1024 # Convert to bytes |
| 400 | |
| 401 | return ( |
| 402 | process_memory / self.total_memory_size |
| 403 | if self.total_memory_size |
| 404 | else 0.0 |
| 405 | ) |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 406 | |
| 407 | def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float: |
Zhuoyao Zhang | 5d16222 | 2024-10-24 23:10:04 +0000 | [diff] [blame] | 408 | total_start_time = self._get_total_cpu_time(pid) |
| 409 | with open("/proc/uptime", "r") as f: |
| 410 | uptime_start = float(f.readline().split()[0]) |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 411 | |
Zhuoyao Zhang | 5d16222 | 2024-10-24 23:10:04 +0000 | [diff] [blame] | 412 | time.sleep(interval) |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 413 | |
Zhuoyao Zhang | 5d16222 | 2024-10-24 23:10:04 +0000 | [diff] [blame] | 414 | total_end_time = self._get_total_cpu_time(pid) |
| 415 | with open("/proc/uptime", "r") as f: |
| 416 | uptime_end = float(f.readline().split()[0]) |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 417 | |
Zhuoyao Zhang | 5d16222 | 2024-10-24 23:10:04 +0000 | [diff] [blame] | 418 | return ( |
Zhuoyao Zhang | a710791 | 2024-12-04 17:39:54 +0000 | [diff] [blame] | 419 | (total_end_time - total_start_time) / (uptime_end - uptime_start) * 100 |
Zhuoyao Zhang | 5d16222 | 2024-10-24 23:10:04 +0000 | [diff] [blame] | 420 | ) |
Zhuoyao Zhang | dc2840d | 2024-09-19 23:29:27 +0000 | [diff] [blame] | 421 | |
| 422 | def _get_total_cpu_time(self, pid: int) -> float: |
| 423 | with open(f"/proc/{str(pid)}/stat", "r") as f: |
| 424 | stats = f.readline().split() |
| 425 | # utime is the 14th field in /proc/[pid]/stat measured in clock ticks. |
| 426 | utime = int(stats[13]) |
| 427 | # stime is the 15th field in /proc/[pid]/stat measured in clock ticks. |
| 428 | stime = int(stats[14]) |
| 429 | return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"]) |
| 430 | |
Zhuoyao Zhang | d28da5c | 2024-09-24 19:46:12 +0000 | [diff] [blame] | 431 | def _find_all_instances_pids(self) -> list[int]: |
| 432 | pids = [] |
| 433 | |
Zhuoyao Zhang | 130da8e | 2024-12-05 23:16:43 +0000 | [diff] [blame] | 434 | try: |
Zhuoyao Zhang | ec657ce | 2024-12-06 19:17:46 +0000 | [diff] [blame] | 435 | output = subprocess.check_output(["ps", "-ef", "--no-headers"], text=True) |
Zhuoyao Zhang | 130da8e | 2024-12-05 23:16:43 +0000 | [diff] [blame] | 436 | for line in output.splitlines(): |
Zhuoyao Zhang | ec657ce | 2024-12-06 19:17:46 +0000 | [diff] [blame] | 437 | parts = line.split() |
| 438 | process_path = parts[7] |
| 439 | if pathlib.Path(process_path).name == "edit_monitor": |
| 440 | pid = int(parts[1]) |
| 441 | if pid != self.pid: # exclude the current process |
| 442 | pids.append(pid) |
Zhuoyao Zhang | 130da8e | 2024-12-05 23:16:43 +0000 | [diff] [blame] | 443 | except Exception: |
| 444 | logging.exception( |
Zhuoyao Zhang | ec657ce | 2024-12-06 19:17:46 +0000 | [diff] [blame] | 445 | "Failed to get pids of existing edit monitors from ps command." |
| 446 | ) |
Zhuoyao Zhang | d28da5c | 2024-09-24 19:46:12 +0000 | [diff] [blame] | 447 | |
Zhuoyao Zhang | ba64f31 | 2024-10-14 20:32:53 +0000 | [diff] [blame] | 448 | return pids |
| 449 | |
| 450 | def _send_error_event_to_clearcut(self, error_type): |
| 451 | edit_monitor_error_event_proto = edit_event_pb2.EditEvent( |
| 452 | user_name=self.user_name, |
| 453 | host_name=self.host_name, |
| 454 | source_root=self.source_root, |
| 455 | ) |
| 456 | edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom( |
| 457 | edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type) |
| 458 | ) |
| 459 | log_event = clientanalytics_pb2.LogEvent( |
| 460 | event_time_ms=int(time.time() * 1000), |
| 461 | source_extension=edit_monitor_error_event_proto.SerializeToString(), |
| 462 | ) |
| 463 | self.cclient.log(log_event) |