blob: 36a7593e3adb94df823177b2b12b32d705a739d9 [file] [log] [blame]
Zhuoyao Zhang53359552024-09-16 23:58:11 +00001# Copyright 2024, The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000016import getpass
Zhuoyao Zhang53359552024-09-16 23:58:11 +000017import hashlib
18import logging
19import multiprocessing
20import os
21import pathlib
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000022import platform
Zhuoyao Zhang53359552024-09-16 23:58:11 +000023import signal
24import subprocess
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000025import sys
Zhuoyao Zhang53359552024-09-16 23:58:11 +000026import tempfile
27import time
28
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000029from atest.metrics import clearcut_client
30from atest.proto import clientanalytics_pb2
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000031from edit_monitor import utils
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000032from proto import edit_event_pb2
Zhuoyao Zhang53359552024-09-16 23:58:11 +000033
Zhuoyao Zhang5d162222024-10-24 23:10:04 +000034DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 5
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +000035DEFAULT_MONITOR_INTERVAL_SECONDS = 5
Zhuoyao Zhang69882722024-11-15 18:32:18 +000036DEFAULT_MEMORY_USAGE_THRESHOLD = 0.02 # 2% of total memory
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000037DEFAULT_CPU_USAGE_THRESHOLD = 200
38DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +000039BLOCK_SIGN_FILE = "edit_monitor_block_sign"
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000040# Enum of the Clearcut log source defined under
41# /google3/wireless/android/play/playlog/proto/log_source_enum.proto
42LOG_SOURCE = 2524
Zhuoyao Zhang53359552024-09-16 23:58:11 +000043
44
45def default_daemon_target():
46 """Place holder for the default daemon target."""
47 print("default daemon target")
48
49
50class DaemonManager:
51 """Class to manage and monitor the daemon run as a subprocess."""
52
53 def __init__(
54 self,
55 binary_path: str,
56 daemon_target: callable = default_daemon_target,
57 daemon_args: tuple = (),
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000058 cclient: clearcut_client.Clearcut | None = None,
Zhuoyao Zhang53359552024-09-16 23:58:11 +000059 ):
60 self.binary_path = binary_path
61 self.daemon_target = daemon_target
62 self.daemon_args = daemon_args
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000063 self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE)
Zhuoyao Zhang53359552024-09-16 23:58:11 +000064
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000065 self.user_name = getpass.getuser()
66 self.host_name = platform.node()
67 self.source_root = os.environ.get("ANDROID_BUILD_TOP", "")
Zhuoyao Zhang53359552024-09-16 23:58:11 +000068 self.pid = os.getpid()
69 self.daemon_process = None
70
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +000071 self.max_memory_usage = 0
72 self.max_cpu_usage = 0
Zhuoyao Zhang69882722024-11-15 18:32:18 +000073 self.total_memory_size = os.sysconf("SC_PAGE_SIZE") * os.sysconf(
74 "SC_PHYS_PAGES"
75 )
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +000076
Zhuoyao Zhang53359552024-09-16 23:58:11 +000077 pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
78 pid_file_dir.mkdir(parents=True, exist_ok=True)
79 self.pid_file_path = self._get_pid_file_path(pid_file_dir)
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +000080 self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath(
81 BLOCK_SIGN_FILE
82 )
Zhuoyao Zhang53359552024-09-16 23:58:11 +000083
84 def start(self):
85 """Writes the pidfile and starts the daemon proces."""
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000086 if not utils.is_feature_enabled(
87 "edit_monitor",
88 self.user_name,
Zhuoyao Zhangd1c4a8b2024-11-06 21:48:45 +000089 "ENABLE_ANDROID_EDIT_MONITOR",
Zhuoyao Zhangbedd13b2024-11-21 19:42:48 +000090 100,
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000091 ):
92 logging.warning("Edit monitor is disabled, exiting...")
93 return
94
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +000095 if self.block_sign.exists():
96 logging.warning("Block sign found, exiting...")
97 return
98
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000099 if self.binary_path.startswith("/google/cog/"):
Zhuoyao Zhang05e28fa2024-10-04 21:58:39 +0000100 logging.warning("Edit monitor for cog is not supported, exiting...")
101 return
102
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000103 try:
104 self._stop_any_existing_instance()
105 self._write_pid_to_pidfile()
106 self._start_daemon_process()
107 except Exception as e:
108 logging.exception("Failed to start daemon manager with error %s", e)
109 self._send_error_event_to_clearcut(
110 edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
111 )
112 raise e
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000113
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000114 def monitor_daemon(
115 self,
116 interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
117 memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
118 cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000119 reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000120 ):
121 """Monits the daemon process status.
122
123 Periodically check the CPU/Memory usage of the daemon process as long as the
124 process is still running and kill the process if the resource usage is above
125 given thresholds.
126 """
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000127 if not self.daemon_process:
128 return
129
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000130 logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000131 reboot_time = time.time() + reboot_timeout
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000132 while self.daemon_process.is_alive():
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000133 if time.time() > reboot_time:
134 self.reboot()
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000135 try:
136 memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
137 self.max_memory_usage = max(self.max_memory_usage, memory_usage)
138
139 cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
140 self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
141
142 time.sleep(interval)
143 except Exception as e:
144 # Logging the error and continue.
145 logging.warning("Failed to monitor daemon process with error: %s", e)
146
Zhuoyao Zhang585b4342024-11-12 22:46:43 +0000147 if self.max_memory_usage >= memory_threshold:
Zhuoyao Zhang78fd0762024-11-18 22:15:42 +0000148 self._send_error_event_to_clearcut(
149 edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE
150 )
151 logging.error(
Zhuoyao Zhanga7107912024-12-04 17:39:54 +0000152 "Daemon process is consuming too much memory, rebooting..."
153 )
Zhuoyao Zhang78fd0762024-11-18 22:15:42 +0000154 self.reboot()
Zhuoyao Zhang585b4342024-11-12 22:46:43 +0000155
156 if self.max_cpu_usage >= cpu_threshold:
Zhuoyao Zhang78fd0762024-11-18 22:15:42 +0000157 self._send_error_event_to_clearcut(
158 edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE
159 )
Zhuoyao Zhanga7107912024-12-04 17:39:54 +0000160 logging.error("Daemon process is consuming too much cpu, killing...")
Zhuoyao Zhang78fd0762024-11-18 22:15:42 +0000161 self._terminate_process(self.daemon_process.pid)
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000162
163 logging.info(
164 "Daemon process %d terminated. Max memory usage: %f, Max cpu"
165 " usage: %f.",
166 self.daemon_process.pid,
167 self.max_memory_usage,
168 self.max_cpu_usage,
169 )
170
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000171 def stop(self):
172 """Stops the daemon process and removes the pidfile."""
173
Zhuoyao Zhang64ad75f2024-10-25 00:21:45 +0000174 logging.info("in daemon manager cleanup.")
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000175 try:
Zhuoyao Zhang8a225792024-10-09 18:04:39 +0000176 if self.daemon_process:
177 # The daemon process might already in termination process,
178 # wait some time before kill it explicitly.
179 self._wait_for_process_terminate(self.daemon_process.pid, 1)
180 if self.daemon_process.is_alive():
181 self._terminate_process(self.daemon_process.pid)
Zhuoyao Zhanga7107912024-12-04 17:39:54 +0000182 self._remove_pidfile(self.pid)
Zhuoyao Zhang64ad75f2024-10-25 00:21:45 +0000183 logging.info("Successfully stopped daemon manager.")
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000184 except Exception as e:
185 logging.exception("Failed to stop daemon manager with error %s", e)
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000186 self._send_error_event_to_clearcut(
187 edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
188 )
189 sys.exit(1)
190 finally:
191 self.cclient.flush_events()
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000192
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000193 def reboot(self):
194 """Reboots the current process.
195
196 Stops the current daemon manager and reboots the entire process based on
197 the binary file. Exits directly If the binary file no longer exists.
198 """
Zhuoyao Zhang64ad75f2024-10-25 00:21:45 +0000199 logging.info("Rebooting process based on binary %s.", self.binary_path)
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000200
201 # Stop the current daemon manager first.
202 self.stop()
203
204 # If the binary no longer exists, exit directly.
205 if not os.path.exists(self.binary_path):
206 logging.info("binary %s no longer exists, exiting.", self.binary_path)
207 sys.exit(0)
208
209 try:
210 os.execv(self.binary_path, sys.argv)
211 except OSError as e:
212 logging.exception("Failed to reboot process with error: %s.", e)
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000213 self._send_error_event_to_clearcut(
214 edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
215 )
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000216 sys.exit(1) # Indicate an error occurred
217
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000218 def cleanup(self):
219 """Wipes out all edit monitor instances in the system.
220
221 Stops all the existing edit monitor instances and place a block sign
222 to prevent any edit monitor process to start. This method is only used
223 in emergency case when there's something goes wrong with the edit monitor
224 that requires immediate cleanup to prevent damanger to the system.
225 """
226 logging.debug("Start cleaning up all existing instances.")
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000227 self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP)
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000228
229 try:
230 # First places a block sign to prevent any edit monitor process to start.
231 self.block_sign.touch()
232 except (FileNotFoundError, PermissionError, OSError):
233 logging.exception("Failed to place the block sign")
234
235 # Finds and kills all the existing instances of edit monitor.
236 existing_instances_pids = self._find_all_instances_pids()
237 for pid in existing_instances_pids:
238 logging.info(
239 "Found existing edit monitor instance with pid %d, killing...", pid
240 )
241 try:
242 self._terminate_process(pid)
243 except Exception:
244 logging.exception("Failed to terminate process %d", pid)
245
Zhuoyao Zhang4d485592024-09-17 21:14:38 +0000246 def _stop_any_existing_instance(self):
247 if not self.pid_file_path.exists():
248 logging.debug("No existing instances.")
249 return
250
251 ex_pid = self._read_pid_from_pidfile()
252
253 if ex_pid:
254 logging.info("Found another instance with pid %d.", ex_pid)
255 self._terminate_process(ex_pid)
Zhuoyao Zhanga7107912024-12-04 17:39:54 +0000256 self._remove_pidfile(ex_pid)
Zhuoyao Zhang4d485592024-09-17 21:14:38 +0000257
Zhuoyao Zhanga7107912024-12-04 17:39:54 +0000258 def _read_pid_from_pidfile(self) -> int | None:
259 try:
260 with open(self.pid_file_path, "r") as f:
261 return int(f.read().strip())
262 except FileNotFoundError as e:
263 logging.warning("pidfile %s does not exist.", self.pid_file_path)
264 return None
Zhuoyao Zhang4d485592024-09-17 21:14:38 +0000265
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000266 def _write_pid_to_pidfile(self):
267 """Creates a pidfile and writes the current pid to the file.
268
269 Raise FileExistsError if the pidfile already exists.
270 """
271 try:
272 # Use the 'x' mode to open the file for exclusive creation
273 with open(self.pid_file_path, "x") as f:
274 f.write(f"{self.pid}")
275 except FileExistsError as e:
276 # This could be caused due to race condition that a user is trying
277 # to start two edit monitors at the same time. Or because there is
278 # already an existing edit monitor running and we can not kill it
279 # for some reason.
280 logging.exception("pidfile %s already exists.", self.pid_file_path)
281 raise e
282
283 def _start_daemon_process(self):
284 """Starts a subprocess to run the daemon."""
285 p = multiprocessing.Process(
286 target=self.daemon_target, args=self.daemon_args
287 )
Zhuoyao Zhang8a225792024-10-09 18:04:39 +0000288 p.daemon = True
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000289 p.start()
290
291 logging.info("Start subprocess with PID %d", p.pid)
292 self.daemon_process = p
293
294 def _terminate_process(
295 self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
296 ):
297 """Terminates a process with given pid.
298
299 It first sends a SIGTERM to the process to allow it for proper
300 termination with a timeout. If the process is not terminated within
301 the timeout, kills it forcefully.
302 """
303 try:
304 os.kill(pid, signal.SIGTERM)
305 if not self._wait_for_process_terminate(pid, timeout):
306 logging.warning(
307 "Process %d not terminated within timeout, try force kill", pid
308 )
309 os.kill(pid, signal.SIGKILL)
310 except ProcessLookupError:
311 logging.info("Process with PID %d not found (already terminated)", pid)
312
313 def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
314 start_time = time.time()
315
316 while time.time() < start_time + timeout:
317 if not self._is_process_alive(pid):
318 return True
319 time.sleep(1)
320
321 logging.error("Process %d not terminated within %d seconds.", pid, timeout)
322 return False
323
324 def _is_process_alive(self, pid: int) -> bool:
325 try:
326 output = subprocess.check_output(
327 ["ps", "-p", str(pid), "-o", "state="], text=True
328 ).strip()
329 state = output.split()[0]
330 return state != "Z" # Check if the state is not 'Z' (zombie)
331 except subprocess.CalledProcessError:
332 # Process not found (already dead).
333 return False
334 except (FileNotFoundError, OSError, ValueError) as e:
335 logging.warning(
336 "Unable to check the status for process %d with error: %s.", pid, e
337 )
338 return True
339
Zhuoyao Zhanga7107912024-12-04 17:39:54 +0000340 def _remove_pidfile(self, expected_pid: int):
341 recorded_pid = self._read_pid_from_pidfile()
342
343 if recorded_pid is None:
344 logging.info("pid file %s already removed.", self.pid_file_path)
345 return
346
347 if recorded_pid != expected_pid:
348 logging.warning(
349 "pid file contains pid from a different process, expected pid: %d,"
350 " actual pid: %d.",
351 expected_pid,
352 recorded_pid,
353 )
354 return
355
356 logging.debug("removing pidfile written by process %s", expected_pid)
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000357 try:
358 os.remove(self.pid_file_path)
359 except FileNotFoundError:
360 logging.info("pid file %s already removed.", self.pid_file_path)
361
362 def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
363 """Generates the path to store the pidfile.
364
365 The file path should have the format of "/tmp/edit_monitor/xxxx.lock"
366 where xxxx is a hashed value based on the binary path that starts the
367 process.
368 """
369 hash_object = hashlib.sha256()
370 hash_object.update(self.binary_path.encode("utf-8"))
371 pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock")
372 logging.info("pid_file_path: %s", pid_file_path)
373
374 return pid_file_path
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000375
376 def _get_process_memory_percent(self, pid: int) -> float:
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000377 with open(f"/proc/{pid}/stat", "r") as f:
378 stat_data = f.readline().split()
379 # RSS is the 24th field in /proc/[pid]/stat
380 rss_pages = int(stat_data[23])
Zhuoyao Zhang69882722024-11-15 18:32:18 +0000381 process_memory = rss_pages * 4 * 1024 # Convert to bytes
382
383 return (
384 process_memory / self.total_memory_size
385 if self.total_memory_size
386 else 0.0
387 )
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000388
389 def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000390 total_start_time = self._get_total_cpu_time(pid)
391 with open("/proc/uptime", "r") as f:
392 uptime_start = float(f.readline().split()[0])
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000393
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000394 time.sleep(interval)
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000395
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000396 total_end_time = self._get_total_cpu_time(pid)
397 with open("/proc/uptime", "r") as f:
398 uptime_end = float(f.readline().split()[0])
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000399
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000400 return (
Zhuoyao Zhanga7107912024-12-04 17:39:54 +0000401 (total_end_time - total_start_time) / (uptime_end - uptime_start) * 100
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000402 )
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000403
404 def _get_total_cpu_time(self, pid: int) -> float:
405 with open(f"/proc/{str(pid)}/stat", "r") as f:
406 stats = f.readline().split()
407 # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
408 utime = int(stats[13])
409 # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
410 stime = int(stats[14])
411 return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"])
412
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000413 def _find_all_instances_pids(self) -> list[int]:
414 pids = []
415
Zhuoyao Zhang130da8e2024-12-05 23:16:43 +0000416 try:
417 output = subprocess.check_output(
418 ["ps", "-ef", "--no-headers"], text=True)
419 for line in output.splitlines():
420 parts = line.split()
421 process_path = parts[7]
422 if pathlib.Path(process_path).name == 'edit_monitor':
423 pid = int(parts[1])
424 if pid != self.pid: # exclude the current process
425 pids.append(pid)
426 except Exception:
427 logging.exception(
428 "Failed to get pids of existing edit monitors from ps command.")
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000429
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000430 return pids
431
432 def _send_error_event_to_clearcut(self, error_type):
433 edit_monitor_error_event_proto = edit_event_pb2.EditEvent(
434 user_name=self.user_name,
435 host_name=self.host_name,
436 source_root=self.source_root,
437 )
438 edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom(
439 edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type)
440 )
441 log_event = clientanalytics_pb2.LogEvent(
442 event_time_ms=int(time.time() * 1000),
443 source_extension=edit_monitor_error_event_proto.SerializeToString(),
444 )
445 self.cclient.log(log_event)