blob: 22782f72c921f6202d4cac2ecf11de8a3f6064cc [file] [log] [blame]
Zhuoyao Zhang53359552024-09-16 23:58:11 +00001# Copyright 2024, The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000016import getpass
Zhuoyao Zhang53359552024-09-16 23:58:11 +000017import hashlib
18import logging
19import multiprocessing
20import os
21import pathlib
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000022import platform
Zhuoyao Zhang53359552024-09-16 23:58:11 +000023import signal
24import subprocess
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000025import sys
Zhuoyao Zhang53359552024-09-16 23:58:11 +000026import tempfile
27import time
28
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000029from atest.metrics import clearcut_client
30from atest.proto import clientanalytics_pb2
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000031from edit_monitor import utils
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000032from proto import edit_event_pb2
Zhuoyao Zhang53359552024-09-16 23:58:11 +000033
# --- Timing knobs (all in seconds) -----------------------------------------
# Grace period between SIGTERM and SIGKILL when terminating a process.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 5
# Pause between two resource-usage samples of the daemon process.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# The whole edit monitor process re-executes itself after this long (24h).
DEFAULT_REBOOT_TIMEOUT_SECONDS = 24 * 60 * 60

# --- Resource limits for the daemon process --------------------------------
DEFAULT_MEMORY_USAGE_THRESHOLD = 0.02  # 2% of total memory
# CPU usage is expressed in percent of a single core, so 200 == two cores.
DEFAULT_CPU_USAGE_THRESHOLD = 200

# Name of the marker file (created in the temp dir) whose presence prevents
# any edit monitor from starting.
BLOCK_SIGN_FILE = "edit_monitor_block_sign"

# Enum of the Clearcut log source defined under
# /google3/wireless/android/play/playlog/proto/log_source_enum.proto
LOG_SOURCE = 2524
Zhuoyao Zhang53359552024-09-16 23:58:11 +000043
44
def default_daemon_target():
  """Stand-in daemon body used when no real target is supplied.

  It only announces itself on stdout and returns immediately.
  """
  print("default daemon target")
48
49
class DaemonManager:
  """Class to manage and monitor the daemon run as a subprocess.

  The manager owns a pidfile (one per binary path) so that only one instance
  per binary runs at a time, starts the daemon target in a child process,
  samples the child's memory/CPU usage and reports failures to Clearcut.
  """

  def __init__(
      self,
      binary_path: str,
      daemon_target: callable = default_daemon_target,
      daemon_args: tuple = (),
      cclient: clearcut_client.Clearcut | None = None,
  ):
    """Initializes the manager; does not start anything yet.

    Args:
      binary_path: Path of the binary that launched this process. It is
        hashed to name the pidfile and re-executed by `reboot()`.
      daemon_target: Callable run inside the daemon subprocess.
      daemon_args: Positional arguments passed to `daemon_target`.
      cclient: Clearcut client used for error reporting. A client for
        `LOG_SOURCE` is created when omitted.
    """
    self.binary_path = binary_path
    self.daemon_target = daemon_target
    self.daemon_args = daemon_args
    self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE)

    self.user_name = getpass.getuser()
    self.host_name = platform.node()
    self.source_root = os.environ.get("ANDROID_BUILD_TOP", "")
    self.pid = os.getpid()
    self.daemon_process = None

    # Peak values observed by monitor_daemon().
    self.max_memory_usage = 0
    self.max_cpu_usage = 0
    # Physical memory of the host in bytes (page size * number of pages).
    self.total_memory_size = os.sysconf("SC_PAGE_SIZE") * os.sysconf(
        "SC_PHYS_PAGES"
    )

    pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
    pid_file_dir.mkdir(parents=True, exist_ok=True)
    self.pid_file_path = self._get_pid_file_path(pid_file_dir)
    self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath(
        BLOCK_SIGN_FILE
    )

  def start(self):
    """Writes the pidfile and starts the daemon process.

    Returns without doing anything when the feature is disabled for the
    user, when the block sign file exists, or when the binary lives under
    /google/cog/.

    Raises:
      Exception: Whatever stopping a previous instance, writing the pidfile
        or starting the subprocess raised; an error event is sent to
        Clearcut before the exception is re-raised.
    """
    # NOTE(review): the trailing 50 is presumably a rollout percentage;
    # confirm against utils.is_feature_enabled.
    if not utils.is_feature_enabled(
        "edit_monitor",
        self.user_name,
        "ENABLE_ANDROID_EDIT_MONITOR",
        50,
    ):
      logging.warning("Edit monitor is disabled, exiting...")
      return

    if self.block_sign.exists():
      logging.warning("Block sign found, exiting...")
      return

    if self.binary_path.startswith("/google/cog/"):
      logging.warning("Edit monitor for cog is not supported, exiting...")
      return

    try:
      self._stop_any_existing_instance()
      self._write_pid_to_pidfile()
      self._start_daemon_process()
    except Exception as e:
      logging.exception("Failed to start daemon manager with error %s", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
      )
      # Bare raise keeps the original traceback intact.
      raise

  def monitor_daemon(
      self,
      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
      reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
  ):
    """Monitors the daemon process status.

    Periodically check the CPU/Memory usage of the daemon process as long as
    the process is still running. When the peak memory usage reaches the
    threshold the whole manager is rebooted; when the peak CPU usage reaches
    the threshold the daemon process is killed.

    Args:
      interval: Seconds to sleep between two samples.
      memory_threshold: Fraction of total memory (0.0-1.0) that triggers a
        reboot.
      cpu_threshold: CPU usage in percent of one core that triggers a kill.
      reboot_timeout: Seconds after which the manager reboots itself
        regardless of resource usage.
    """
    if not self.daemon_process:
      return

    logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
    reboot_time = time.time() + reboot_timeout
    while self.daemon_process.is_alive():
      if time.time() > reboot_time:
        self.reboot()
      try:
        memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
        self.max_memory_usage = max(self.max_memory_usage, memory_usage)

        cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)

        time.sleep(interval)
      except Exception as e:
        # Logging the error and continue.
        logging.warning("Failed to monitor daemon process with error: %s", e)
        # The regular sleep above was skipped. Back off while the daemon is
        # still running so that a persistent sampling failure (e.g. /proc
        # being unavailable) cannot turn this loop into a busy spin.
        if self.daemon_process.is_alive():
          time.sleep(max(interval, 0))

      if self.max_memory_usage >= memory_threshold:
        self._send_error_event_to_clearcut(
            edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE
        )
        logging.error(
            "Daemon process is consuming too much memory, rebooting..."
        )
        self.reboot()

      if self.max_cpu_usage >= cpu_threshold:
        self._send_error_event_to_clearcut(
            edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE
        )
        logging.error("Daemon process is consuming too much cpu, killing...")
        self._terminate_process(self.daemon_process.pid)

    logging.info(
        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
        " usage: %f.",
        self.daemon_process.pid,
        self.max_memory_usage,
        self.max_cpu_usage,
    )

  def stop(self):
    """Stops the daemon process and removes the pidfile.

    On failure an error event is sent to Clearcut and the process exits with
    status 1. Pending Clearcut events are flushed in every case.
    """
    logging.info("in daemon manager cleanup.")
    try:
      if self.daemon_process:
        # The daemon process might already in termination process,
        # wait some time before kill it explicitly.
        self._wait_for_process_terminate(self.daemon_process.pid, 1)
        if self.daemon_process.is_alive():
          self._terminate_process(self.daemon_process.pid)
      self._remove_pidfile()
      logging.info("Successfully stopped daemon manager.")
    except Exception as e:
      logging.exception("Failed to stop daemon manager with error %s", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
      )
      sys.exit(1)
    finally:
      self.cclient.flush_events()

  def reboot(self):
    """Reboots the current process.

    Stops the current daemon manager and reboots the entire process based on
    the binary file. Exits directly if the binary file no longer exists.
    This method does not return: it either replaces the process image via
    `os.execv` or calls `sys.exit`.
    """
    logging.info("Rebooting process based on binary %s.", self.binary_path)

    # Stop the current daemon manager first.
    self.stop()

    # If the binary no longer exists, exit directly.
    if not os.path.exists(self.binary_path):
      logging.info("binary %s no longer exists, exiting.", self.binary_path)
      sys.exit(0)

    try:
      os.execv(self.binary_path, sys.argv)
    except OSError as e:
      logging.exception("Failed to reboot process with error: %s.", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
      )
      sys.exit(1)  # Indicate an error occurred

  def cleanup(self):
    """Wipes out all edit monitor instances in the system.

    Stops all the existing edit monitor instances and place a block sign
    to prevent any edit monitor process to start. This method is only used
    in emergency case when there's something goes wrong with the edit monitor
    that requires immediate cleanup to prevent damage to the system.
    """
    logging.debug("Start cleaning up all existing instances.")
    self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP)

    try:
      # First places a block sign to prevent any edit monitor process to start.
      self.block_sign.touch()
    except OSError:
      # FileNotFoundError and PermissionError are OSError subclasses.
      logging.exception("Failed to place the block sign")

    # Finds and kills all the existing instances of edit monitor.
    existing_instances_pids = self._find_all_instances_pids()
    for pid in existing_instances_pids:
      logging.info(
          "Found existing edit monitor instance with pid %d, killing...", pid
      )
      try:
        self._terminate_process(pid)
      except Exception:
        # Best effort: keep going so the remaining instances still get killed.
        logging.exception("Failed to terminate process %d", pid)

  def _stop_any_existing_instance(self):
    """Terminates the instance recorded in the pidfile, if there is one.

    The pidfile is removed afterwards so that this instance can create its
    own. Errors from reading the pidfile propagate to the caller.
    """
    if not self.pid_file_path.exists():
      logging.debug("No existing instances.")
      return

    ex_pid = self._read_pid_from_pidfile()

    if ex_pid:
      logging.info("Found another instance with pid %d.", ex_pid)
      self._terminate_process(ex_pid)
      self._remove_pidfile()

  def _read_pid_from_pidfile(self):
    """Returns the pid stored in the pidfile as an int.

    Raises:
      OSError: If the pidfile cannot be opened.
      ValueError: If the content is not an integer.
    """
    with open(self.pid_file_path, "r") as f:
      return int(f.read().strip())

  def _write_pid_to_pidfile(self):
    """Creates a pidfile and writes the current pid to the file.

    Raise FileExistsError if the pidfile already exists.
    """
    try:
      # Use the 'x' mode to open the file for exclusive creation
      with open(self.pid_file_path, "x") as f:
        f.write(f"{self.pid}")
    except FileExistsError:
      # This could be caused due to race condition that a user is trying
      # to start two edit monitors at the same time. Or because there is
      # already an existing edit monitor running and we can not kill it
      # for some reason.
      logging.exception("pidfile %s already exists.", self.pid_file_path)
      raise

  def _start_daemon_process(self):
    """Starts a subprocess to run the daemon."""
    p = multiprocessing.Process(
        target=self.daemon_target, args=self.daemon_args
    )
    # Daemonic children are terminated when this (parent) process exits.
    p.daemon = True
    p.start()

    logging.info("Start subprocess with PID %d", p.pid)
    self.daemon_process = p

  def _terminate_process(
      self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
  ):
    """Terminates a process with given pid.

    It first sends a SIGTERM to the process to allow it for proper
    termination with a timeout. If the process is not terminated within
    the timeout, kills it forcefully.

    Args:
      pid: Id of the process to terminate; must be positive.
      timeout: Seconds to wait after SIGTERM before sending SIGKILL.
    """
    if pid <= 0:
      # kill() treats pid 0 as "my own process group" and negative pids as
      # process groups / every signalable process. Pids may come from
      # pidfiles on disk, so never forward such a value to os.kill.
      logging.warning("Refusing to terminate invalid pid %d", pid)
      return

    try:
      os.kill(pid, signal.SIGTERM)
      if not self._wait_for_process_terminate(pid, timeout):
        logging.warning(
            "Process %d not terminated within timeout, try force kill", pid
        )
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
      logging.info("Process with PID %d not found (already terminated)", pid)

  def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
    """Polls once per second until the process is gone or timeout expires.

    Returns:
      True if the process terminated within `timeout` seconds, else False.
    """
    start_time = time.time()

    while time.time() < start_time + timeout:
      if not self._is_process_alive(pid):
        return True
      time.sleep(1)

    logging.error("Process %d not terminated within %d seconds.", pid, timeout)
    return False

  def _is_process_alive(self, pid: int) -> bool:
    """Checks via `ps` whether the process exists and is not a zombie.

    Returns:
      False when `ps` reports no such process or the state is 'Z'. True
      otherwise, including when the status cannot be determined.
    """
    try:
      output = subprocess.check_output(
          ["ps", "-p", str(pid), "-o", "state="], text=True
      ).strip()
      state = output.split()[0]
      return state != "Z"  # Check if the state is not 'Z' (zombie)
    except subprocess.CalledProcessError:
      # Process not found (already dead).
      return False
    except (OSError, ValueError, IndexError) as e:
      # OSError covers a missing `ps` binary; IndexError covers empty output.
      logging.warning(
          "Unable to check the status for process %d with error: %s.", pid, e
      )
      return True

  def _remove_pidfile(self):
    """Deletes the pidfile; a missing file is only logged."""
    try:
      os.remove(self.pid_file_path)
    except FileNotFoundError:
      logging.info("pid file %s already removed.", self.pid_file_path)

  def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
    """Generates the path to store the pidfile.

    The file path should have the format of "/tmp/edit_monitor/xxxx.lock"
    where xxxx is a hashed value based on the binary path that starts the
    process.
    """
    hash_object = hashlib.sha256()
    hash_object.update(self.binary_path.encode("utf-8"))
    pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock")
    logging.info("pid_file_path: %s", pid_file_path)

    return pid_file_path

  @staticmethod
  def _read_proc_stat(pid: int) -> list[str]:
    """Returns the fields of /proc/<pid>/stat, indexed as (field number - 1).

    The second field (comm) is wrapped in parentheses and may itself contain
    spaces or parentheses, so a plain whitespace split can shift every later
    field. Splitting around the last ')' keeps the indices stable.
    """
    with open(f"/proc/{pid}/stat", "r") as f:
      raw = f.read()

    head, sep, tail = raw.rpartition(")")
    if not sep:
      # Unexpected layout; fall back to the naive split.
      return raw.split()
    pid_field, _, comm = head.partition("(")
    return [pid_field.strip(), comm, *tail.split()]

  @staticmethod
  def _read_uptime() -> float:
    """Returns the first value of /proc/uptime (system uptime in seconds)."""
    with open("/proc/uptime", "r") as f:
      return float(f.readline().split()[0])

  def _get_process_memory_percent(self, pid: int) -> float:
    """Returns the resident memory of `pid` as a fraction of total memory.

    The result is in the range 0.0-1.0 (not 0-100), matching
    DEFAULT_MEMORY_USAGE_THRESHOLD. Returns 0.0 when the total memory size
    is unknown (zero).
    """
    stat_data = self._read_proc_stat(pid)
    # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
    rss_pages = int(stat_data[23])
    # Use the real page size (not an assumed 4 KiB) so the numerator uses
    # the same unit as total_memory_size.
    process_memory = rss_pages * os.sysconf("SC_PAGE_SIZE")  # bytes

    return (
        process_memory / self.total_memory_size
        if self.total_memory_size
        else 0.0
    )

  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
    """Returns the CPU usage of `pid` over `interval` seconds, in percent.

    100 corresponds to one fully used core, so the value can exceed 100.
    Returns 0.0 when no time elapsed between the two samples.
    """
    total_start_time = self._get_total_cpu_time(pid)
    uptime_start = self._read_uptime()

    time.sleep(interval)

    total_end_time = self._get_total_cpu_time(pid)
    uptime_end = self._read_uptime()

    elapsed = uptime_end - uptime_start
    if elapsed <= 0:
      # Avoid ZeroDivisionError, e.g. when interval is 0.
      return 0.0

    return (total_end_time - total_start_time) / elapsed * 100

  def _get_total_cpu_time(self, pid: int) -> float:
    """Returns user + system CPU time consumed by `pid`, in seconds."""
    stats = self._read_proc_stat(pid)
    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
    utime = int(stats[13])
    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
    stime = int(stats[14])
    return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"])

  def _find_all_instances_pids(self) -> list[int]:
    """Collects the pids stored in every *.lock file of the pidfile dir.

    Files that cannot be read or parsed are logged and skipped.
    """
    pids = []

    for file in os.listdir(self.pid_file_path.parent):
      if file.endswith(".lock"):
        try:
          with open(self.pid_file_path.parent.joinpath(file), "r") as f:
            pids.append(int(f.read().strip()))
        except (OSError, ValueError):
          logging.exception("Failed to get pid from file path: %s", file)

    return pids

  def _send_error_event_to_clearcut(self, error_type):
    """Logs an EditMonitorErrorEvent of `error_type` to Clearcut.

    The event is tagged with the user name, host name and source root and
    handed to the Clearcut client via `log()`.
    """
    edit_monitor_error_event_proto = edit_event_pb2.EditEvent(
        user_name=self.user_name,
        host_name=self.host_name,
        source_root=self.source_root,
    )
    edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom(
        edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type)
    )
    log_event = clientanalytics_pb2.LogEvent(
        event_time_ms=int(time.time() * 1000),
        source_extension=edit_monitor_error_event_proto.SerializeToString(),
    )
    self.cclient.log(log_event)
421 self.cclient.log(log_event)