blob: c0a57abdaa1e5fea144e2c486b484a153ab95aca [file] [log] [blame]
# Copyright 2024, The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
15
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000016import getpass
Zhuoyao Zhang53359552024-09-16 23:58:11 +000017import hashlib
18import logging
19import multiprocessing
20import os
21import pathlib
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000022import platform
Zhuoyao Zhang53359552024-09-16 23:58:11 +000023import signal
24import subprocess
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000025import sys
Zhuoyao Zhang53359552024-09-16 23:58:11 +000026import tempfile
27import time
28
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000029from atest.metrics import clearcut_client
30from atest.proto import clientanalytics_pb2
31from proto import edit_event_pb2
Zhuoyao Zhang53359552024-09-16 23:58:11 +000032
# Grace period (seconds) between SIGTERM and the follow-up SIGKILL.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 5
# Pause (seconds) between two resource usage samples of the daemon.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# Resident memory limit of the daemon, expressed in MB (2GB).
DEFAULT_MEMORY_USAGE_THRESHOLD = 2048
# CPU usage limit of the daemon, in percent of a single core.
DEFAULT_CPU_USAGE_THRESHOLD = 200
# The monitor re-executes its own binary after this long (one day).
DEFAULT_REBOOT_TIMEOUT_SECONDS = 24 * 60 * 60
# Name of the marker file whose presence prevents the monitor from starting.
BLOCK_SIGN_FILE = "edit_monitor_block_sign"
# Enum of the Clearcut log source defined under
# /google3/wireless/android/play/playlog/proto/log_source_enum.proto
LOG_SOURCE = 2524
Zhuoyao Zhang53359552024-09-16 23:58:11 +000042
43
def default_daemon_target():
  """Stand-in daemon body used when no real target is supplied.

  It only announces itself on stdout and returns immediately.
  """
  message = "default daemon target"
  print(message)
47
48
class DaemonManager:
  """Class to manage and monitor the daemon run as a subprocess.

  The manager records its own pid in a pidfile (one per binary path) so that
  a newly started instance can find and stop the previous one, runs the
  daemon target in a child process, watches the child's resource usage and
  reports failures to Clearcut.
  """

  def __init__(
      self,
      binary_path: str,
      daemon_target: callable = default_daemon_target,
      daemon_args: tuple = (),
      cclient: clearcut_client.Clearcut | None = None,
  ):
    """Initializes the manager; does not start anything yet.

    Args:
      binary_path: Path of the binary that launched this process. It is hashed
        to name the pidfile and re-executed by reboot().
      daemon_target: Callable executed in the daemon subprocess.
      daemon_args: Positional arguments passed to daemon_target.
      cclient: Clearcut client used to report error events. A client for
        LOG_SOURCE is created when omitted.
    """
    self.binary_path = binary_path
    self.daemon_target = daemon_target
    self.daemon_args = daemon_args
    self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE)

    self.user_name = getpass.getuser()
    self.host_name = platform.node()
    self.source_root = os.environ.get("ANDROID_BUILD_TOP", "")
    self.pid = os.getpid()
    self.daemon_process = None

    # Peak values observed by monitor_daemon(): memory in MB, cpu in percent.
    self.max_memory_usage = 0
    self.max_cpu_usage = 0

    pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
    pid_file_dir.mkdir(parents=True, exist_ok=True)
    self.pid_file_path = self._get_pid_file_path(pid_file_dir)
    self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath(
        BLOCK_SIGN_FILE
    )

  def start(self):
    """Writes the pidfile and starts the daemon process.

    Returns without doing anything when the block sign exists or when the
    binary lives under /google/cog/.

    Raises:
      Exception: Any error hit while stopping the previous instance, writing
        the pidfile or spawning the daemon is logged, reported to Clearcut and
        re-raised unchanged.
    """
    if self.block_sign.exists():
      logging.warning("Block sign found, exiting...")
      return

    if self.binary_path.startswith("/google/cog/"):
      logging.warning("Edit monitor for cog is not supported, exiting...")
      return

    try:
      self._stop_any_existing_instance()
      self._write_pid_to_pidfile()
      self._start_daemon_process()
    except Exception as e:
      logging.exception("Failed to start daemon manager with error %s", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
      )
      # Bare raise keeps the original traceback intact.
      raise

  def monitor_daemon(
      self,
      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
      reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
  ):
    """Monitors the daemon process status.

    Periodically check the CPU/Memory usage of the daemon process as long as the
    process is still running and kill the process if the resource usage is above
    given thresholds.

    Args:
      interval: Seconds to sleep between two samples.
      memory_threshold: Peak resident memory (MB) at which the daemon is
        killed.
      cpu_threshold: Peak CPU usage (percent) at which the daemon is killed.
      reboot_timeout: Seconds after which reboot() is invoked.
    """
    if not self.daemon_process:
      return

    logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
    reboot_time = time.time() + reboot_timeout
    while self.daemon_process.is_alive():
      if time.time() > reboot_time:
        self.reboot()
      try:
        memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
        self.max_memory_usage = max(self.max_memory_usage, memory_usage)

        cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)

        time.sleep(interval)
      except Exception as e:
        # Logging the error and continue.
        logging.warning("Failed to monitor daemon process with error: %s", e)
        # Pace the retry: without a pause a persistent sampling failure would
        # spin this loop and flood the log for as long as the daemon lives.
        if self.daemon_process.is_alive():
          time.sleep(interval)

      if (
          self.max_memory_usage >= memory_threshold
          or self.max_cpu_usage >= cpu_threshold
      ):
        logging.error(
            "Daemon process is consuming too much resource, killing..."
        )
        self._send_error_event_to_clearcut(
            edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE
        )
        self._terminate_process(self.daemon_process.pid)

    logging.info(
        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
        " usage: %f.",
        self.daemon_process.pid,
        self.max_memory_usage,
        self.max_cpu_usage,
    )

  def stop(self):
    """Stops the daemon process and removes the pidfile.

    On failure the error is logged and reported to Clearcut and the process
    exits with status 1. Pending Clearcut events are flushed in every case.
    """

    logging.info("in daemon manager cleanup.")
    try:
      if self.daemon_process:
        # The daemon process might already in termination process,
        # wait some time before kill it explicitly.
        self._wait_for_process_terminate(self.daemon_process.pid, 1)
        if self.daemon_process.is_alive():
          self._terminate_process(self.daemon_process.pid)
      self._remove_pidfile()
      logging.info("Successfully stopped daemon manager.")
    except Exception as e:
      logging.exception("Failed to stop daemon manager with error %s", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
      )
      sys.exit(1)
    finally:
      self.cclient.flush_events()

  def reboot(self):
    """Reboots the current process.

    Stops the current daemon manager and reboots the entire process based on
    the binary file. Exits directly if the binary file no longer exists.
    """
    logging.info("Rebooting process based on binary %s.", self.binary_path)

    # Stop the current daemon manager first.
    self.stop()

    # If the binary no longer exists, exit directly.
    if not os.path.exists(self.binary_path):
      logging.info("binary %s no longer exists, exiting.", self.binary_path)
      sys.exit(0)

    try:
      os.execv(self.binary_path, sys.argv)
    except OSError as e:
      logging.exception("Failed to reboot process with error: %s.", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
      )
      sys.exit(1)  # Indicate an error occurred

  def cleanup(self):
    """Wipes out all edit monitor instances in the system.

    Stops all the existing edit monitor instances and place a block sign
    to prevent any edit monitor process to start. This method is only used
    in emergency case when there's something goes wrong with the edit monitor
    that requires immediate cleanup to prevent damage to the system.
    """
    logging.debug("Start cleaning up all existing instances.")
    self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP)

    try:
      # First places a block sign to prevent any edit monitor process to start.
      self.block_sign.touch()
    except OSError:
      # FileNotFoundError and PermissionError are both OSError subclasses.
      logging.exception("Failed to place the block sign")

    # Finds and kills all the existing instances of edit monitor.
    existing_instances_pids = self._find_all_instances_pids()
    for pid in existing_instances_pids:
      logging.info(
          "Found existing edit monitor instance with pid %d, killing...", pid
      )
      try:
        self._terminate_process(pid)
      except Exception:
        logging.exception("Failed to terminate process %d", pid)

  def _stop_any_existing_instance(self):
    """Terminates the instance recorded in the pidfile and removes the file."""
    if not self.pid_file_path.exists():
      logging.debug("No existing instances.")
      return

    ex_pid = self._read_pid_from_pidfile()

    if ex_pid:
      logging.info("Found another instance with pid %d.", ex_pid)
      self._terminate_process(ex_pid)
    # Also drop a pidfile that held no usable pid, otherwise the exclusive
    # create in _write_pid_to_pidfile() would fail on every later start.
    self._remove_pidfile()

  def _read_pid_from_pidfile(self) -> int | None:
    """Returns the pid stored in the pidfile, or None if it is unusable.

    None is returned when the file vanished in the meantime or when its
    content is not an integer (e.g. an empty or truncated file).
    """
    try:
      with open(self.pid_file_path, "r") as f:
        return int(f.read().strip())
    except FileNotFoundError:
      logging.warning("pidfile %s does not exist.", self.pid_file_path)
    except ValueError:
      logging.warning(
          "pidfile %s does not contain a valid pid.", self.pid_file_path
      )
    return None

  def _write_pid_to_pidfile(self):
    """Creates a pidfile and writes the current pid to the file.

    Raise FileExistsError if the pidfile already exists.
    """
    try:
      # Use the 'x' mode to open the file for exclusive creation
      with open(self.pid_file_path, "x") as f:
        f.write(f"{self.pid}")
    except FileExistsError:
      # This could be caused due to race condition that a user is trying
      # to start two edit monitors at the same time. Or because there is
      # already an existing edit monitor running and we can not kill it
      # for some reason.
      logging.exception("pidfile %s already exists.", self.pid_file_path)
      raise

  def _start_daemon_process(self):
    """Starts a subprocess to run the daemon."""
    p = multiprocessing.Process(
        target=self.daemon_target, args=self.daemon_args
    )
    p.daemon = True
    p.start()

    logging.info("Start subprocess with PID %d", p.pid)
    self.daemon_process = p

  def _terminate_process(
      self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
  ):
    """Terminates a process with given pid.

    It first sends a SIGTERM to the process to allow it for proper
    termination with a timeout. If the process is not terminated within
    the timeout, kills it forcefully.

    Non-positive pids are refused: os.kill() treats 0 and negative values as
    "a process group" / "every process", and the pids handled here are read
    from files in a shared temp directory.
    """
    if pid <= 0:
      logging.warning("Refusing to terminate invalid pid %d.", pid)
      return

    try:
      os.kill(pid, signal.SIGTERM)
      if not self._wait_for_process_terminate(pid, timeout):
        logging.warning(
            "Process %d not terminated within timeout, try force kill", pid
        )
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
      logging.info("Process with PID %d not found (already terminated)", pid)

  def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
    """Polls until the process is gone or the timeout (seconds) elapses.

    Returns:
      True if the process terminated in time, False otherwise.
    """
    # Monotonic clock: the deadline must not move when the wall clock jumps.
    deadline = time.monotonic() + timeout

    while True:
      if not self._is_process_alive(pid):
        return True
      remaining = deadline - time.monotonic()
      if remaining <= 0:
        break
      # Never sleep past the deadline, so the last check happens right at it.
      time.sleep(min(1, remaining))

    logging.error("Process %d not terminated within %d seconds.", pid, timeout)
    return False

  def _is_process_alive(self, pid: int) -> bool:
    """Returns whether `ps` reports the process as present and not a zombie.

    When `ps` itself cannot be run the process is assumed to be alive.
    """
    try:
      output = subprocess.check_output(
          ["ps", "-p", str(pid), "-o", "state="], text=True
      ).strip()
    except subprocess.CalledProcessError:
      # Process not found (already dead).
      return False
    except (FileNotFoundError, OSError, ValueError) as e:
      logging.warning(
          "Unable to check the status for process %d with error: %s.", pid, e
      )
      return True

    if not output:
      # ps printed no state at all: nothing to report for this pid.
      return False
    # Some ps implementations append flags to the state (e.g. "Z+"), so only
    # the leading character decides whether this is a zombie.
    return not output.split()[0].startswith("Z")

  def _remove_pidfile(self):
    """Deletes the pidfile; a missing file is not an error."""
    try:
      os.remove(self.pid_file_path)
    except FileNotFoundError:
      logging.info("pid file %s already removed.", self.pid_file_path)

  def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
    """Generates the path to store the pidfile.

    The file path should have the format of "/tmp/edit_monitor/xxxx.lock"
    where xxxx is a hashed value based on the binary path that starts the
    process.
    """
    hash_object = hashlib.sha256()
    hash_object.update(self.binary_path.encode("utf-8"))
    pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock")
    logging.info("pid_file_path: %s", pid_file_path)

    return pid_file_path

  def _read_proc_stat_fields(self, pid: int) -> list[str]:
    """Returns the fields of /proc/<pid>/stat that follow the comm field.

    The comm field (2nd) is wrapped in parentheses and may itself contain
    spaces or parentheses, so the line is cut after the last ")" instead of
    being split blindly. Index 0 of the result is field 3 (state); in general
    field k of the stat file is found at index k - 3.

    Raises:
      ValueError: If the line does not contain a comm field.
      OSError: If the stat file cannot be read.
    """
    with open(f"/proc/{pid}/stat", "r") as f:
      stat_line = f.read()
    _, closing_paren, after_comm = stat_line.rpartition(")")
    if not closing_paren:
      raise ValueError(f"Malformed stat line for process {pid}")
    return after_comm.split()

  def _get_process_memory_percent(self, pid: int) -> float:
    """Returns the resident memory of the process in MB.

    Despite its name the value is an absolute size, not a percentage; it is
    compared against the memory threshold of monitor_daemon().
    """
    stat_fields = self._read_proc_stat_fields(pid)
    # RSS is the 24th field in /proc/[pid]/stat, counted in pages.
    rss_pages = int(stat_fields[24 - 3])
    page_size_bytes = os.sysconf("SC_PAGE_SIZE")
    return rss_pages * page_size_bytes / (1024 * 1024)  # Convert to MB

  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
    """Returns the CPU usage of the process over `interval` seconds.

    The result is CPU time consumed divided by elapsed uptime, times 100, so
    a process saturating two cores reports about 200. Returns 0.0 when no
    uptime elapsed between the two samples.
    """
    total_start_time = self._get_total_cpu_time(pid)
    with open("/proc/uptime", "r") as f:
      uptime_start = float(f.readline().split()[0])

    time.sleep(interval)

    total_end_time = self._get_total_cpu_time(pid)
    with open("/proc/uptime", "r") as f:
      uptime_end = float(f.readline().split()[0])

    elapsed = uptime_end - uptime_start
    if elapsed <= 0:
      # Nothing measurable elapsed; avoid dividing by zero.
      return 0.0
    return (total_end_time - total_start_time) / elapsed * 100

  def _get_total_cpu_time(self, pid: int) -> float:
    """Returns the user + system CPU time of the process in seconds."""
    stat_fields = self._read_proc_stat_fields(pid)
    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
    utime = int(stat_fields[14 - 3])
    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
    stime = int(stat_fields[15 - 3])
    return (utime + stime) / os.sysconf("SC_CLK_TCK")

  def _find_all_instances_pids(self) -> list[int]:
    """Returns the pids stored in every *.lock file of the pidfile directory.

    Files that cannot be read or do not hold an integer are logged and
    skipped.
    """
    pids = []
    lock_dir = self.pid_file_path.parent

    for file in os.listdir(lock_dir):
      if not file.endswith(".lock"):
        continue
      try:
        with open(lock_dir.joinpath(file), "r") as f:
          pids.append(int(f.read().strip()))
      except (OSError, ValueError):
        logging.exception("Failed to get pid from file path: %s", file)

    return pids

  def _send_error_event_to_clearcut(self, error_type):
    """Logs an EditMonitorErrorEvent of the given type to Clearcut."""
    edit_monitor_error_event_proto = edit_event_pb2.EditEvent(
        user_name=self.user_name,
        host_name=self.host_name,
        source_root=self.source_root,
    )
    edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom(
        edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type)
    )
    log_event = clientanalytics_pb2.LogEvent(
        event_time_ms=int(time.time() * 1000),
        source_extension=edit_monitor_error_event_proto.SerializeToString(),
    )
    self.cclient.log(log_event)