blob: 328b6ee9a686b7ba09f232fe420e1cfd5407cf01 [file] [log] [blame]
Zhuoyao Zhang53359552024-09-16 23:58:11 +00001# Copyright 2024, The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000016import getpass
Zhuoyao Zhang53359552024-09-16 23:58:11 +000017import hashlib
18import logging
19import multiprocessing
20import os
21import pathlib
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000022import platform
Zhuoyao Zhang53359552024-09-16 23:58:11 +000023import signal
24import subprocess
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000025import sys
Zhuoyao Zhang53359552024-09-16 23:58:11 +000026import tempfile
27import time
28
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000029from atest.metrics import clearcut_client
30from atest.proto import clientanalytics_pb2
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000031from edit_monitor import utils
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000032from proto import edit_event_pb2
Zhuoyao Zhang53359552024-09-16 23:58:11 +000033
Zhuoyao Zhang5d162222024-10-24 23:10:04 +000034DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 5
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +000035DEFAULT_MONITOR_INTERVAL_SECONDS = 5
Zhuoyao Zhang5d162222024-10-24 23:10:04 +000036DEFAULT_MEMORY_USAGE_THRESHOLD = 2 * 1024 # 2GB
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000037DEFAULT_CPU_USAGE_THRESHOLD = 200
38DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +000039BLOCK_SIGN_FILE = "edit_monitor_block_sign"
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000040# Enum of the Clearcut log source defined under
41# /google3/wireless/android/play/playlog/proto/log_source_enum.proto
42LOG_SOURCE = 2524
Zhuoyao Zhang53359552024-09-16 23:58:11 +000043
44
45def default_daemon_target():
46 """Place holder for the default daemon target."""
47 print("default daemon target")
48
49
50class DaemonManager:
51 """Class to manage and monitor the daemon run as a subprocess."""
52
53 def __init__(
54 self,
55 binary_path: str,
56 daemon_target: callable = default_daemon_target,
57 daemon_args: tuple = (),
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000058 cclient: clearcut_client.Clearcut | None = None,
Zhuoyao Zhang53359552024-09-16 23:58:11 +000059 ):
60 self.binary_path = binary_path
61 self.daemon_target = daemon_target
62 self.daemon_args = daemon_args
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000063 self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE)
Zhuoyao Zhang53359552024-09-16 23:58:11 +000064
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000065 self.user_name = getpass.getuser()
66 self.host_name = platform.node()
67 self.source_root = os.environ.get("ANDROID_BUILD_TOP", "")
Zhuoyao Zhang53359552024-09-16 23:58:11 +000068 self.pid = os.getpid()
69 self.daemon_process = None
70
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +000071 self.max_memory_usage = 0
72 self.max_cpu_usage = 0
73
Zhuoyao Zhang53359552024-09-16 23:58:11 +000074 pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
75 pid_file_dir.mkdir(parents=True, exist_ok=True)
76 self.pid_file_path = self._get_pid_file_path(pid_file_dir)
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +000077 self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath(
78 BLOCK_SIGN_FILE
79 )
Zhuoyao Zhang53359552024-09-16 23:58:11 +000080
81 def start(self):
82 """Writes the pidfile and starts the daemon proces."""
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000083 if not utils.is_feature_enabled(
84 "edit_monitor",
85 self.user_name,
Zhuoyao Zhangd1c4a8b2024-11-06 21:48:45 +000086 "ENABLE_ANDROID_EDIT_MONITOR",
Zhuoyao Zhanga5bec262024-11-05 19:28:23 +000087 10,
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000088 ):
89 logging.warning("Edit monitor is disabled, exiting...")
90 return
91
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +000092 if self.block_sign.exists():
93 logging.warning("Block sign found, exiting...")
94 return
95
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000096 if self.binary_path.startswith("/google/cog/"):
Zhuoyao Zhang05e28fa2024-10-04 21:58:39 +000097 logging.warning("Edit monitor for cog is not supported, exiting...")
98 return
99
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000100 try:
101 self._stop_any_existing_instance()
102 self._write_pid_to_pidfile()
103 self._start_daemon_process()
104 except Exception as e:
105 logging.exception("Failed to start daemon manager with error %s", e)
106 self._send_error_event_to_clearcut(
107 edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
108 )
109 raise e
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000110
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000111 def monitor_daemon(
112 self,
113 interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
114 memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
115 cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000116 reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000117 ):
118 """Monits the daemon process status.
119
120 Periodically check the CPU/Memory usage of the daemon process as long as the
121 process is still running and kill the process if the resource usage is above
122 given thresholds.
123 """
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000124 if not self.daemon_process:
125 return
126
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000127 logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000128 reboot_time = time.time() + reboot_timeout
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000129 while self.daemon_process.is_alive():
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000130 if time.time() > reboot_time:
131 self.reboot()
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000132 try:
133 memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
134 self.max_memory_usage = max(self.max_memory_usage, memory_usage)
135
136 cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
137 self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
138
139 time.sleep(interval)
140 except Exception as e:
141 # Logging the error and continue.
142 logging.warning("Failed to monitor daemon process with error: %s", e)
143
Zhuoyao Zhang585b4342024-11-12 22:46:43 +0000144 if self.max_memory_usage >= memory_threshold:
145 self._handle_resource_exhausted_error("memory")
146
147 if self.max_cpu_usage >= cpu_threshold:
148 self._handle_resource_exhausted_error("cpu")
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000149
150 logging.info(
151 "Daemon process %d terminated. Max memory usage: %f, Max cpu"
152 " usage: %f.",
153 self.daemon_process.pid,
154 self.max_memory_usage,
155 self.max_cpu_usage,
156 )
157
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000158 def stop(self):
159 """Stops the daemon process and removes the pidfile."""
160
Zhuoyao Zhang64ad75f2024-10-25 00:21:45 +0000161 logging.info("in daemon manager cleanup.")
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000162 try:
Zhuoyao Zhang8a225792024-10-09 18:04:39 +0000163 if self.daemon_process:
164 # The daemon process might already in termination process,
165 # wait some time before kill it explicitly.
166 self._wait_for_process_terminate(self.daemon_process.pid, 1)
167 if self.daemon_process.is_alive():
168 self._terminate_process(self.daemon_process.pid)
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000169 self._remove_pidfile()
Zhuoyao Zhang64ad75f2024-10-25 00:21:45 +0000170 logging.info("Successfully stopped daemon manager.")
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000171 except Exception as e:
172 logging.exception("Failed to stop daemon manager with error %s", e)
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000173 self._send_error_event_to_clearcut(
174 edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
175 )
176 sys.exit(1)
177 finally:
178 self.cclient.flush_events()
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000179
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000180 def reboot(self):
181 """Reboots the current process.
182
183 Stops the current daemon manager and reboots the entire process based on
184 the binary file. Exits directly If the binary file no longer exists.
185 """
Zhuoyao Zhang64ad75f2024-10-25 00:21:45 +0000186 logging.info("Rebooting process based on binary %s.", self.binary_path)
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000187
188 # Stop the current daemon manager first.
189 self.stop()
190
191 # If the binary no longer exists, exit directly.
192 if not os.path.exists(self.binary_path):
193 logging.info("binary %s no longer exists, exiting.", self.binary_path)
194 sys.exit(0)
195
196 try:
197 os.execv(self.binary_path, sys.argv)
198 except OSError as e:
199 logging.exception("Failed to reboot process with error: %s.", e)
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000200 self._send_error_event_to_clearcut(
201 edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
202 )
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +0000203 sys.exit(1) # Indicate an error occurred
204
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000205 def cleanup(self):
206 """Wipes out all edit monitor instances in the system.
207
208 Stops all the existing edit monitor instances and place a block sign
209 to prevent any edit monitor process to start. This method is only used
210 in emergency case when there's something goes wrong with the edit monitor
211 that requires immediate cleanup to prevent damanger to the system.
212 """
213 logging.debug("Start cleaning up all existing instances.")
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000214 self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP)
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000215
216 try:
217 # First places a block sign to prevent any edit monitor process to start.
218 self.block_sign.touch()
219 except (FileNotFoundError, PermissionError, OSError):
220 logging.exception("Failed to place the block sign")
221
222 # Finds and kills all the existing instances of edit monitor.
223 existing_instances_pids = self._find_all_instances_pids()
224 for pid in existing_instances_pids:
225 logging.info(
226 "Found existing edit monitor instance with pid %d, killing...", pid
227 )
228 try:
229 self._terminate_process(pid)
230 except Exception:
231 logging.exception("Failed to terminate process %d", pid)
232
Zhuoyao Zhang4d485592024-09-17 21:14:38 +0000233 def _stop_any_existing_instance(self):
234 if not self.pid_file_path.exists():
235 logging.debug("No existing instances.")
236 return
237
238 ex_pid = self._read_pid_from_pidfile()
239
240 if ex_pid:
241 logging.info("Found another instance with pid %d.", ex_pid)
242 self._terminate_process(ex_pid)
243 self._remove_pidfile()
244
245 def _read_pid_from_pidfile(self):
246 with open(self.pid_file_path, "r") as f:
247 return int(f.read().strip())
248
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000249 def _write_pid_to_pidfile(self):
250 """Creates a pidfile and writes the current pid to the file.
251
252 Raise FileExistsError if the pidfile already exists.
253 """
254 try:
255 # Use the 'x' mode to open the file for exclusive creation
256 with open(self.pid_file_path, "x") as f:
257 f.write(f"{self.pid}")
258 except FileExistsError as e:
259 # This could be caused due to race condition that a user is trying
260 # to start two edit monitors at the same time. Or because there is
261 # already an existing edit monitor running and we can not kill it
262 # for some reason.
263 logging.exception("pidfile %s already exists.", self.pid_file_path)
264 raise e
265
266 def _start_daemon_process(self):
267 """Starts a subprocess to run the daemon."""
268 p = multiprocessing.Process(
269 target=self.daemon_target, args=self.daemon_args
270 )
Zhuoyao Zhang8a225792024-10-09 18:04:39 +0000271 p.daemon = True
Zhuoyao Zhang53359552024-09-16 23:58:11 +0000272 p.start()
273
274 logging.info("Start subprocess with PID %d", p.pid)
275 self.daemon_process = p
276
277 def _terminate_process(
278 self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
279 ):
280 """Terminates a process with given pid.
281
282 It first sends a SIGTERM to the process to allow it for proper
283 termination with a timeout. If the process is not terminated within
284 the timeout, kills it forcefully.
285 """
286 try:
287 os.kill(pid, signal.SIGTERM)
288 if not self._wait_for_process_terminate(pid, timeout):
289 logging.warning(
290 "Process %d not terminated within timeout, try force kill", pid
291 )
292 os.kill(pid, signal.SIGKILL)
293 except ProcessLookupError:
294 logging.info("Process with PID %d not found (already terminated)", pid)
295
296 def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
297 start_time = time.time()
298
299 while time.time() < start_time + timeout:
300 if not self._is_process_alive(pid):
301 return True
302 time.sleep(1)
303
304 logging.error("Process %d not terminated within %d seconds.", pid, timeout)
305 return False
306
307 def _is_process_alive(self, pid: int) -> bool:
308 try:
309 output = subprocess.check_output(
310 ["ps", "-p", str(pid), "-o", "state="], text=True
311 ).strip()
312 state = output.split()[0]
313 return state != "Z" # Check if the state is not 'Z' (zombie)
314 except subprocess.CalledProcessError:
315 # Process not found (already dead).
316 return False
317 except (FileNotFoundError, OSError, ValueError) as e:
318 logging.warning(
319 "Unable to check the status for process %d with error: %s.", pid, e
320 )
321 return True
322
323 def _remove_pidfile(self):
324 try:
325 os.remove(self.pid_file_path)
326 except FileNotFoundError:
327 logging.info("pid file %s already removed.", self.pid_file_path)
328
329 def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
330 """Generates the path to store the pidfile.
331
332 The file path should have the format of "/tmp/edit_monitor/xxxx.lock"
333 where xxxx is a hashed value based on the binary path that starts the
334 process.
335 """
336 hash_object = hashlib.sha256()
337 hash_object.update(self.binary_path.encode("utf-8"))
338 pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock")
339 logging.info("pid_file_path: %s", pid_file_path)
340
341 return pid_file_path
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000342
343 def _get_process_memory_percent(self, pid: int) -> float:
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000344 with open(f"/proc/{pid}/stat", "r") as f:
345 stat_data = f.readline().split()
346 # RSS is the 24th field in /proc/[pid]/stat
347 rss_pages = int(stat_data[23])
348 return rss_pages * 4 / 1024 # Covert to MB
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000349
350 def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000351 total_start_time = self._get_total_cpu_time(pid)
352 with open("/proc/uptime", "r") as f:
353 uptime_start = float(f.readline().split()[0])
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000354
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000355 time.sleep(interval)
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000356
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000357 total_end_time = self._get_total_cpu_time(pid)
358 with open("/proc/uptime", "r") as f:
359 uptime_end = float(f.readline().split()[0])
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000360
Zhuoyao Zhang5d162222024-10-24 23:10:04 +0000361 return (
362 (total_end_time - total_start_time)
363 / (uptime_end - uptime_start)
364 * 100
365 )
Zhuoyao Zhangdc2840d2024-09-19 23:29:27 +0000366
367 def _get_total_cpu_time(self, pid: int) -> float:
368 with open(f"/proc/{str(pid)}/stat", "r") as f:
369 stats = f.readline().split()
370 # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
371 utime = int(stats[13])
372 # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
373 stime = int(stats[14])
374 return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"])
375
Zhuoyao Zhangd28da5c2024-09-24 19:46:12 +0000376 def _find_all_instances_pids(self) -> list[int]:
377 pids = []
378
379 for file in os.listdir(self.pid_file_path.parent):
380 if file.endswith(".lock"):
381 try:
382 with open(self.pid_file_path.parent.joinpath(file), "r") as f:
383 pids.append(int(f.read().strip()))
384 except (FileNotFoundError, IOError, ValueError, TypeError):
385 logging.exception("Failed to get pid from file path: %s", file)
386
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000387 return pids
388
Zhuoyao Zhang585b4342024-11-12 22:46:43 +0000389 def _handle_resource_exhausted_error(self, resource_type:str):
390 if resource_type == "memory":
391 self._send_error_event_to_clearcut(
392 edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE
393 )
394 else:
395 self._send_error_event_to_clearcut(
396 edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE
397 )
398 logging.error(
399 "Daemon process is consuming too much %s, killing...", resource_type
400 ),
401 self._terminate_process(self.daemon_process.pid)
402
Zhuoyao Zhangba64f312024-10-14 20:32:53 +0000403 def _send_error_event_to_clearcut(self, error_type):
404 edit_monitor_error_event_proto = edit_event_pb2.EditEvent(
405 user_name=self.user_name,
406 host_name=self.host_name,
407 source_root=self.source_root,
408 )
409 edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom(
410 edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type)
411 )
412 log_event = clientanalytics_pb2.LogEvent(
413 event_time_ms=int(time.time() * 1000),
414 source_extension=edit_monitor_error_event_proto.SerializeToString(),
415 )
416 self.cclient.log(log_event)