blob: 9a0abb60976f516a12000dbda16182bc597e5c7e [file] [log] [blame]
# Copyright 2024, The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
15
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000016import getpass
Zhuoyao Zhang53359552024-09-16 23:58:11 +000017import hashlib
18import logging
19import multiprocessing
20import os
21import pathlib
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000022import platform
Zhuoyao Zhang53359552024-09-16 23:58:11 +000023import signal
24import subprocess
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000025import sys
Zhuoyao Zhang53359552024-09-16 23:58:11 +000026import tempfile
27import time
28
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000029from atest.metrics import clearcut_client
30from atest.proto import clientanalytics_pb2
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000031from edit_monitor import utils
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000032from proto import edit_event_pb2
Zhuoyao Zhang53359552024-09-16 23:58:11 +000033
# Seconds to wait after SIGTERM before a process is killed with SIGKILL.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 5
# Seconds to sleep between two resource-usage samples of the daemon.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# Memory limit for the daemon process, expressed in MB (i.e. 2 GB).
DEFAULT_MEMORY_USAGE_THRESHOLD = 2 * 1024
# CPU limit for the daemon process, in percent (100 == one fully busy core).
DEFAULT_CPU_USAGE_THRESHOLD = 200
# Seconds after which the monitoring loop reboots the manager (one day).
DEFAULT_REBOOT_TIMEOUT_SECONDS = 24 * 60 * 60
# Name of the file in the temp directory whose presence blocks any start.
BLOCK_SIGN_FILE = "edit_monitor_block_sign"
# Enum of the Clearcut log source defined under
# /google3/wireless/android/play/playlog/proto/log_source_enum.proto
LOG_SOURCE = 2524
Zhuoyao Zhang53359552024-09-16 23:58:11 +000043
44
def default_daemon_target():
  """No-op stand-in used when the caller supplies no daemon target.

  It only prints a fixed marker line to stdout.
  """
  marker = "default daemon target"
  print(marker)
48
49
class DaemonManager:
  """Class to manage and monitor the daemon run as a subprocess.

  The manager keeps one pidfile per binary path under
  "<tempdir>/edit_monitor", runs the daemon target in a
  `multiprocessing.Process`, samples the daemon's memory/CPU usage from
  /proc, and reports failures to Clearcut.
  """

  def __init__(
      self,
      binary_path: str,
      daemon_target: callable = default_daemon_target,
      daemon_args: tuple = (),
      cclient: clearcut_client.Clearcut | None = None,
  ):
    """Records the configuration and prepares the pidfile directory.

    Nothing is started here; the only side effect is creating the
    "<tempdir>/edit_monitor" directory if it does not exist yet.

    Args:
      binary_path: Path of the binary this process was started from. It is
        hashed into the pidfile name and re-executed by `reboot()`.
      daemon_target: Callable executed in the daemon subprocess.
      daemon_args: Positional arguments passed to `daemon_target`.
      cclient: Clearcut client used to log error events. When omitted, a
        client for `LOG_SOURCE` is created.
    """
    self.binary_path = binary_path
    self.daemon_target = daemon_target
    self.daemon_args = daemon_args
    self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE)

    self.user_name = getpass.getuser()
    self.host_name = platform.node()
    self.source_root = os.environ.get("ANDROID_BUILD_TOP", "")
    self.pid = os.getpid()
    self.daemon_process = None

    # Highest usage values observed so far by monitor_daemon().
    self.max_memory_usage = 0
    self.max_cpu_usage = 0

    temp_dir = pathlib.Path(tempfile.gettempdir())
    pid_file_dir = temp_dir.joinpath("edit_monitor")
    pid_file_dir.mkdir(parents=True, exist_ok=True)
    self.pid_file_path = self._get_pid_file_path(pid_file_dir)
    self.block_sign = temp_dir.joinpath(BLOCK_SIGN_FILE)

  def start(self):
    """Writes the pidfile and starts the daemon process.

    Returns without doing anything when the feature is disabled, when the
    block sign file exists, or when the binary lives under "/google/cog/".

    Raises:
      Exception: Any failure while stopping a previous instance, writing
        the pidfile or starting the subprocess. The failure is logged and
        reported to Clearcut before being re-raised.
    """
    if not utils.is_feature_enabled(
        "edit_monitor",
        self.user_name,
        "ENABLE_EDIT_MONITOR",
        "EDIT_MONITOR_ROLLOUT_PERCENTAGE",
    ):
      logging.warning("Edit monitor is disabled, exiting...")
      return

    if self.block_sign.exists():
      logging.warning("Block sign found, exiting...")
      return

    if self.binary_path.startswith("/google/cog/"):
      logging.warning("Edit monitor for cog is not supported, exiting...")
      return

    try:
      self._stop_any_existing_instance()
      self._write_pid_to_pidfile()
      self._start_daemon_process()
    except Exception as e:
      logging.exception("Failed to start daemon manager with error %s", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
      )
      raise

  def monitor_daemon(
      self,
      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
      reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
  ):
    """Monitors the daemon process status.

    Periodically check the CPU/Memory usage of the daemon process as long as
    the process is still running and kill the process if the resource usage
    is above given thresholds. Returns immediately when no daemon process
    has been started.

    Args:
      interval: Seconds to sleep between two samples.
      memory_threshold: Memory limit in MB; reaching it kills the daemon.
      cpu_threshold: CPU limit in percent; reaching it kills the daemon.
      reboot_timeout: Seconds after which `reboot()` is invoked.
    """
    if not self.daemon_process:
      return

    logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
    reboot_time = time.time() + reboot_timeout
    while self.daemon_process.is_alive():
      if time.time() > reboot_time:
        self.reboot()
      try:
        memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
        self.max_memory_usage = max(self.max_memory_usage, memory_usage)

        cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)

        time.sleep(interval)
      except Exception as e:
        # Logging the error and continue.
        logging.warning("Failed to monitor daemon process with error: %s", e)
        # If the daemon is gone the loop condition ends the monitoring right
        # away. If it is still alive the failure is likely to repeat, so back
        # off instead of spinning and flooding the log.
        if self.daemon_process.is_alive():
          time.sleep(interval)

      if (
          self.max_memory_usage >= memory_threshold
          or self.max_cpu_usage >= cpu_threshold
      ):
        logging.error(
            "Daemon process is consuming too much resource, killing..."
        )
        self._send_error_event_to_clearcut(
            edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE
        )
        self._terminate_process(self.daemon_process.pid)

    logging.info(
        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
        " usage: %f.",
        self.daemon_process.pid,
        self.max_memory_usage,
        self.max_cpu_usage,
    )

  def stop(self):
    """Stops the daemon process and removes the pidfile.

    On any failure the error is logged and reported to Clearcut and the
    process exits with status 1. Pending Clearcut events are flushed in
    every case.
    """

    logging.info("in daemon manager cleanup.")
    try:
      if self.daemon_process:
        # The daemon process might already in termination process,
        # wait some time before kill it explicitly.
        self._wait_for_process_terminate(self.daemon_process.pid, 1)
        if self.daemon_process.is_alive():
          self._terminate_process(self.daemon_process.pid)
      self._remove_pidfile()
      logging.info("Successfully stopped daemon manager.")
    except Exception as e:
      logging.exception("Failed to stop daemon manager with error %s", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
      )
      sys.exit(1)
    finally:
      self.cclient.flush_events()

  def reboot(self):
    """Reboots the current process.

    Stops the current daemon manager and reboots the entire process based on
    the binary file. Exits directly if the binary file no longer exists.
    This method does not return: it either replaces the process image or
    exits.
    """
    logging.info("Rebooting process based on binary %s.", self.binary_path)

    # Stop the current daemon manager first.
    self.stop()

    # If the binary no longer exists, exit directly.
    if not os.path.exists(self.binary_path):
      logging.info("binary %s no longer exists, exiting.", self.binary_path)
      sys.exit(0)

    try:
      os.execv(self.binary_path, sys.argv)
    except OSError as e:
      logging.exception("Failed to reboot process with error: %s.", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
      )
      sys.exit(1)  # Indicate an error occurred

  def cleanup(self):
    """Wipes out all edit monitor instances in the system.

    Stops all the existing edit monitor instances and places a block sign
    to prevent any edit monitor process from starting. This method is only
    used in emergency cases when something goes wrong with the edit monitor
    that requires immediate cleanup to prevent damage to the system.
    """
    logging.debug("Start cleaning up all existing instances.")
    self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP)

    try:
      # First places a block sign to prevent any edit monitor process to start.
      self.block_sign.touch()
    except OSError:
      # Best effort: still try to kill the running instances below.
      logging.exception("Failed to place the block sign")

    # Finds and kills all the existing instances of edit monitor.
    for pid in self._find_all_instances_pids():
      logging.info(
          "Found existing edit monitor instance with pid %d, killing...", pid
      )
      try:
        self._terminate_process(pid)
      except Exception:
        logging.exception("Failed to terminate process %d", pid)

  def _stop_any_existing_instance(self):
    """Terminates the instance recorded in the pidfile and removes the file.

    A pidfile that cannot be read or does not hold a usable pid is treated
    as stale: nothing is signalled, but the file is still removed so that
    the following exclusive pidfile creation can succeed.
    """
    if not self.pid_file_path.exists():
      logging.debug("No existing instances.")
      return

    try:
      ex_pid = self._read_pid_from_pidfile()
    except (OSError, ValueError):
      logging.warning(
          "Ignoring unreadable pidfile %s.", self.pid_file_path, exc_info=True
      )
      ex_pid = None

    # Pids <= 0 address process groups in os.kill() and must never be
    # signalled; our own pid can only come from a stale file.
    if ex_pid is not None and ex_pid > 0 and ex_pid != self.pid:
      logging.info("Found another instance with pid %d.", ex_pid)
      self._terminate_process(ex_pid)
    self._remove_pidfile()

  def _read_pid_from_pidfile(self):
    """Returns the pid stored in the pidfile as an int.

    Raises:
      OSError: If the pidfile cannot be opened.
      ValueError: If the content is not an integer.
    """
    with open(self.pid_file_path, "r") as f:
      return int(f.read().strip())

  def _write_pid_to_pidfile(self):
    """Creates a pidfile and writes the current pid to the file.

    Raises:
      FileExistsError: If the pidfile already exists.
    """
    try:
      # Use the 'x' mode to open the file for exclusive creation
      with open(self.pid_file_path, "x") as f:
        f.write(f"{self.pid}")
    except FileExistsError:
      # This could be caused due to race condition that a user is trying
      # to start two edit monitors at the same time. Or because there is
      # already an existing edit monitor running and we can not kill it
      # for some reason.
      logging.exception("pidfile %s already exists.", self.pid_file_path)
      raise

  def _start_daemon_process(self):
    """Starts a subprocess to run the daemon."""
    p = multiprocessing.Process(
        target=self.daemon_target, args=self.daemon_args
    )
    p.daemon = True
    p.start()

    logging.info("Start subprocess with PID %d", p.pid)
    self.daemon_process = p

  def _terminate_process(
      self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
  ):
    """Terminates a process with given pid.

    It first sends a SIGTERM to the process to allow it for proper
    termination with a timeout. If the process is not terminated within
    the timeout, kills it forcefully. A process that no longer exists is
    only logged; any other error raised by `os.kill` propagates.

    Args:
      pid: Id of the process to terminate.
      timeout: Seconds to wait after SIGTERM before sending SIGKILL.
    """
    try:
      os.kill(pid, signal.SIGTERM)
      if not self._wait_for_process_terminate(pid, timeout):
        logging.warning(
            "Process %d not terminated within timeout, try force kill", pid
        )
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
      logging.info("Process with PID %d not found (already terminated)", pid)

  def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
    """Polls once per second until the process is gone or time runs out.

    Args:
      pid: Id of the process to wait for.
      timeout: Maximum number of seconds to wait.

    Returns:
      True if the process terminated, False if it was still alive at the
      end of the timeout.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
      if not self._is_process_alive(pid):
        return True
      time.sleep(1)

    # The process may have exited during the last sleep; check once more
    # before reporting a timeout.
    if not self._is_process_alive(pid):
      return True

    logging.error("Process %d not terminated within %d seconds.", pid, timeout)
    return False

  def _is_process_alive(self, pid: int) -> bool:
    """Returns whether `ps` reports the process in a non-zombie state.

    If the state cannot be determined the process is assumed to be alive.
    """
    try:
      output = subprocess.check_output(
          ["ps", "-p", str(pid), "-o", "state="], text=True
      ).strip()
      state = output.split()[0]
      return state != "Z"  # Check if the state is not 'Z' (zombie)
    except subprocess.CalledProcessError:
      # Process not found (already dead).
      return False
    except (OSError, ValueError, IndexError) as e:
      # IndexError covers a successful `ps` call that printed nothing.
      logging.warning(
          "Unable to check the status for process %d with error: %s.", pid, e
      )
      return True

  def _remove_pidfile(self):
    """Deletes the pidfile; a file that is already gone is only logged."""
    try:
      os.remove(self.pid_file_path)
    except FileNotFoundError:
      logging.info("pid file %s already removed.", self.pid_file_path)

  def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
    """Generates the path to store the pidfile.

    The file path should have the format of "/tmp/edit_monitor/xxxx.lock"
    where xxxx is a hashed value based on the binary path that starts the
    process.
    """
    hash_object = hashlib.sha256()
    hash_object.update(self.binary_path.encode("utf-8"))
    pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock")
    logging.info("pid_file_path: %s", pid_file_path)

    return pid_file_path

  def _read_proc_stat_fields(self, pid: int) -> list[str]:
    """Reads /proc/<pid>/stat and returns its fields as a 0-indexed list.

    The second field (comm) is wrapped in parentheses and may itself
    contain spaces, so a plain split() could shift every later field. The
    line is split around the last ')' and comm is kept as one element, so
    list index N-1 always holds proc(5) field N.
    """
    with open(f"/proc/{pid}/stat", "r") as f:
      stat_line = f.readline()

    head, sep, tail = stat_line.rpartition(")")
    if not sep:
      # Unexpected layout; fall back to plain whitespace splitting.
      return stat_line.split()
    pid_field, _, comm = head.partition("(")
    return [pid_field.strip(), comm] + tail.split()

  def _get_process_memory_percent(self, pid: int) -> float:
    """Returns the resident set size of the process in MB.

    Despite the method name, the value is an absolute size in MB, not a
    percentage.
    """
    stat_data = self._read_proc_stat_fields(pid)
    # RSS is the 24th field in /proc/[pid]/stat, counted in pages.
    rss_pages = int(stat_data[23])
    page_size_bytes = os.sysconf("SC_PAGE_SIZE")
    return rss_pages * page_size_bytes / (1024 * 1024)  # Convert to MB

  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
    """Returns the CPU usage of the process over `interval` seconds.

    The result is the CPU time consumed divided by the elapsed uptime, in
    percent, so it can exceed 100 for a process using several cores.
    Blocks for `interval` seconds.
    """
    total_start_time = self._get_total_cpu_time(pid)
    with open("/proc/uptime", "r") as f:
      uptime_start = float(f.readline().split()[0])

    time.sleep(interval)

    total_end_time = self._get_total_cpu_time(pid)
    with open("/proc/uptime", "r") as f:
      uptime_end = float(f.readline().split()[0])

    elapsed = uptime_end - uptime_start
    if elapsed <= 0:
      # No measurable time passed (e.g. interval of 0); avoid dividing by 0.
      return 0.0
    return (total_end_time - total_start_time) / elapsed * 100

  def _get_total_cpu_time(self, pid: int) -> float:
    """Returns the user + system CPU time of the process in seconds."""
    stats = self._read_proc_stat_fields(pid)
    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
    utime = int(stats[13])
    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
    stime = int(stats[14])
    return (utime + stime) / os.sysconf("SC_CLK_TCK")

  def _find_all_instances_pids(self) -> list[int]:
    """Returns the pids stored in all "*.lock" files of the pidfile dir.

    Files that cannot be read or parsed are logged and skipped, as are
    pids <= 0, which `os.kill` would interpret as a process group.
    """
    pids = []
    pid_file_dir = self.pid_file_path.parent

    for file_name in os.listdir(pid_file_dir):
      if not file_name.endswith(".lock"):
        continue
      try:
        with open(pid_file_dir.joinpath(file_name), "r") as f:
          pid = int(f.read().strip())
      except (OSError, ValueError):
        logging.exception("Failed to get pid from file path: %s", file_name)
        continue
      if pid > 0:
        pids.append(pid)
      else:
        logging.warning(
            "Ignoring invalid pid %d from file path: %s", pid, file_name
        )

    return pids

  def _send_error_event_to_clearcut(self, error_type):
    """Logs an EditMonitorErrorEvent of the given type to Clearcut.

    Args:
      error_type: Error type value set on the EditMonitorErrorEvent.
    """
    edit_monitor_error_event_proto = edit_event_pb2.EditEvent(
        user_name=self.user_name,
        host_name=self.host_name,
        source_root=self.source_root,
    )
    edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom(
        edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type)
    )
    log_event = clientanalytics_pb2.LogEvent(
        event_time_ms=int(time.time() * 1000),
        source_extension=edit_monitor_error_event_proto.SerializeToString(),
    )
    self.cclient.log(log_event)