blob: c73603c488c4a5cc4ef8ae2ed9e3f7d888805a68 [file] [log] [blame]
Zhuoyao Zhang53359552024-09-16 23:58:11 +00001# Copyright 2024, The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000016import getpass
Zhuoyao Zhang53359552024-09-16 23:58:11 +000017import hashlib
18import logging
19import multiprocessing
20import os
21import pathlib
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000022import platform
Zhuoyao Zhang53359552024-09-16 23:58:11 +000023import signal
24import subprocess
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000025import sys
Zhuoyao Zhang53359552024-09-16 23:58:11 +000026import tempfile
27import time
28
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000029from atest.metrics import clearcut_client
30from atest.proto import clientanalytics_pb2
Zhuoyao Zhang3ca7cef2024-10-31 22:07:31 +000031from edit_monitor import utils
Zhuoyao Zhangba64f312024-10-14 20:32:53 +000032from proto import edit_event_pb2
Zhuoyao Zhang53359552024-09-16 23:58:11 +000033
# Seconds to wait after SIGTERM before a process is force-killed with SIGKILL.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 5
# Seconds the monitor loop sleeps between two resource-usage samples.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# Upper bound for the daemon's resident memory, expressed as a fraction of the
# machine's total physical memory.
DEFAULT_MEMORY_USAGE_THRESHOLD = 0.02  # 2% of total memory
# Upper bound for the daemon's CPU usage in percent of one core's time
# (CPU seconds / elapsed seconds * 100), so 200 means two fully busy cores.
DEFAULT_CPU_USAGE_THRESHOLD = 200
# Seconds (24 hours) after which the monitor loop reboots the whole process.
DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
# Name of the file under the temp directory whose presence prevents the edit
# monitor from starting.
BLOCK_SIGN_FILE = "edit_monitor_block_sign"
# Enum of the Clearcut log source defined under
# /google3/wireless/android/play/playlog/proto/log_source_enum.proto
LOG_SOURCE = 2524
Zhuoyao Zhang53359552024-09-16 23:58:11 +000043
44
def default_daemon_target():
  """Placeholder daemon target used when the caller supplies none.

  Prints a fixed marker line to stdout and returns.
  """
  marker = "default daemon target"
  print(marker)
48
49
class DaemonManager:
  """Manages and monitors a daemon that runs as a subprocess.

  The manager records its own pid in a pidfile whose name is derived from the
  binary path, starts the daemon target in a `multiprocessing.Process`,
  samples the daemon's memory/CPU usage and kills it when a threshold is
  reached. Failures are reported to Clearcut as edit monitor error events.
  """

  def __init__(
      self,
      binary_path: str,
      daemon_target: callable = default_daemon_target,
      daemon_args: tuple = (),
      cclient: clearcut_client.Clearcut | None = None,
  ):
    """Initializes the manager; does not start anything yet.

    Args:
      binary_path: Path of the binary that started this process. It is hashed
        to name the pidfile and is the binary re-executed by `reboot`.
      daemon_target: Callable run in the daemon subprocess.
      daemon_args: Positional arguments passed to `daemon_target`.
      cclient: Clearcut client used to report errors. A client for
        `LOG_SOURCE` is created when omitted.
    """
    self.binary_path = binary_path
    self.daemon_target = daemon_target
    self.daemon_args = daemon_args
    self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE)

    self.user_name = getpass.getuser()
    self.host_name = platform.node()
    self.source_root = os.environ.get("ANDROID_BUILD_TOP", "")
    self.pid = os.getpid()
    self.daemon_process = None

    # Highest usage values observed so far by `monitor_daemon`.
    self.max_memory_usage = 0
    self.max_cpu_usage = 0
    # Total physical memory of the machine in bytes.
    self.total_memory_size = os.sysconf("SC_PAGE_SIZE") * os.sysconf(
        "SC_PHYS_PAGES"
    )

    pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
    pid_file_dir.mkdir(parents=True, exist_ok=True)
    self.pid_file_path = self._get_pid_file_path(pid_file_dir)
    self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath(
        BLOCK_SIGN_FILE
    )

  def start(self):
    """Writes the pidfile and starts the daemon process.

    Returns without doing anything when the feature is disabled, when the
    block sign file exists, or when the binary lives under "/google/cog/".

    Raises:
      Exception: Whatever stopping the previous instance, writing the pidfile
        or starting the daemon raised. A FAILED_TO_START_EDIT_MONITOR event is
        logged to Clearcut before the exception is re-raised.
    """
    # NOTE(review): the last two arguments look like an env-var override name
    # and a rollout percentage -- confirm against utils.is_feature_enabled.
    if not utils.is_feature_enabled(
        "edit_monitor",
        self.user_name,
        "ENABLE_ANDROID_EDIT_MONITOR",
        50,
    ):
      logging.warning("Edit monitor is disabled, exiting...")
      return

    if self.block_sign.exists():
      logging.warning("Block sign found, exiting...")
      return

    if self.binary_path.startswith("/google/cog/"):
      logging.warning("Edit monitor for cog is not supported, exiting...")
      return

    try:
      self._stop_any_existing_instance()
      self._write_pid_to_pidfile()
      self._start_daemon_process()
    except Exception as e:
      logging.exception("Failed to start daemon manager with error %s", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR
      )
      # Bare raise keeps the original traceback intact.
      raise

  def monitor_daemon(
      self,
      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
      reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
  ):
    """Monitors the daemon process status.

    Periodically checks the CPU/memory usage of the daemon process as long as
    the process is still running and kills the process if the highest usage
    observed so far reaches the given thresholds. Returns immediately when no
    daemon process has been started.

    Args:
      interval: Seconds to sleep between two samples.
      memory_threshold: Memory limit as a fraction of total physical memory.
      cpu_threshold: CPU limit in percent of one core's time.
      reboot_timeout: Seconds after which `reboot` is invoked.
    """
    if not self.daemon_process:
      return

    logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
    reboot_time = time.time() + reboot_timeout
    while self.daemon_process.is_alive():
      if time.time() > reboot_time:
        self.reboot()
      try:
        memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
        self.max_memory_usage = max(self.max_memory_usage, memory_usage)

        cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)

        time.sleep(interval)
      except Exception as e:
        # Logging the error and continue.
        logging.warning("Failed to monitor daemon process with error: %s", e)
        # Back off before retrying so a persistent sampling failure does not
        # turn this loop into a busy spin. Skip the wait when the failure was
        # simply caused by the daemon having exited.
        if self.daemon_process.is_alive():
          time.sleep(interval)

      if self.max_memory_usage >= memory_threshold:
        self._handle_resource_exhausted_error("memory")

      if self.max_cpu_usage >= cpu_threshold:
        self._handle_resource_exhausted_error("cpu")

    logging.info(
        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
        " usage: %f.",
        self.daemon_process.pid,
        self.max_memory_usage,
        self.max_cpu_usage,
    )

  def stop(self):
    """Stops the daemon process and removes the pidfile.

    On failure a FAILED_TO_STOP_EDIT_MONITOR event is logged and the process
    exits with status 1. Pending Clearcut events are flushed in every case.
    """

    logging.info("in daemon manager cleanup.")
    try:
      if self.daemon_process:
        # The daemon process might already in termination process,
        # wait some time before kill it explicitly.
        self._wait_for_process_terminate(self.daemon_process.pid, 1)
        if self.daemon_process.is_alive():
          self._terminate_process(self.daemon_process.pid)
      self._remove_pidfile()
      logging.info("Successfully stopped daemon manager.")
    except Exception as e:
      logging.exception("Failed to stop daemon manager with error %s", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR
      )
      sys.exit(1)
    finally:
      self.cclient.flush_events()

  def reboot(self):
    """Reboots the current process.

    Stops the current daemon manager and re-executes the binary at
    `binary_path` with the current `sys.argv`. Exits with status 0 if the
    binary no longer exists, and with status 1 (after logging a
    FAILED_TO_REBOOT_EDIT_MONITOR event) if the exec call fails.
    """
    logging.info("Rebooting process based on binary %s.", self.binary_path)

    # Stop the current daemon manager first.
    self.stop()

    # If the binary no longer exists, exit directly.
    if not os.path.exists(self.binary_path):
      logging.info("binary %s no longer exists, exiting.", self.binary_path)
      sys.exit(0)

    try:
      os.execv(self.binary_path, sys.argv)
    except OSError as e:
      logging.exception("Failed to reboot process with error: %s.", e)
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR
      )
      sys.exit(1)  # Indicate an error occurred

  def cleanup(self):
    """Wipes out all edit monitor instances in the system.

    Stops all the existing edit monitor instances and places a block sign
    to prevent any edit monitor process from starting. This method is only
    used in emergency cases when something goes wrong with the edit monitor
    that requires immediate cleanup to prevent damage to the system.
    """
    logging.debug("Start cleaning up all existing instances.")
    self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP)

    try:
      # First places a block sign to prevent any edit monitor process to start.
      self.block_sign.touch()
    except OSError:
      # FileNotFoundError and PermissionError are OSError subclasses. Keep
      # going: killing the running instances is still worthwhile.
      logging.exception("Failed to place the block sign")

    # Finds and kills all the existing instances of edit monitor.
    for pid in self._find_all_instances_pids():
      logging.info(
          "Found existing edit monitor instance with pid %d, killing...", pid
      )
      try:
        self._terminate_process(pid)
      except Exception:
        logging.exception("Failed to terminate process %d", pid)

  def _stop_any_existing_instance(self):
    """Terminates the instance recorded in the pidfile, if there is one."""
    if not self.pid_file_path.exists():
      logging.debug("No existing instances.")
      return

    ex_pid = self._read_pid_from_pidfile()

    if ex_pid:
      logging.info("Found another instance with pid %d.", ex_pid)
      self._terminate_process(ex_pid)
    self._remove_pidfile()

  def _read_pid_from_pidfile(self) -> int:
    """Returns the pid stored in the pidfile.

    Raises:
      ValueError: If the file content is not an integer.
    """
    with open(self.pid_file_path, "r") as f:
      return int(f.read().strip())

  def _write_pid_to_pidfile(self):
    """Creates a pidfile and writes the current pid to the file.

    Raises:
      FileExistsError: If the pidfile already exists.
    """
    try:
      # Use the 'x' mode to open the file for exclusive creation
      with open(self.pid_file_path, "x") as f:
        f.write(f"{self.pid}")
    except FileExistsError:
      # This could be caused due to race condition that a user is trying
      # to start two edit monitors at the same time. Or because there is
      # already an existing edit monitor running and we can not kill it
      # for some reason.
      logging.exception("pidfile %s already exists.", self.pid_file_path)
      raise

  def _start_daemon_process(self):
    """Starts a subprocess to run the daemon."""
    p = multiprocessing.Process(
        target=self.daemon_target, args=self.daemon_args
    )
    p.daemon = True
    p.start()

    logging.info("Start subprocess with PID %d", p.pid)
    self.daemon_process = p

  def _terminate_process(
      self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
  ):
    """Terminates a process with given pid.

    It first sends a SIGTERM to the process to allow it for proper
    termination with a timeout. If the process is not terminated within
    the timeout, kills it forcefully.

    Args:
      pid: Pid of the process to terminate. Non-positive values are ignored.
      timeout: Seconds to wait for the process to exit after SIGTERM.
    """
    if pid <= 0:
      # os.kill() interprets 0 and negative pids as process groups or as
      # "every process", so a bogus value read from a pidfile must never
      # reach it.
      logging.warning("Refusing to terminate invalid pid %d", pid)
      return

    try:
      os.kill(pid, signal.SIGTERM)
      if not self._wait_for_process_terminate(pid, timeout):
        logging.warning(
            "Process %d not terminated within timeout, try force kill", pid
        )
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
      logging.info("Process with PID %d not found (already terminated)", pid)

  def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
    """Waits up to `timeout` seconds for the process to exit.

    Returns:
      True if the process is gone (or a zombie), False if it is still alive
      once the timeout has elapsed.
    """
    start_time = time.time()

    while time.time() < start_time + timeout:
      if not self._is_process_alive(pid):
        return True
      time.sleep(1)

    # The process may have exited during the last sleep; check once more
    # before reporting a timeout.
    if not self._is_process_alive(pid):
      return True

    logging.error("Process %d not terminated within %d seconds.", pid, timeout)
    return False

  def _is_process_alive(self, pid: int) -> bool:
    """Returns whether `ps` reports the process in a non-zombie state.

    Returns True when the state cannot be determined, so callers err on the
    side of treating the process as still running.
    """
    try:
      output = subprocess.check_output(
          ["ps", "-p", str(pid), "-o", "state="], text=True
      ).strip()
      state = output.split()[0]
      return state != "Z"  # Check if the state is not 'Z' (zombie)
    except subprocess.CalledProcessError:
      # Process not found (already dead).
      return False
    except (FileNotFoundError, OSError, ValueError, IndexError) as e:
      # IndexError covers `ps` succeeding with empty output.
      logging.warning(
          "Unable to check the status for process %d with error: %s.", pid, e
      )
      return True

  def _remove_pidfile(self):
    """Deletes the pidfile; a missing file is not an error."""
    try:
      os.remove(self.pid_file_path)
    except FileNotFoundError:
      logging.info("pid file %s already removed.", self.pid_file_path)

  def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
    """Generates the path to store the pidfile.

    The file path should have the format of "/tmp/edit_monitor/xxxx.lock"
    where xxxx is a hashed value based on the binary path that starts the
    process.
    """
    hash_object = hashlib.sha256()
    hash_object.update(self.binary_path.encode("utf-8"))
    pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock")
    logging.info("pid_file_path: %s", pid_file_path)

    return pid_file_path

  def _read_proc_stat(self, pid: int) -> list[str]:
    """Returns the fields of /proc/<pid>/stat, indexed by field number - 1.

    The second field (comm) is wrapped in parentheses and may itself contain
    spaces or parentheses, so the line is split around the last ')' instead
    of on whitespace alone.
    """
    with open(f"/proc/{pid}/stat", "r") as f:
      stat_line = f.read()

    head, sep, tail = stat_line.rpartition(")")
    if not sep:
      # Unexpected layout; fall back to plain whitespace splitting.
      return stat_line.split()
    pid_field, _, comm = head.partition(" (")
    return [pid_field, comm, *tail.split()]

  def _get_process_memory_percent(self, pid: int) -> float:
    """Returns the resident memory of `pid` as a fraction of total memory.

    Despite the name the value is a ratio (0.02 means 2%), matching the unit
    of the memory threshold used by `monitor_daemon`.
    """
    stat_data = self._read_proc_stat(pid)
    # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
    rss_pages = int(stat_data[23])
    # Use the real page size rather than assuming 4 KiB, so the value stays
    # consistent with how total_memory_size is computed.
    process_memory = rss_pages * os.sysconf("SC_PAGE_SIZE")

    return (
        process_memory / self.total_memory_size
        if self.total_memory_size
        else 0.0
    )

  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
    """Returns the CPU usage of `pid` over `interval` seconds, in percent.

    The value is CPU seconds consumed divided by elapsed uptime, times 100,
    so it can exceed 100 for a process busy on several cores.
    """
    total_start_time = self._get_total_cpu_time(pid)
    with open("/proc/uptime", "r") as f:
      uptime_start = float(f.readline().split()[0])

    time.sleep(interval)

    total_end_time = self._get_total_cpu_time(pid)
    with open("/proc/uptime", "r") as f:
      uptime_end = float(f.readline().split()[0])

    elapsed = uptime_end - uptime_start
    if elapsed <= 0:
      # No measurable time passed (e.g. interval of 0); avoid dividing by 0.
      return 0.0

    return (total_end_time - total_start_time) / elapsed * 100

  def _get_total_cpu_time(self, pid: int) -> float:
    """Returns user + system CPU time consumed by `pid`, in seconds."""
    stats = self._read_proc_stat(pid)
    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
    utime = int(stats[13])
    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
    stime = int(stats[14])
    return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"])

  def _find_all_instances_pids(self) -> list[int]:
    """Returns the pids stored in every ".lock" file of the pidfile directory.

    Files that cannot be read or do not contain an integer are logged and
    skipped.
    """
    pids = []

    for file in os.listdir(self.pid_file_path.parent):
      if file.endswith(".lock"):
        try:
          with open(self.pid_file_path.parent.joinpath(file), "r") as f:
            pids.append(int(f.read().strip()))
        except (OSError, ValueError):
          logging.exception("Failed to get pid from file path: %s", file)

    return pids

  def _handle_resource_exhausted_error(self, resource_type: str):
    """Reports the exceeded resource to Clearcut and kills the daemon.

    Args:
      resource_type: "memory" for a memory overrun; any other value is
        reported as a CPU overrun.
    """
    if resource_type == "memory":
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE
      )
    else:
      self._send_error_event_to_clearcut(
          edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE
      )
    logging.error(
        "Daemon process is consuming too much %s, killing...", resource_type
    )
    self._terminate_process(self.daemon_process.pid)

  def _send_error_event_to_clearcut(self, error_type):
    """Logs an edit monitor error event of `error_type` to Clearcut.

    The event carries the user name, host name and source root captured when
    the manager was created.
    """
    edit_monitor_error_event_proto = edit_event_pb2.EditEvent(
        user_name=self.user_name,
        host_name=self.host_name,
        source_root=self.source_root,
    )
    edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom(
        edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type)
    )
    log_event = clientanalytics_pb2.LogEvent(
        event_time_ms=int(time.time() * 1000),
        source_extension=edit_monitor_error_event_proto.SerializeToString(),
    )
    self.cclient.log(log_event)