blob: 445d849a495755b1f49286cf625360c184b435d6 [file] [log] [blame]
Zhuoyao Zhang53359552024-09-16 23:58:11 +00001# Copyright 2024, The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15
16import hashlib
17import logging
18import multiprocessing
19import os
20import pathlib
21import signal
22import subprocess
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000023import sys
Zhuoyao Zhang53359552024-09-16 23:58:11 +000024import tempfile
25import time
26
27
# Default number of seconds DaemonManager._terminate_process waits after
# sending SIGTERM before it falls back to SIGKILL.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
# Default `interval` of DaemonManager.monitor_daemon: seconds slept between two
# rounds of resource usage checks.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# Default `memory_threshold` of DaemonManager.monitor_daemon. It is compared
# with the daemon's resident memory, which the manager computes in MB.
DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
# Default `cpu_threshold` of DaemonManager.monitor_daemon. It is compared with
# the daemon's CPU time per elapsed uptime, expressed as a percentage.
DEFAULT_CPU_USAGE_THRESHOLD = 200
# Default `reboot_timeout` of DaemonManager.monitor_daemon: seconds of
# monitoring after which the manager reboots itself (24 hours).
DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
# Name of the file, placed in the system temp directory, whose presence makes
# DaemonManager.start() return without starting anything.
BLOCK_SIGN_FILE = "edit_monitor_block_sign"
Zhuoyao Zhang53359552024-09-16 23:58:11 +000034
35
def default_daemon_target():
  """Default daemon entry point, used when no target is supplied.

  This is only a placeholder: it prints a fixed message to stdout and returns.
  """
  print("default daemon target")
39
40
class DaemonManager:
  """Manages and monitors a daemon that runs as a subprocess.

  A pidfile whose name is derived from the binary path records the pid of the
  running manager, so that a newly started manager can stop the previous one.
  A "block sign" file in the system temp directory prevents start() from
  launching anything at all.
  """

  # Seconds between two liveness checks while waiting for a process to exit.
  _TERMINATION_POLL_INTERVAL_SECONDS = 0.1

  def __init__(
      self,
      binary_path: str,
      daemon_target: callable = default_daemon_target,
      daemon_args: tuple = (),
  ):
    """Initializes the manager without starting the daemon.

    Creates the pidfile directory under the system temp directory if needed.

    Args:
      binary_path: Path of the binary behind this process. It is hashed to
        derive the pidfile name and is re-executed by reboot().
      daemon_target: Callable that is run in the daemon subprocess.
      daemon_args: Positional arguments passed to daemon_target.
    """
    self.binary_path = binary_path
    self.daemon_target = daemon_target
    self.daemon_args = daemon_args

    self.pid = os.getpid()
    self.daemon_process = None

    # Highest values observed so far by monitor_daemon().
    self.max_memory_usage = 0
    self.max_cpu_usage = 0

    tmp_dir = pathlib.Path(tempfile.gettempdir())
    pid_file_dir = tmp_dir.joinpath("edit_monitor")
    pid_file_dir.mkdir(parents=True, exist_ok=True)
    self.pid_file_path = self._get_pid_file_path(pid_file_dir)
    self.block_sign = tmp_dir.joinpath(BLOCK_SIGN_FILE)

  def start(self):
    """Writes the pidfile and starts the daemon process.

    Returns without doing anything if the block sign file exists. Otherwise
    any instance recorded in the pidfile is stopped first.

    Raises:
      FileExistsError: if the pidfile still exists when it is to be created.
    """
    if self.block_sign.exists():
      logging.warning("Block sign found, exiting...")
      return

    self._stop_any_existing_instance()
    self._write_pid_to_pidfile()
    self._start_daemon_process()

  def monitor_daemon(
      self,
      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
      reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
  ):
    """Monitors the daemon process status.

    Periodically checks the CPU/memory usage of the daemon process as long as
    the process is still running and kills the process if the highest usage
    observed so far reaches one of the given thresholds. Returns immediately
    if no daemon process has been started.

    Args:
      interval: Seconds to sleep between two rounds of checks.
      memory_threshold: Memory limit, in the unit returned by
        _get_process_memory_percent (MB).
      cpu_threshold: CPU limit, in the unit returned by
        _get_process_cpu_percent (percent).
      reboot_timeout: Seconds after which reboot() is called.
    """
    if not self.daemon_process:
      return

    daemon_pid = self.daemon_process.pid
    logging.info("start monitoring daemon process %d.", daemon_pid)
    reboot_time = time.time() + reboot_timeout
    while self.daemon_process.is_alive():
      if time.time() > reboot_time:
        # reboot() does not return: it either replaces the process image or
        # exits.
        self.reboot()
      try:
        memory_usage = self._get_process_memory_percent(daemon_pid)
        self.max_memory_usage = max(self.max_memory_usage, memory_usage)

        cpu_usage = self._get_process_cpu_percent(daemon_pid)
        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
      except Exception as e:
        # Logging the error and continue.
        logging.warning("Failed to monitor daemon process with error: %s", e)

      # Sleep outside of the try block: if the measurement keeps failing the
      # loop must still be throttled instead of spinning and flooding the log.
      time.sleep(interval)

      if (
          self.max_memory_usage >= memory_threshold
          or self.max_cpu_usage >= cpu_threshold
      ):
        logging.error(
            "Daemon process is consuming too much resource, killing..."
        )
        self._terminate_process(daemon_pid)

    logging.info(
        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
        " usage: %f.",
        daemon_pid,
        self.max_memory_usage,
        self.max_cpu_usage,
    )

  def stop(self):
    """Stops the daemon process and removes the pidfile.

    Failures are logged and not propagated.
    """
    logging.debug("in daemon manager cleanup.")
    try:
      if self.daemon_process and self.daemon_process.is_alive():
        self._terminate_process(self.daemon_process.pid)
      self._remove_pidfile()
      logging.debug("Successfully stopped daemon manager.")
    except Exception as e:
      logging.exception("Failed to stop daemon manager with error %s", e)

  def reboot(self):
    """Reboots the current process.

    Stops the current daemon manager and reboots the entire process based on
    the binary file. Exits directly if the binary file no longer exists. This
    method does not return: the process image is replaced, or the process
    exits with status 0 (binary gone) or 1 (exec failed).
    """
    logging.debug("Rebooting process based on binary %s.", self.binary_path)

    # Stop the current daemon manager first.
    self.stop()

    # If the binary no longer exists, exit directly.
    if not os.path.exists(self.binary_path):
      logging.info("binary %s no longer exists, exiting.", self.binary_path)
      sys.exit(0)

    try:
      os.execv(self.binary_path, sys.argv)
    except OSError as e:
      logging.exception("Failed to reboot process with error: %s.", e)
      sys.exit(1)  # Indicate an error occurred

  def cleanup(self):
    """Wipes out all edit monitor instances in the system.

    Stops all the existing edit monitor instances and places a block sign
    to prevent any edit monitor process from starting. This method is only
    used in an emergency, when something goes wrong with the edit monitor and
    immediate cleanup is required to prevent damage to the system.
    """
    logging.debug("Start cleaning up all existing instances.")

    try:
      # First places a block sign to prevent any edit monitor process to start.
      self.block_sign.touch()
    except OSError:
      # Best effort: keep going and still kill the running instances.
      logging.exception("Failed to place the block sign")

    # Finds and kills all the existing instances of edit monitor.
    for pid in self._find_all_instances_pids():
      logging.info(
          "Found existing edit monitor instance with pid %d, killing...", pid
      )
      try:
        self._terminate_process(pid)
      except Exception:
        logging.exception("Failed to terminate process %d", pid)

  def _stop_any_existing_instance(self):
    """Terminates the instance recorded in the pidfile and removes the file.

    A pidfile that cannot be read or does not hold a positive pid is treated
    as stale and removed, so that it cannot block every later start().
    """
    if not self.pid_file_path.exists():
      logging.debug("No existing instances.")
      return

    try:
      ex_pid = self._read_pid_from_pidfile()
    except (OSError, ValueError):
      logging.warning(
          "No valid pid found in pidfile %s, removing it.", self.pid_file_path
      )
      ex_pid = None

    # A pidfile that holds our own pid is a leftover; never signal ourselves.
    if ex_pid is not None and ex_pid != self.pid:
      logging.info("Found another instance with pid %d.", ex_pid)
      self._terminate_process(ex_pid)
    self._remove_pidfile()

  @staticmethod
  def _parse_pid(text: str) -> int:
    """Converts the content of a pidfile to a pid.

    Raises:
      ValueError: if text is not an integer or the integer is not positive.
        Zero and negative values are rejected because os.kill() gives them a
        special meaning (process group / every process).
    """
    pid = int(text.strip())
    if pid <= 0:
      raise ValueError(f"invalid pid: {pid}")
    return pid

  def _read_pid_from_pidfile(self):
    """Returns the pid stored in the pidfile.

    Raises:
      OSError: if the pidfile cannot be read.
      ValueError: if the pidfile does not contain a positive integer.
    """
    with open(self.pid_file_path, "r") as f:
      return self._parse_pid(f.read())

  def _write_pid_to_pidfile(self):
    """Creates a pidfile and writes the current pid to the file.

    Raises:
      FileExistsError: if the pidfile already exists.
    """
    try:
      # Use the 'x' mode to open the file for exclusive creation
      with open(self.pid_file_path, "x") as f:
        f.write(f"{self.pid}")
    except FileExistsError:
      # This could be caused due to race condition that a user is trying
      # to start two edit monitors at the same time. Or because there is
      # already an existing edit monitor running and we can not kill it
      # for some reason.
      logging.exception("pidfile %s already exists.", self.pid_file_path)
      raise

  def _start_daemon_process(self):
    """Starts a subprocess to run the daemon."""
    p = multiprocessing.Process(
        target=self.daemon_target, args=self.daemon_args
    )
    p.start()

    logging.info("Start subprocess with PID %d", p.pid)
    self.daemon_process = p

  def _terminate_process(
      self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
  ):
    """Terminates a process with given pid.

    It first sends a SIGTERM to the process to allow it for proper
    termination with a timeout. If the process is not terminated within
    the timeout, kills it forcefully. A process that no longer exists is only
    logged.

    Args:
      pid: Pid of the process to terminate. Non-positive values are ignored.
      timeout: Seconds to wait for the process to exit after SIGTERM.
    """
    if pid <= 0:
      # os.kill() would signal a whole process group (0) or every process the
      # user may signal (-1) instead of a single process.
      logging.warning("Refusing to terminate invalid pid %d.", pid)
      return

    try:
      os.kill(pid, signal.SIGTERM)
      if not self._wait_for_process_terminate(pid, timeout):
        logging.warning(
            "Process %d not terminated within timeout, try force kill", pid
        )
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
      logging.info("Process with PID %d not found (already terminated)", pid)

  def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
    """Waits until the process is gone or the timeout expires.

    The process is polled more often than once per timeout and is always
    checked once more when the deadline is reached, so a process that exits
    shortly before the deadline is not reported as still running.

    Returns:
      True if the process terminated in time, False otherwise.
    """
    deadline = time.monotonic() + timeout

    while True:
      if not self._is_process_alive(pid):
        return True
      remaining = deadline - time.monotonic()
      if remaining <= 0:
        break
      time.sleep(min(self._TERMINATION_POLL_INTERVAL_SECONDS, remaining))

    logging.error("Process %d not terminated within %d seconds.", pid, timeout)
    return False

  def _is_process_alive(self, pid: int) -> bool:
    """Returns whether `ps` reports the process in a non-zombie state.

    If the state cannot be determined the process is assumed to be alive.
    """
    try:
      output = subprocess.check_output(
          ["ps", "-p", str(pid), "-o", "state="], text=True
      ).strip()
    except subprocess.CalledProcessError:
      # Process not found (already dead).
      return False
    except (OSError, ValueError) as e:
      logging.warning(
          "Unable to check the status for process %d with error: %s.", pid, e
      )
      return True

    if not output:
      logging.warning("Unable to check the status for process %d.", pid)
      return True
    # A zombie ('Z') has exited already and is only waiting to be reaped.
    return not output.split()[0].startswith("Z")

  def _remove_pidfile(self):
    """Removes the pidfile; a missing file is only logged."""
    try:
      os.remove(self.pid_file_path)
    except FileNotFoundError:
      logging.info("pid file %s already removed.", self.pid_file_path)

  def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
    """Generates the path to store the pidfile.

    The file path has the format "<pid_file_dir>/xxxx.lock" where xxxx is the
    SHA-256 hex digest of the binary path that starts the process.
    """
    digest = hashlib.sha256(self.binary_path.encode("utf-8")).hexdigest()
    pid_file_path = pid_file_dir.joinpath(digest + ".lock")
    logging.info("pid_file_path: %s", pid_file_path)

    return pid_file_path

  def _read_proc_stat_fields(self, pid: int) -> list[str]:
    """Reads /proc/<pid>/stat and returns its fields.

    The second field (comm) is wrapped in parentheses and may itself contain
    spaces or parentheses, so the content is split around the last ")" and
    not on whitespace alone. Index N-1 of the result holds field N.

    Raises:
      OSError: if the stat file cannot be read.
      ValueError: if the content does not have the expected format.
    """
    with open(f"/proc/{pid}/stat", "r") as f:
      stat_line = f.read()

    head, closing, tail = stat_line.rpartition(")")
    pid_field, opening, comm = head.partition("(")
    if not closing or not opening:
      raise ValueError(f"Unexpected format of /proc/{pid}/stat")
    return [pid_field.strip(), comm, *tail.split()]

  def _get_process_memory_percent(self, pid: int) -> float:
    """Returns the resident memory of the process.

    Despite the name, the value is the resident set size in MB, not a
    percentage.

    Raises:
      OSError, IndexError, ValueError: if the usage cannot be determined.
    """
    try:
      stat_fields = self._read_proc_stat_fields(pid)
      # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
      rss_pages = int(stat_fields[23])
      # Ask the kernel for the page size rather than assuming 4 KiB.
      page_size = os.sysconf("SC_PAGE_SIZE")
      return rss_pages * page_size / (1024 * 1024)  # Convert to MB
    except (OSError, IndexError, ValueError):
      logging.exception("Failed to get memory usage.")
      raise

  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
    """Returns the CPU usage of the process in percent.

    The usage is the CPU time consumed by the process between two samples
    taken `interval` seconds apart, divided by the system uptime elapsed
    between the samples. 0.0 is returned if no uptime elapsed.

    Raises:
      OSError, IndexError, ValueError: if the usage cannot be determined.
    """
    try:
      cpu_time_start = self._get_total_cpu_time(pid)
      uptime_start = self._get_system_uptime()

      time.sleep(interval)

      cpu_time_end = self._get_total_cpu_time(pid)
      uptime_end = self._get_system_uptime()

      elapsed = uptime_end - uptime_start
      if elapsed <= 0:
        # Nothing measurable elapsed; avoid dividing by zero.
        return 0.0
      return (cpu_time_end - cpu_time_start) / elapsed * 100
    except (OSError, IndexError, ValueError):
      logging.exception("Failed to get CPU usage.")
      raise

  def _get_system_uptime(self) -> float:
    """Returns the first value of /proc/uptime (system uptime in seconds)."""
    with open("/proc/uptime", "r") as f:
      return float(f.readline().split()[0])

  def _get_total_cpu_time(self, pid: int) -> float:
    """Returns the user plus system CPU time of the process in seconds."""
    stats = self._read_proc_stat_fields(pid)
    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
    utime = int(stats[13])
    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
    stime = int(stats[14])
    return (utime + stime) / os.sysconf("SC_CLK_TCK")

  def _find_all_instances_pids(self) -> list[int]:
    """Returns the pids stored in all "*.lock" files of the pidfile directory.

    Files that cannot be read or do not hold a positive pid are logged and
    skipped.
    """
    pids = []

    for lock_path in self.pid_file_path.parent.iterdir():
      if not lock_path.name.endswith(".lock"):
        continue
      try:
        with open(lock_path, "r") as f:
          pids.append(self._parse_pid(f.read()))
      except (OSError, ValueError):
        logging.exception(
            "Failed to get pid from file path: %s", lock_path.name
        )

    return pids