# Copyright 2024, The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
15
import hashlib
import logging
import multiprocessing
import os
import pathlib
import signal
import subprocess
import sys
import tempfile
import time
import typing
26
27
# Seconds a process is given to exit after SIGTERM before it is force killed.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
# Seconds slept between two consecutive resource-usage samples of the daemon.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# Limit compared against the sampled resident memory, which is computed in MB.
DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
# Limit compared against CPU time consumed per elapsed second, times 100
# (so 200 presumably corresponds to two fully busy cores).
DEFAULT_CPU_USAGE_THRESHOLD = 200
# Seconds after which the monitoring loop reboots the whole process (24 hours).
DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
Zhuoyao Zhang53359552024-09-16 23:58:11 +000033
34
def default_daemon_target() -> None:
  """Placeholder daemon body, used when no real target is supplied.

  It only prints a fixed marker line so that a manager constructed without an
  explicit target still has something to run in the subprocess.
  """
  print("default daemon target")
38
39
class DaemonManager:
  """Manages and monitors a daemon that runs as a child process.

  A pidfile, named after a hash of `binary_path` and stored under
  "<tempdir>/edit_monitor", records the pid of the managing process so that a
  newly started manager can find and terminate a previous instance.

  Attributes:
    binary_path: Path of the binary that started this process. It is hashed
      to name the pidfile and re-executed by `reboot`.
    daemon_target: Callable executed inside the daemon subprocess.
    daemon_args: Positional arguments passed to `daemon_target`.
    pid: Pid of the current (managing) process.
    daemon_process: The `multiprocessing.Process` running the daemon, or None
      while it has not been started.
    max_memory_usage: Highest memory usage sampled so far, in MB.
    max_cpu_usage: Highest CPU usage sampled so far, in percent.
    pid_file_path: Location of the pidfile.
  """

  # Seconds between liveness polls while waiting for a signalled process to
  # exit. Kept well below the default termination timeout so that a graceful
  # exit is actually observed before escalating to SIGKILL.
  _TERMINATION_POLL_INTERVAL_SECONDS = 0.1

  def __init__(
      self,
      binary_path: str,
      daemon_target: typing.Callable[..., object] = default_daemon_target,
      daemon_args: tuple = (),
  ):
    """Records the configuration and prepares the pidfile location.

    Creates the "<tempdir>/edit_monitor" directory if it does not exist yet.

    Args:
      binary_path: Path of the binary that started this process.
      daemon_target: Callable to run in the daemon subprocess.
      daemon_args: Positional arguments for `daemon_target`.
    """
    self.binary_path = binary_path
    self.daemon_target = daemon_target
    self.daemon_args = daemon_args

    self.pid = os.getpid()
    self.daemon_process = None

    self.max_memory_usage = 0
    self.max_cpu_usage = 0

    pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor")
    pid_file_dir.mkdir(parents=True, exist_ok=True)
    self.pid_file_path = self._get_pid_file_path(pid_file_dir)

  def start(self):
    """Stops any previous instance, writes the pidfile, starts the daemon.

    Failures are logged instead of raised, so callers must not assume that
    the daemon is running once this method returns.
    """
    try:
      self._stop_any_existing_instance()
      self._write_pid_to_pidfile()
      self._start_daemon_process()
    except Exception as e:
      logging.exception("Failed to start daemon manager with error %s", e)

  def monitor_daemon(
      self,
      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
      reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
  ):
    """Monitors the daemon process until it exits.

    Periodically samples the CPU/memory usage of the daemon process as long
    as it is alive, and kills it once the highest observed usage reaches one
    of the given thresholds. When `reboot_timeout` has elapsed, the whole
    process is rebooted through `reboot`, which does not return.

    Args:
      interval: Seconds to sleep between two samples.
      memory_threshold: Memory usage limit, in MB.
      cpu_threshold: CPU usage limit, in percent.
      reboot_timeout: Seconds after which the process reboots itself.
    """
    if self.daemon_process is None:
      # start() swallows its errors, so the daemon may never have been created.
      logging.error("No daemon process to monitor, was it started?")
      return

    daemon_pid = self.daemon_process.pid
    logging.info("start monitoring daemon process %d.", daemon_pid)
    # A monotonic deadline is immune to wall-clock adjustments.
    reboot_deadline = time.monotonic() + reboot_timeout
    while self.daemon_process.is_alive():
      if time.monotonic() > reboot_deadline:
        self.reboot()
      try:
        memory_usage = self._get_process_memory_percent(daemon_pid)
        self.max_memory_usage = max(self.max_memory_usage, memory_usage)

        cpu_usage = self._get_process_cpu_percent(daemon_pid)
        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
      except Exception as e:
        # Logging the error and continue.
        logging.warning("Failed to monitor daemon process with error: %s", e)

      if (
          self.max_memory_usage >= memory_threshold
          or self.max_cpu_usage >= cpu_threshold
      ):
        logging.error(
            "Daemon process is consuming too much resource, killing..."
        )
        self._terminate_process(daemon_pid)
        # Re-check liveness right away instead of sleeping on a dead process.
        continue

      # Sleep outside of the try block so that a failing sample cannot turn
      # this loop into a busy loop.
      time.sleep(interval)

    logging.info(
        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
        " usage: %f.",
        daemon_pid,
        self.max_memory_usage,
        self.max_cpu_usage,
    )

  def stop(self):
    """Stops the daemon process and removes the pidfile.

    Failures are logged instead of raised.
    """
    logging.debug("in daemon manager cleanup.")
    try:
      if self.daemon_process and self.daemon_process.is_alive():
        self._terminate_process(self.daemon_process.pid)
      self._remove_pidfile()
      logging.debug("Successfully stopped daemon manager.")
    except Exception as e:
      logging.exception("Failed to stop daemon manager with error %s", e)

  def reboot(self):
    """Reboots the current process.

    Stops the current daemon manager and replaces the entire process with a
    fresh execution of the binary file, reusing `sys.argv`. Exits with status
    0 if the binary file no longer exists and with status 1 if the exec call
    fails, so this method never returns normally.
    """
    logging.debug("Rebooting process based on binary %s.", self.binary_path)

    # Stop the current daemon manager first.
    self.stop()

    # If the binary no longer exists, exit directly.
    if not os.path.exists(self.binary_path):
      logging.info("binary %s no longer exists, exiting.", self.binary_path)
      sys.exit(0)

    try:
      os.execv(self.binary_path, sys.argv)
    except OSError as e:
      logging.exception("Failed to reboot process with error: %s.", e)
      sys.exit(1)  # Indicate an error occurred

  def _stop_any_existing_instance(self):
    """Terminates the instance recorded in the pidfile and removes the file.

    A pidfile that cannot be parsed, or that holds a pid which must never be
    signalled (non-positive values address process groups or every process,
    and our own pid would make us kill ourselves), is treated as stale: it is
    removed without sending any signal.
    """
    if not self.pid_file_path.exists():
      logging.debug("No existing instances.")
      return

    try:
      ex_pid = self._read_pid_from_pidfile()
    except FileNotFoundError:
      # The pidfile vanished between the existence check and the read.
      logging.debug("No existing instances.")
      return
    except ValueError:
      logging.warning(
          "Ignoring pidfile %s with invalid content.", self.pid_file_path
      )
      ex_pid = None

    if ex_pid is not None:
      if ex_pid > 0 and ex_pid != self.pid:
        logging.info("Found another instance with pid %d.", ex_pid)
        self._terminate_process(ex_pid)
      else:
        logging.warning("Ignoring stale pid %d found in pidfile.", ex_pid)

    self._remove_pidfile()

  def _read_pid_from_pidfile(self):
    """Returns the pid stored in the pidfile.

    Raises:
      OSError: If the pidfile cannot be opened.
      ValueError: If the content is not an integer.
    """
    with open(self.pid_file_path, "r") as f:
      return int(f.read().strip())

  def _write_pid_to_pidfile(self):
    """Creates a pidfile and writes the current pid to the file.

    Raises:
      FileExistsError: If the pidfile already exists.
    """
    try:
      # Use the 'x' mode to open the file for exclusive creation
      with open(self.pid_file_path, "x") as f:
        f.write(f"{self.pid}")
    except FileExistsError:
      # This could be caused due to race condition that a user is trying
      # to start two edit monitors at the same time. Or because there is
      # already an existing edit monitor running and we can not kill it
      # for some reason.
      logging.exception("pidfile %s already exists.", self.pid_file_path)
      raise

  def _start_daemon_process(self):
    """Starts a subprocess to run the daemon."""
    p = multiprocessing.Process(
        target=self.daemon_target, args=self.daemon_args
    )
    p.start()

    logging.info("Start subprocess with PID %d", p.pid)
    self.daemon_process = p

  def _terminate_process(
      self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
  ):
    """Terminates a process with given pid.

    It first sends a SIGTERM to the process to allow it for proper
    termination with a timeout. If the process is not terminated within
    the timeout, kills it forcefully. A process that no longer exists is
    only logged; other `OSError`s from `os.kill` propagate to the caller.

    Args:
      pid: Pid of the process to terminate.
      timeout: Seconds to wait for a graceful exit before sending SIGKILL.
    """
    try:
      os.kill(pid, signal.SIGTERM)
      if not self._wait_for_process_terminate(pid, timeout):
        logging.warning(
            "Process %d not terminated within timeout, try force kill", pid
        )
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
      logging.info("Process with PID %d not found (already terminated)", pid)

  def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
    """Waits up to `timeout` seconds for the process to terminate.

    The process is polled at short intervals and checked one last time when
    the deadline is reached, so an exit that happens within the timeout is
    always noticed.

    Returns:
      True if the process terminated in time, False otherwise.
    """
    deadline = time.monotonic() + timeout

    while True:
      if not self._is_process_alive(pid):
        return True
      remaining = deadline - time.monotonic()
      if remaining <= 0:
        break
      time.sleep(min(self._TERMINATION_POLL_INTERVAL_SECONDS, remaining))

    logging.error("Process %d not terminated within %d seconds.", pid, timeout)
    return False

  def _is_process_alive(self, pid: int) -> bool:
    """Returns whether the process exists and is not a zombie.

    The state is queried through `ps`. If `ps` itself cannot be run, the
    process is conservatively reported as alive.
    """
    try:
      output = subprocess.check_output(
          ["ps", "-p", str(pid), "-o", "state="], text=True
      ).strip()
    except subprocess.CalledProcessError:
      # Process not found (already dead).
      return False
    except (OSError, ValueError) as e:
      logging.warning(
          "Unable to check the status for process %d with error: %s.", pid, e
      )
      return True

    if not output:
      # ps succeeded but listed nothing for this pid: the process is gone.
      return False
    # Some ps implementations append flags to the state (e.g. "Z+"), so only
    # the leading character identifies a zombie.
    return not output.split()[0].startswith("Z")

  def _remove_pidfile(self):
    """Removes the pidfile; a missing file is only logged."""
    try:
      os.remove(self.pid_file_path)
    except FileNotFoundError:
      logging.info("pid file %s already removed.", self.pid_file_path)

  def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
    """Generates the path to store the pidfile.

    The file path should have the format of "/tmp/edit_monitor/xxxx.lock"
    where xxxx is a hashed value based on the binary path that starts the
    process.
    """
    hash_object = hashlib.sha256()
    hash_object.update(self.binary_path.encode("utf-8"))
    pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock")
    logging.info("pid_file_path: %s", pid_file_path)

    return pid_file_path

  @staticmethod
  def _split_stat_fields(stat_line: str) -> list:
    """Splits one line of /proc/[pid]/stat into its fields.

    The second field (the command name) is wrapped in parentheses and may
    itself contain spaces and parentheses, so splitting the whole line on
    whitespace would shift every later field. The name is therefore isolated
    using the last ")" of the line and returned, without its parentheses, as
    the single element at index 1.
    """
    head, closing, tail = stat_line.rpartition(")")
    if not closing:
      # Unexpected format; fall back to a plain whitespace split.
      return stat_line.split()
    pid_field, _, comm = head.partition("(")
    return [pid_field.strip(), comm, *tail.split()]

  def _get_process_memory_percent(self, pid: int) -> float:
    """Returns the resident memory of the process.

    Despite the method name, the value is an absolute size in MB and not a
    percentage.

    Raises:
      OSError, IndexError, ValueError: If /proc/[pid]/stat cannot be read or
        parsed.
    """
    try:
      with open(f"/proc/{pid}/stat", "r") as f:
        stat_data = self._split_stat_fields(f.readline())
      # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
      rss_pages = int(stat_data[23])
      page_size_bytes = os.sysconf("SC_PAGE_SIZE")
      return rss_pages * page_size_bytes / (1024 * 1024)  # Convert to MB
    except (OSError, IndexError, ValueError):
      logging.exception("Failed to get memory usage.")
      raise

  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
    """Returns the CPU usage of the process over a short sampling window.

    The value is the CPU time consumed during the window divided by the
    elapsed time, times 100. This call blocks for `interval` seconds.

    Raises:
      OSError, IndexError, ValueError: If /proc/[pid]/stat cannot be read or
        parsed.
    """
    try:
      cpu_time_start = self._get_total_cpu_time(pid)
      wall_start = time.monotonic()

      time.sleep(interval)

      cpu_time_end = self._get_total_cpu_time(pid)
      elapsed = time.monotonic() - wall_start
      if elapsed <= 0:
        # No measurable time elapsed (e.g. interval of 0): avoid dividing by 0.
        return 0.0
      return (cpu_time_end - cpu_time_start) / elapsed * 100
    except (OSError, IndexError, ValueError):
      logging.exception("Failed to get CPU usage.")
      raise

  def _get_total_cpu_time(self, pid: int) -> float:
    """Returns the user plus system CPU time of the process, in seconds."""
    with open(f"/proc/{pid}/stat", "r") as f:
      stats = self._split_stat_fields(f.readline())
    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
    utime = int(stats[13])
    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
    stime = int(stats[14])
    return (utime + stime) / os.sysconf("SC_CLK_TCK")
302