# Copyright 2024, The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from collections.abc import Callable
import hashlib
import logging
import multiprocessing
import os
import pathlib
import signal
import subprocess
import sys
import tempfile
import time


# Seconds to wait after SIGTERM before a process is killed with SIGKILL.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
# Seconds to sleep between two resource usage samples of the daemon.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# Memory limit for the daemon; compared against its resident memory in MB.
DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
# CPU limit for the daemon, in percent of one core (CPU time / wall time).
DEFAULT_CPU_USAGE_THRESHOLD = 200
# Seconds (24 hours) the monitor runs before it reboots the whole process.
DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
# File name, inside the temp directory, of the sign that blocks any start.
BLOCK_SIGN_FILE = "edit_monitor_block_sign"


def default_daemon_target() -> None:
  """Placeholder daemon target that only prints a fixed line to stdout."""
  print("default daemon target")


class DaemonManager:
  """Class to manage and monitor the daemon run as a subprocess.

  The manager records its own pid in a pidfile whose name is derived from the
  binary path, runs the daemon target in a `multiprocessing.Process`, and can
  monitor, stop, reboot or wipe out edit monitor instances.
  """

  def __init__(
      self,
      binary_path: str,
      daemon_target: Callable[..., object] = default_daemon_target,
      daemon_args: tuple = (),
  ):
    """Initializes the manager without starting the daemon.

    Creates the "edit_monitor" pidfile directory under the system temp
    directory when it does not exist yet.

    Args:
      binary_path: Path of the binary behind this process. It determines the
        pidfile name and is the binary executed again by `reboot`.
      daemon_target: Callable executed in the daemon subprocess.
      daemon_args: Positional arguments passed to `daemon_target`.
    """
    self.binary_path = binary_path
    self.daemon_target = daemon_target
    self.daemon_args = daemon_args

    self.pid = os.getpid()
    self.daemon_process = None

    # Highest values sampled so far by `monitor_daemon`.
    self.max_memory_usage = 0
    self.max_cpu_usage = 0

    tmp_dir = pathlib.Path(tempfile.gettempdir())
    pid_file_dir = tmp_dir.joinpath("edit_monitor")
    pid_file_dir.mkdir(parents=True, exist_ok=True)
    self.pid_file_path = self._get_pid_file_path(pid_file_dir)
    self.block_sign = tmp_dir.joinpath(BLOCK_SIGN_FILE)

  def start(self):
    """Writes the pidfile and starts the daemon process.

    Does nothing when the block sign exists or when the binary lives under
    "/google/cog/". Otherwise any instance recorded in the pidfile is stopped
    before the new pidfile is written.

    Raises:
      FileExistsError: If the pidfile still exists when it is written.
    """
    if self.block_sign.exists():
      logging.warning("Block sign found, exiting...")
      return

    if self.binary_path.startswith("/google/cog/"):
      logging.warning("Edit monitor for cog is not supported, exiting...")
      return

    self._stop_any_existing_instance()
    self._write_pid_to_pidfile()
    self._start_daemon_process()

  def monitor_daemon(
      self,
      interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
      memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
      cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
      reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
  ):
    """Monitors the daemon process status.

    Periodically checks the CPU/Memory usage of the daemon process as long as
    the process is still running and kills the process if the maximum observed
    usage reaches the given thresholds. Returns immediately when no daemon
    process was started.

    Args:
      interval: Seconds to sleep between two samples.
      memory_threshold: Memory limit, in the unit returned by
        `_get_process_memory_percent` (MB).
      cpu_threshold: CPU limit in percent, as returned by
        `_get_process_cpu_percent`.
      reboot_timeout: Seconds after which `reboot` is called.
    """
    if not self.daemon_process:
      return

    logging.info("start monitoring daemon process %d.", self.daemon_process.pid)
    reboot_time = time.time() + reboot_timeout
    while self.daemon_process.is_alive():
      if time.time() > reboot_time:
        self.reboot()
      try:
        memory_usage = self._get_process_memory_percent(self.daemon_process.pid)
        self.max_memory_usage = max(self.max_memory_usage, memory_usage)

        cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid)
        self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
      except Exception as e:
        # Logging the error and continue.
        logging.warning("Failed to monitor daemon process with error: %s", e)

      # Sleep outside of the try block so that a persistent sampling failure
      # does not turn this loop into a busy loop.
      time.sleep(interval)

      if (
          self.max_memory_usage >= memory_threshold
          or self.max_cpu_usage >= cpu_threshold
      ):
        logging.error(
            "Daemon process is consuming too much resource, killing..."
        )
        self._terminate_process(self.daemon_process.pid)

    logging.info(
        "Daemon process %d terminated. Max memory usage: %f, Max cpu"
        " usage: %f.",
        self.daemon_process.pid,
        self.max_memory_usage,
        self.max_cpu_usage,
    )

  def stop(self):
    """Stops the daemon process and removes the pidfile.

    Any exception raised on the way is logged instead of propagated.
    """
    logging.debug("in daemon manager cleanup.")
    try:
      if self.daemon_process:
        # The daemon process might already in termination process,
        # wait some time before kill it explicitly.
        self._wait_for_process_terminate(self.daemon_process.pid, 1)
        if self.daemon_process.is_alive():
          self._terminate_process(self.daemon_process.pid)
      self._remove_pidfile()
      logging.debug("Successfully stopped daemon manager.")
    except Exception as e:
      logging.exception("Failed to stop daemon manager with error %s", e)

  def reboot(self):
    """Reboots the current process.

    Stops the current daemon manager and reboots the entire process based on
    the binary file. Exits directly if the binary file no longer exists.
    This method does not return: the process image is replaced or the process
    exits (status 0 when the binary is gone, 1 when the exec fails).
    """
    logging.debug("Rebooting process based on binary %s.", self.binary_path)

    # Stop the current daemon manager first.
    self.stop()

    # If the binary no longer exists, exit directly.
    if not os.path.exists(self.binary_path):
      logging.info("binary %s no longer exists, exiting.", self.binary_path)
      sys.exit(0)

    try:
      os.execv(self.binary_path, sys.argv)
    except OSError as e:
      logging.exception("Failed to reboot process with error: %s.", e)
      sys.exit(1)  # Indicate an error occurred

  def cleanup(self):
    """Wipes out all edit monitor instances in the system.

    Stops all the existing edit monitor instances and places a block sign
    to prevent any edit monitor process from starting. This method is only
    used in emergency cases when something goes wrong with the edit monitor
    and immediate cleanup is required to prevent damage to the system.
    Failures are logged and do not abort the cleanup.
    """
    logging.debug("Start cleaning up all existing instances.")

    try:
      # First places a block sign to prevent any edit monitor process to start.
      self.block_sign.touch()
    except OSError:
      # FileNotFoundError and PermissionError are subclasses of OSError.
      logging.exception("Failed to place the block sign")

    # Finds and kills all the existing instances of edit monitor.
    for pid in self._find_all_instances_pids():
      logging.info(
          "Found existing edit monitor instance with pid %d, killing...", pid
      )
      try:
        self._terminate_process(pid)
      except Exception:
        logging.exception("Failed to terminate process %d", pid)

  def _stop_any_existing_instance(self):
    """Terminates the instance recorded in the pidfile, if there is one.

    A pidfile whose content is not a pid cannot identify any process; it is
    treated as stale and removed so that it does not block every later start.
    """
    try:
      ex_pid = self._read_pid_from_pidfile()
    except FileNotFoundError:
      # Reading directly (instead of checking for existence first) also covers
      # the pidfile being removed by another process in between.
      logging.debug("No existing instances.")
      return
    except ValueError:
      logging.warning(
          "Ignoring pidfile %s with invalid content.", self.pid_file_path
      )
      ex_pid = None

    if ex_pid:
      logging.info("Found another instance with pid %d.", ex_pid)
      self._terminate_process(ex_pid)
    self._remove_pidfile()

  def _read_pid_from_pidfile(self) -> int:
    """Returns the pid stored in the pidfile.

    Raises:
      FileNotFoundError: If the pidfile does not exist.
      ValueError: If the pidfile content is not an integer.
    """
    with open(self.pid_file_path, "r") as f:
      return int(f.read().strip())

  def _write_pid_to_pidfile(self):
    """Creates a pidfile and writes the current pid to the file.

    Raises:
      FileExistsError: If the pidfile already exists.
    """
    try:
      # Use the 'x' mode to open the file for exclusive creation
      with open(self.pid_file_path, "x") as f:
        f.write(f"{self.pid}")
    except FileExistsError:
      # This could be caused due to race condition that a user is trying
      # to start two edit monitors at the same time. Or because there is
      # already an existing edit monitor running and we can not kill it
      # for some reason.
      logging.exception("pidfile %s already exists.", self.pid_file_path)
      raise

  def _start_daemon_process(self):
    """Starts a subprocess to run the daemon."""
    p = multiprocessing.Process(
        target=self.daemon_target, args=self.daemon_args
    )
    p.daemon = True
    p.start()

    logging.info("Start subprocess with PID %d", p.pid)
    self.daemon_process = p

  def _terminate_process(
      self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
  ):
    """Terminates a process with given pid.

    It first sends a SIGTERM to the process to allow it for proper
    termination with a timeout. If the process is not terminated within
    the timeout, kills it forcefully. A process that no longer exists is
    only logged.

    Args:
      pid: Pid of the process to terminate. Must be positive.
      timeout: Seconds to wait for the process to exit after SIGTERM.
    """
    if pid <= 0:
      # A pid of 0 or below makes kill() address a process group or every
      # process this user may signal, never a single edit monitor instance.
      logging.error("Refusing to terminate invalid pid %d.", pid)
      return

    try:
      os.kill(pid, signal.SIGTERM)
      if not self._wait_for_process_terminate(pid, timeout):
        logging.warning(
            "Process %d not terminated within timeout, try force kill", pid
        )
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
      logging.info("Process with PID %d not found (already terminated)", pid)

  def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
    """Waits until the process is gone or the timeout expires.

    The process is checked immediately, then at most once per second, and one
    last time when the timeout is reached, so a process that exits during the
    final sleep is still reported as terminated.

    Args:
      pid: Pid of the process to wait for.
      timeout: Maximum number of seconds to wait.

    Returns:
      True if the process terminated within the timeout, False otherwise.
    """
    deadline = time.monotonic() + timeout
    while True:
      if not self._is_process_alive(pid):
        return True
      remaining = deadline - time.monotonic()
      if remaining <= 0:
        break
      # Never sleep past the deadline.
      time.sleep(min(1, remaining))

    logging.error("Process %d not terminated within %d seconds.", pid, timeout)
    return False

  def _is_process_alive(self, pid: int) -> bool:
    """Returns whether `ps` reports the process in a non-zombie state.

    When the state cannot be determined the process is assumed to be alive.
    """
    try:
      output = subprocess.check_output(
          ["ps", "-p", str(pid), "-o", "state="], text=True
      ).strip()
      state = output.split()[0]
      return state != "Z"  # Check if the state is not 'Z' (zombie)
    except subprocess.CalledProcessError:
      # Process not found (already dead).
      return False
    except (OSError, ValueError, IndexError) as e:
      # IndexError: ps succeeded but printed no state at all.
      logging.warning(
          "Unable to check the status for process %d with error: %s.", pid, e
      )
      return True

  def _remove_pidfile(self):
    """Removes the pidfile; a pidfile that is already gone is only logged."""
    try:
      os.remove(self.pid_file_path)
    except FileNotFoundError:
      logging.info("pid file %s already removed.", self.pid_file_path)

  def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
    """Generates the path to store the pidfile.

    The file path has the format "<pid_file_dir>/xxxx.lock" where xxxx is the
    SHA-256 hex digest of the binary path that starts the process.
    """
    digest = hashlib.sha256(self.binary_path.encode("utf-8")).hexdigest()
    pid_file_path = pid_file_dir.joinpath(digest + ".lock")
    logging.info("pid_file_path: %s", pid_file_path)

    return pid_file_path

  @staticmethod
  def _parse_stat_fields(stat_line: str) -> list[str]:
    """Splits one /proc/[pid]/stat line into its fields.

    The second field (comm) is wrapped in parentheses and may itself contain
    spaces or parentheses, so it is cut out at the last ")" instead of relying
    on a plain whitespace split.

    Returns:
      The fields in order, so that index i holds field number i + 1.

    Raises:
      ValueError: If the line contains no ")" at all.
    """
    head, closing, tail = stat_line.rpartition(")")
    if not closing:
      raise ValueError(f"Malformed stat line: {stat_line!r}")
    pid_field, _, comm = head.partition("(")
    return [pid_field.strip(), f"({comm})", *tail.split()]

  @staticmethod
  def _read_uptime() -> float:
    """Returns the first value of /proc/uptime (system uptime in seconds)."""
    with open("/proc/uptime", "r") as f:
      return float(f.readline().split()[0])

  def _get_process_memory_percent(self, pid: int) -> float:
    """Returns the resident memory of the process in MB.

    Despite the method name, the value is an absolute size and not a
    percentage.

    Raises:
      OSError, IndexError, ValueError: If the stat file cannot be read or
        parsed.
    """
    try:
      with open(f"/proc/{pid}/stat", "r") as f:
        stat_data = self._parse_stat_fields(f.readline())
      # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
      rss_pages = int(stat_data[23])
      page_size = os.sysconf("SC_PAGE_SIZE")
      return rss_pages * page_size / (1024 * 1024)  # Convert to MB
    except (OSError, IndexError, ValueError):
      logging.exception("Failed to get memory usage.")
      raise

  def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
    """Returns the CPU usage of the process over `interval` seconds.

    The value is the CPU time consumed by the process divided by the elapsed
    uptime, times 100, so a process that keeps two cores busy yields about
    200. Blocks for `interval` seconds.

    Raises:
      OSError, IndexError, ValueError: If the /proc files cannot be read or
        parsed.
    """
    try:
      total_start_time = self._get_total_cpu_time(pid)
      uptime_start = self._read_uptime()

      time.sleep(interval)

      total_end_time = self._get_total_cpu_time(pid)
      uptime_end = self._read_uptime()
    except (OSError, IndexError, ValueError):
      logging.exception("Failed to get CPU usage.")
      raise

    elapsed = uptime_end - uptime_start
    if elapsed <= 0:
      # No measurable time passed (e.g. interval of 0); avoid dividing by 0.
      return 0.0
    return (total_end_time - total_start_time) / elapsed * 100

  def _get_total_cpu_time(self, pid: int) -> float:
    """Returns the user plus system CPU time of the process in seconds."""
    with open(f"/proc/{pid}/stat", "r") as f:
      stats = self._parse_stat_fields(f.readline())
    # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
    utime = int(stats[13])
    # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
    stime = int(stats[14])
    return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"])

  def _find_all_instances_pids(self) -> list[int]:
    """Returns the pids stored in all ".lock" files of the pidfile directory.

    Files that cannot be read or do not contain an integer are logged and
    skipped.
    """
    pids = []
    pid_file_dir = self.pid_file_path.parent

    for file in os.listdir(pid_file_dir):
      if not file.endswith(".lock"):
        continue
      try:
        with open(pid_file_dir.joinpath(file), "r") as f:
          pids.append(int(f.read().strip()))
      except (OSError, ValueError, TypeError):
        logging.exception("Failed to get pid from file path: %s", file)

    return pids