blob: 892c292a718e14cb05fa3b8663823eabaf3ab331 [file] [log] [blame]
Zhuoyao Zhang53359552024-09-16 23:58:11 +00001# Copyright 2024, The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15
16import hashlib
17import logging
18import multiprocessing
19import os
20import pathlib
21import signal
22import subprocess
Zhuoyao Zhang205a2fc2024-09-20 18:19:59 +000023import sys
Zhuoyao Zhang53359552024-09-16 23:58:11 +000024import tempfile
25import time
26
27
# Seconds to wait after sending SIGTERM before a process is force-killed.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
# Seconds the monitor sleeps between two rounds of resource sampling.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# Memory ceiling for the daemon. It is compared against the resident memory
# figure produced by the monitor, which is expressed in MB.
DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
# CPU ceiling for the daemon, in percent (CPU seconds consumed per elapsed
# second, times 100).
DEFAULT_CPU_USAGE_THRESHOLD = 200
# Seconds after which the monitor reboots the whole process (24 hours).
DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24
# Name of the file, created in the system temp directory, whose presence
# prevents the daemon from being started.
BLOCK_SIGN_FILE = "edit_monitor_block_sign"
Zhuoyao Zhang53359552024-09-16 23:58:11 +000034
35
def default_daemon_target():
    """Stand-in daemon entry point that only announces itself on stdout."""
    message = "default daemon target"
    print(message)
39
40
class DaemonManager:
    """Class to manage and monitor the daemon run as a subprocess.

    A pidfile stored under ``<tempdir>/edit_monitor`` (its name is a hash of
    ``binary_path``) is used to find and stop a previous instance started from
    the same binary. A "block sign" file in the temp directory, once created
    by ``cleanup()``, makes ``start()`` a no-op.

    Attributes:
        binary_path: Path of the binary that started this process.
        daemon_target: Callable executed in the daemon subprocess.
        daemon_args: Positional arguments passed to ``daemon_target``.
        pid: Pid of the current (manager) process.
        daemon_process: The ``multiprocessing.Process`` running the daemon, or
            None while no daemon has been started.
        max_memory_usage: Highest memory usage (MB) observed by the monitor.
        max_cpu_usage: Highest CPU usage (percent) observed by the monitor.
        pid_file_path: Location of the pidfile for this binary.
        block_sign: Location of the block sign file.
    """

    # How often to re-check whether a signalled process has exited. Kept well
    # below the default termination timeout so that a process which exits
    # gracefully is noticed before the timeout expires.
    _TERMINATION_POLL_INTERVAL_SECONDS = 0.1

    def __init__(
        self,
        binary_path: str,
        daemon_target: callable = default_daemon_target,
        daemon_args: tuple = (),
    ):
        self.binary_path = binary_path
        self.daemon_target = daemon_target
        self.daemon_args = daemon_args

        self.pid = os.getpid()
        self.daemon_process = None

        self.max_memory_usage = 0
        self.max_cpu_usage = 0

        pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath(
            "edit_monitor"
        )
        pid_file_dir.mkdir(parents=True, exist_ok=True)
        self.pid_file_path = self._get_pid_file_path(pid_file_dir)
        self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath(
            BLOCK_SIGN_FILE
        )

    def start(self):
        """Writes the pidfile and starts the daemon process.

        Returns without doing anything when the block sign exists or when the
        binary lives under ``/google/cog/``.

        Raises:
            FileExistsError: If the pidfile still exists after the attempt to
                stop the previous instance.
        """
        if self.block_sign.exists():
            logging.warning("Block sign found, exiting...")
            return

        if self.binary_path.startswith("/google/cog/"):
            logging.warning("Edit monitor for cog is not supported, exiting...")
            return

        self._stop_any_existing_instance()
        self._write_pid_to_pidfile()
        self._start_daemon_process()

    def monitor_daemon(
        self,
        interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
        memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
        cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
        reboot_timeout: int = DEFAULT_REBOOT_TIMEOUT_SECONDS,
    ):
        """Monitors the daemon process status.

        Periodically checks the CPU/memory usage of the daemon process as long
        as the process is still running and kills the process if the resource
        usage is above the given thresholds. Returns immediately when no
        daemon has been started.

        Args:
            interval: Seconds to sleep between two monitoring rounds.
            memory_threshold: Memory usage (MB) at or above which the daemon
                is killed.
            cpu_threshold: CPU usage (percent) at or above which the daemon is
                killed.
            reboot_timeout: Seconds after which ``reboot()`` is invoked.
        """
        if not self.daemon_process:
            return

        logging.info(
            "start monitoring daemon process %d.", self.daemon_process.pid
        )
        reboot_time = time.time() + reboot_timeout
        while self.daemon_process.is_alive():
            if time.time() > reboot_time:
                self.reboot()
            try:
                memory_usage = self._get_process_memory_percent(
                    self.daemon_process.pid
                )
                self.max_memory_usage = max(self.max_memory_usage, memory_usage)

                cpu_usage = self._get_process_cpu_percent(
                    self.daemon_process.pid
                )
                self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)

                time.sleep(interval)
            except Exception as e:
                # Logging the error and continue.
                logging.warning(
                    "Failed to monitor daemon process with error: %s", e
                )
                # Back off before the next attempt so that a persistent
                # sampling failure cannot turn this loop into a busy loop.
                # Skip the wait when the failure was simply caused by the
                # daemon having exited, so the exit is reported right away.
                if self.daemon_process.is_alive():
                    time.sleep(interval)

            if (
                self.max_memory_usage >= memory_threshold
                or self.max_cpu_usage >= cpu_threshold
            ):
                logging.error(
                    "Daemon process is consuming too much resource, killing..."
                )
                self._terminate_process(self.daemon_process.pid)

        logging.info(
            "Daemon process %d terminated. Max memory usage: %f, Max cpu"
            " usage: %f.",
            self.daemon_process.pid,
            self.max_memory_usage,
            self.max_cpu_usage,
        )

    def stop(self):
        """Stops the daemon process and removes the pidfile.

        Any failure is logged and swallowed so that callers can always use
        this method for cleanup.
        """
        logging.debug("in daemon manager cleanup.")
        try:
            if self.daemon_process and self.daemon_process.is_alive():
                self._terminate_process(self.daemon_process.pid)
            self._remove_pidfile()
            logging.debug("Successfully stopped daemon manager.")
        except Exception as e:
            logging.exception("Failed to stop daemon manager with error %s", e)

    def reboot(self):
        """Reboots the current process.

        Stops the current daemon manager and reboots the entire process based
        on the binary file. Exits directly if the binary file no longer exists.
        This method does not return: the process is either replaced via
        ``os.execv`` or terminated via ``sys.exit``.
        """
        logging.debug("Rebooting process based on binary %s.", self.binary_path)

        # Stop the current daemon manager first.
        self.stop()

        # If the binary no longer exists, exit directly.
        if not os.path.exists(self.binary_path):
            logging.info(
                "binary %s no longer exists, exiting.", self.binary_path
            )
            sys.exit(0)

        try:
            os.execv(self.binary_path, sys.argv)
        except OSError as e:
            logging.exception("Failed to reboot process with error: %s.", e)
            sys.exit(1)  # Indicate an error occurred

    def cleanup(self):
        """Wipes out all edit monitor instances in the system.

        Stops all the existing edit monitor instances and places a block sign
        to prevent any edit monitor process from starting. This method is only
        used in an emergency, when something goes wrong with the edit monitor
        that requires immediate cleanup to prevent damage to the system.
        """
        logging.debug("Start cleaning up all existing instances.")

        try:
            # First place a block sign to prevent any edit monitor process
            # from starting.
            self.block_sign.touch()
        except OSError:
            # FileNotFoundError and PermissionError are OSError subclasses.
            logging.exception("Failed to place the block sign")

        # Find and kill all the existing instances of edit monitor.
        existing_instances_pids = self._find_all_instances_pids()
        for pid in existing_instances_pids:
            logging.info(
                "Found existing edit monitor instance with pid %d, killing...",
                pid,
            )
            try:
                self._terminate_process(pid)
            except Exception:
                logging.exception("Failed to terminate process %d", pid)

    def _stop_any_existing_instance(self):
        """Terminates the instance recorded in the pidfile, if there is one.

        A pidfile whose content is not a usable pid is treated as stale and
        removed, so that it cannot block the startup of this instance.
        """
        if not self.pid_file_path.exists():
            logging.debug("No existing instances.")
            return

        ex_pid = self._read_pid_from_pidfile()
        if ex_pid is None:
            logging.warning(
                "Removing unusable pidfile %s.", self.pid_file_path
            )
            self._remove_pidfile()
            return

        logging.info("Found another instance with pid %d.", ex_pid)
        self._terminate_process(ex_pid)
        self._remove_pidfile()

    @staticmethod
    def _parse_pid(text: str):
        """Returns the positive pid encoded in `text`, or None if invalid.

        Zero and negative values are rejected because ``os.kill`` interprets
        them as process groups rather than as a single process.
        """
        try:
            pid = int(text.strip())
        except ValueError:
            return None
        return pid if pid > 0 else None

    def _read_pid_from_pidfile(self):
        """Returns the pid stored in the pidfile.

        Returns None when the pidfile is missing or does not contain a
        positive integer.
        """
        try:
            with open(self.pid_file_path, "r") as f:
                content = f.read()
        except FileNotFoundError:
            logging.warning("pidfile %s does not exist.", self.pid_file_path)
            return None

        pid = self._parse_pid(content)
        if pid is None:
            logging.warning(
                "pidfile %s does not contain a valid pid.", self.pid_file_path
            )
        return pid

    def _write_pid_to_pidfile(self):
        """Creates a pidfile and writes the current pid to the file.

        Raises:
            FileExistsError: If the pidfile already exists.
        """
        try:
            # Use the 'x' mode to open the file for exclusive creation.
            with open(self.pid_file_path, "x") as f:
                f.write(f"{self.pid}")
        except FileExistsError:
            # This could be caused by a race condition where a user is trying
            # to start two edit monitors at the same time, or because there is
            # already an existing edit monitor running that we could not kill
            # for some reason.
            logging.exception("pidfile %s already exists.", self.pid_file_path)
            raise

    def _start_daemon_process(self):
        """Starts a subprocess to run the daemon."""
        p = multiprocessing.Process(
            target=self.daemon_target, args=self.daemon_args
        )
        p.start()

        logging.info("Start subprocess with PID %d", p.pid)
        self.daemon_process = p

    def _terminate_process(
        self, pid: int, timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS
    ):
        """Terminates a process with the given pid.

        It first sends a SIGTERM to the process to allow for proper
        termination within a timeout. If the process is not terminated within
        the timeout, it is killed forcefully. A process that no longer exists
        is only logged; other ``OSError``s raised by ``os.kill`` propagate.

        Args:
            pid: Pid of the process to terminate. Non-positive values are
                refused because they would address process groups.
            timeout: Seconds to wait for graceful termination.
        """
        if pid <= 0:
            logging.error("Refusing to signal invalid pid %d.", pid)
            return

        try:
            os.kill(pid, signal.SIGTERM)
            if not self._wait_for_process_terminate(pid, timeout):
                logging.warning(
                    "Process %d not terminated within timeout, try force kill",
                    pid,
                )
                os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            logging.info(
                "Process with PID %d not found (already terminated)", pid
            )

    def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
        """Waits up to `timeout` seconds for the process to terminate.

        The process is checked at least once, and once more when the timeout
        expires, so a process that exits during the last wait is still
        reported as terminated.

        Returns:
            True if the process terminated within the timeout, else False.
        """
        deadline = time.monotonic() + timeout

        while True:
            if not self._is_process_alive(pid):
                return True
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(self._TERMINATION_POLL_INTERVAL_SECONDS, remaining))

        logging.error(
            "Process %d not terminated within %d seconds.", pid, timeout
        )
        return False

    def _is_process_alive(self, pid: int) -> bool:
        """Returns whether the process exists and is not a zombie.

        The state is queried with ``ps``. If ``ps`` itself cannot be run the
        process is conservatively reported as alive.
        """
        try:
            output = subprocess.check_output(
                ["ps", "-p", str(pid), "-o", "state="], text=True
            ).strip()
        except subprocess.CalledProcessError:
            # Process not found (already dead).
            return False
        except (OSError, ValueError) as e:
            logging.warning(
                "Unable to check the status for process %d with error: %s.",
                pid,
                e,
            )
            return True

        if not output:
            # ps succeeded but printed no row: there is no such process.
            return False
        # A zombie has already exited and is only waiting to be reaped. Some
        # ps implementations append flags to the state letter.
        return not output.split()[0].startswith("Z")

    def _remove_pidfile(self):
        """Deletes the pidfile; a missing pidfile is only logged."""
        try:
            os.remove(self.pid_file_path)
        except FileNotFoundError:
            logging.info("pid file %s already removed.", self.pid_file_path)

    def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
        """Generates the path to store the pidfile.

        The file path has the format "<pid_file_dir>/xxxx.lock" where xxxx is
        the SHA-256 hex digest of the binary path that starts the process.
        """
        hash_object = hashlib.sha256()
        hash_object.update(self.binary_path.encode("utf-8"))
        pid_file_path = pid_file_dir.joinpath(hash_object.hexdigest() + ".lock")
        logging.info("pid_file_path: %s", pid_file_path)

        return pid_file_path

    @staticmethod
    def _read_proc_stat_fields(pid: int) -> list[str]:
        """Returns the fields of /proc/<pid>/stat as a list of strings.

        The second field (the command name) is wrapped in parentheses and may
        itself contain spaces or parentheses, so the content is split around
        the last ')' to keep the indices of the later fields stable.
        """
        with open(f"/proc/{pid}/stat", "r") as f:
            content = f.read()

        head, sep, tail = content.rpartition(")")
        if not sep:
            return content.split()
        pid_field, _, comm = head.partition(" (")
        return [pid_field, comm, *tail.split()]

    def _get_process_memory_percent(self, pid: int) -> float:
        """Returns the resident memory of the process in MB.

        Despite the name, the value is an absolute size in MB, not a
        percentage.

        Raises:
            OSError, IndexError, ValueError: If /proc/<pid>/stat cannot be
                read or parsed.
        """
        try:
            stat_data = self._read_proc_stat_fields(pid)
            # RSS is the 24th field in /proc/[pid]/stat, measured in pages.
            rss_pages = int(stat_data[23])
            page_size = os.sysconf("SC_PAGE_SIZE")
            return rss_pages * page_size / (1024 * 1024)  # Convert to MB
        except (OSError, IndexError, ValueError):
            logging.exception("Failed to get memory usage.")
            raise

    def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
        """Returns the CPU usage of the process in percent.

        The usage is the CPU time consumed by the process over `interval`
        seconds divided by the elapsed system uptime, times 100.

        Raises:
            OSError, IndexError, ValueError: If the /proc files cannot be read
                or parsed.
        """
        try:
            total_start_time = self._get_total_cpu_time(pid)
            with open("/proc/uptime", "r") as f:
                uptime_start = float(f.readline().split()[0])

            time.sleep(interval)

            total_end_time = self._get_total_cpu_time(pid)
            with open("/proc/uptime", "r") as f:
                uptime_end = float(f.readline().split()[0])

            elapsed = uptime_end - uptime_start
            if elapsed <= 0:
                # No measurable time elapsed between the two samples.
                return 0.0
            return (total_end_time - total_start_time) / elapsed * 100
        except (OSError, IndexError, ValueError):
            logging.exception("Failed to get CPU usage.")
            raise

    def _get_total_cpu_time(self, pid: int) -> float:
        """Returns the user + system CPU time of the process in seconds."""
        stats = self._read_proc_stat_fields(pid)
        # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
        utime = int(stats[13])
        # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
        stime = int(stats[14])
        return (utime + stime) / os.sysconf("SC_CLK_TCK")

    def _find_all_instances_pids(self) -> list[int]:
        """Returns the pids recorded in every pidfile of the pidfile directory.

        Pidfiles that cannot be read or that do not contain a positive integer
        are logged and skipped.
        """
        pid_file_dir = self.pid_file_path.parent
        try:
            file_names = os.listdir(pid_file_dir)
        except FileNotFoundError:
            logging.info("pidfile directory %s does not exist.", pid_file_dir)
            return []

        pids = []
        for file in file_names:
            if not file.endswith(".lock"):
                continue
            try:
                with open(pid_file_dir.joinpath(file), "r") as f:
                    content = f.read()
            except OSError:
                logging.exception("Failed to get pid from file path: %s", file)
                continue

            pid = self._parse_pid(content)
            if pid is None:
                logging.warning("Failed to get pid from file path: %s", file)
                continue
            pids.append(pid)

        return pids