blob: 79831a7eeb1c64ed0199abb38c47ae48321e4b08 [file] [log] [blame]
Zhuoyao Zhang53359552024-09-16 23:58:11 +00001# Copyright 2024, The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15
16import hashlib
17import logging
18import multiprocessing
19import os
20import pathlib
21import signal
22import subprocess
23import tempfile
24import time
25
26
# Default grace period (seconds) given to a process after SIGTERM before
# DaemonManager._terminate_process falls back to SIGKILL.
DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1
# Default pause (seconds) between two resource samples in
# DaemonManager.monitor_daemon.
DEFAULT_MONITOR_INTERVAL_SECONDS = 5
# Default memory limit for the daemon. It is compared against the value
# returned by DaemonManager._get_process_memory_percent, which is computed
# in MB.
DEFAULT_MEMORY_USAGE_THRESHOLD = 2000
# Default CPU limit for the daemon. It is compared against the percentage
# returned by DaemonManager._get_process_cpu_percent.
DEFAULT_CPU_USAGE_THRESHOLD = 10
Zhuoyao Zhang53359552024-09-16 23:58:11 +000031
32
def default_daemon_target():
    """Placeholder daemon target used when the caller supplies none.

    Prints a fixed marker line to stdout and returns.
    """
    print("default daemon target")
36
37
class DaemonManager:
    """Class to manage and monitor the daemon run as a subprocess.

    The manager records its own pid in a pidfile whose name is derived from
    ``binary_path``, so that a newly started manager for the same binary can
    find and terminate the previous one before taking over.
    """

    # How often (seconds) to re-check a process while waiting for it to exit.
    _TERMINATION_POLL_INTERVAL_SECONDS = 0.1

    def __init__(
        self,
        binary_path: str,
        daemon_target: callable = default_daemon_target,
        daemon_args: tuple = (),
    ):
        """Initializes the manager and resolves the pidfile location.

        Args:
          binary_path: Path of the binary starting the manager; hashed to
            build the pidfile name.
          daemon_target: Callable executed in the daemon subprocess.
          daemon_args: Positional arguments passed to ``daemon_target``.
        """
        self.binary_path = binary_path
        self.daemon_target = daemon_target
        self.daemon_args = daemon_args

        # Pid of the manager itself (this is what goes into the pidfile).
        self.pid = os.getpid()
        # multiprocessing.Process of the daemon; None until started.
        self.daemon_process = None

        # Highest values sampled so far by monitor_daemon().
        self.max_memory_usage = 0
        self.max_cpu_usage = 0

        pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath(
            "edit_monitor"
        )
        pid_file_dir.mkdir(parents=True, exist_ok=True)
        self.pid_file_path = self._get_pid_file_path(pid_file_dir)

    def start(self):
        """Writes the pidfile and starts the daemon process.

        Any failure is logged and swallowed; in that case
        ``self.daemon_process`` may still be None.
        """
        try:
            self._stop_any_existing_instance()
            self._write_pid_to_pidfile()
            self._start_daemon_process()
        except Exception as e:
            logging.exception(
                "Failed to start daemon manager with error %s", e
            )

    def monitor_daemon(
        self,
        interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS,
        memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD,
        cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD,
    ):
        """Monitors the daemon process status.

        Periodically checks the CPU/memory usage of the daemon process as
        long as the process is still running and kills the process if the
        resource usage is above the given thresholds.

        Args:
          interval: Seconds to sleep between two samples.
          memory_threshold: Memory limit, in the unit returned by
            ``_get_process_memory_percent`` (MB).
          cpu_threshold: CPU limit, in the unit returned by
            ``_get_process_cpu_percent`` (percent).
        """
        if self.daemon_process is None:
            # start() swallows its errors, so there may be nothing to watch.
            logging.error("No daemon process to monitor.")
            return

        daemon_pid = self.daemon_process.pid
        logging.info("start monitoring daemon process %d.", daemon_pid)

        while self.daemon_process.is_alive():
            try:
                memory_usage = self._get_process_memory_percent(daemon_pid)
                self.max_memory_usage = max(
                    self.max_memory_usage, memory_usage
                )

                cpu_usage = self._get_process_cpu_percent(daemon_pid)
                self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
            except Exception as e:
                # Logging the error and continue.
                logging.warning(
                    "Failed to monitor daemon process with error: %s", e
                )

            if (
                self.max_memory_usage >= memory_threshold
                or self.max_cpu_usage >= cpu_threshold
            ):
                logging.error(
                    "Daemon process is consuming too much resource, killing..."
                )
                self._terminate_process(daemon_pid)
            else:
                # Sleep outside the try block so that a failing sample
                # cannot turn this loop into a busy spin.
                time.sleep(interval)

        logging.info(
            "Daemon process %d terminated. Max memory usage: %f, Max cpu"
            " usage: %f.",
            daemon_pid,
            self.max_memory_usage,
            self.max_cpu_usage,
        )

    def stop(self):
        """Stops the daemon process and removes the pidfile.

        Any failure is logged and swallowed.
        """
        logging.debug("in daemon manager cleanup.")
        try:
            if self.daemon_process and self.daemon_process.is_alive():
                self._terminate_process(self.daemon_process.pid)
            self._remove_pidfile()
        except Exception as e:
            logging.exception(
                "Failed to stop daemon manager with error %s", e
            )

    def _stop_any_existing_instance(self):
        """Terminates the instance recorded in the pidfile, if there is one."""
        if not self.pid_file_path.exists():
            logging.debug("No existing instances.")
            return

        ex_pid = self._read_pid_from_pidfile()

        if ex_pid:
            logging.info("Found another instance with pid %d.", ex_pid)
            self._terminate_process(ex_pid)
            self._remove_pidfile()

    def _read_pid_from_pidfile(self):
        """Returns the pid stored in the pidfile, or None if the file is gone.

        Raises:
          ValueError: If the content is not an integer or is not a positive
            pid. Pids <= 0 are rejected because os.kill() would interpret
            them as "a whole process group" rather than a single process.
        """
        try:
            with open(self.pid_file_path, "r") as f:
                pid = int(f.read().strip())
        except FileNotFoundError:
            # The file can disappear between the caller's existence check
            # and the open() above.
            logging.warning("pidfile %s does not exist.", self.pid_file_path)
            return None

        if pid <= 0:
            raise ValueError(
                f"Invalid pid {pid} in pidfile {self.pid_file_path}"
            )
        return pid

    def _write_pid_to_pidfile(self):
        """Creates a pidfile and writes the current pid to the file.

        Raises:
          FileExistsError: If the pidfile already exists.
        """
        try:
            # Use the 'x' mode to open the file for exclusive creation
            with open(self.pid_file_path, "x") as f:
                f.write(f"{self.pid}")
        except FileExistsError:
            # This could be caused due to race condition that a user is
            # trying to start two edit monitors at the same time. Or because
            # there is already an existing edit monitor running and we can
            # not kill it for some reason.
            logging.exception(
                "pidfile %s already exists.", self.pid_file_path
            )
            raise

    def _start_daemon_process(self):
        """Starts a subprocess to run the daemon."""
        p = multiprocessing.Process(
            target=self.daemon_target, args=self.daemon_args
        )
        p.start()

        logging.info("Start subprocess with PID %d", p.pid)
        self.daemon_process = p

    def _terminate_process(
        self,
        pid: int,
        timeout: int = DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS,
    ):
        """Terminates a process with given pid.

        It first sends a SIGTERM to the process to allow it for proper
        termination with a timeout. If the process is not terminated within
        the timeout, kills it forcefully.

        Args:
          pid: Pid of the process to terminate.
          timeout: Seconds to wait after SIGTERM before sending SIGKILL.
        """
        try:
            os.kill(pid, signal.SIGTERM)
            if not self._wait_for_process_terminate(pid, timeout):
                logging.warning(
                    "Process %d not terminated within timeout, try force kill",
                    pid,
                )
                os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            logging.info(
                "Process with PID %d not found (already terminated)", pid
            )

    def _wait_for_process_terminate(self, pid: int, timeout: int) -> bool:
        """Waits until the process is gone or ``timeout`` seconds elapsed.

        The process is checked at least once, then re-checked at short
        intervals, including one final check once the deadline is reached.

        Returns:
          True if the process terminated in time, False otherwise.
        """
        deadline = time.monotonic() + timeout

        while True:
            if not self._is_process_alive(pid):
                return True
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(self._TERMINATION_POLL_INTERVAL_SECONDS, remaining))

        logging.error(
            "Process %d not terminated within %d seconds.", pid, timeout
        )
        return False

    def _is_process_alive(self, pid: int) -> bool:
        """Returns whether ``ps`` reports the process as running.

        A zombie counts as not alive. If the state cannot be determined the
        process is conservatively reported as alive.
        """
        try:
            output = subprocess.check_output(
                ["ps", "-p", str(pid), "-o", "state="], text=True
            ).strip()
        except subprocess.CalledProcessError:
            # Process not found (already dead).
            return False
        except (FileNotFoundError, OSError, ValueError) as e:
            logging.warning(
                "Unable to check the status for process %d with error: %s.",
                pid,
                e,
            )
            return True

        fields = output.split()
        if not fields:
            logging.warning(
                "Unable to check the status for process %d with error: %s.",
                pid,
                "empty ps output",
            )
            return True

        # Some ps implementations append modifier characters to the state
        # (e.g. "Z+"), so only the leading state letter is inspected.
        return not fields[0].startswith("Z")

    def _remove_pidfile(self):
        """Deletes the pidfile; a missing file is only logged."""
        try:
            os.remove(self.pid_file_path)
        except FileNotFoundError:
            logging.info("pid file %s already removed.", self.pid_file_path)

    def _get_pid_file_path(self, pid_file_dir: pathlib.Path) -> pathlib.Path:
        """Generates the path to store the pidfile.

        The file path should have the format of "/tmp/edit_monitor/xxxx.lock"
        where xxxx is a hashed value based on the binary path that starts the
        process.
        """
        hash_object = hashlib.sha256()
        hash_object.update(self.binary_path.encode("utf-8"))
        pid_file_path = pid_file_dir.joinpath(
            hash_object.hexdigest() + ".lock"
        )
        logging.info("pid_file_path: %s", pid_file_path)

        return pid_file_path

    def _read_proc_stat_fields(self, pid: int) -> list:
        """Returns the fields of /proc/<pid>/stat as a list of strings.

        Index 0 is the pid, index 1 the command name without parentheses,
        and the remaining fields follow in file order, so 1-based field N of
        the file is at index N - 1.

        Raises:
          ValueError: If the content does not contain a command name in
            parentheses.
        """
        with open(f"/proc/{pid}/stat", "r") as f:
            stat_line = f.read()

        # The command name is wrapped in parentheses and may itself contain
        # spaces or ')', so split around the LAST ')' instead of whitespace.
        head, closing, tail = stat_line.rpartition(")")
        pid_field, opening, comm = head.partition("(")
        if not closing or not opening:
            raise ValueError(f"Malformed /proc/{pid}/stat content")
        return [pid_field.strip(), comm, *tail.split()]

    def _read_uptime_seconds(self) -> float:
        """Returns the first value of /proc/uptime (system uptime, seconds)."""
        with open("/proc/uptime", "r") as f:
            return float(f.readline().split()[0])

    def _get_process_memory_percent(self, pid: int) -> float:
        """Returns the resident set size of the process in MB.

        Note that despite the method name the value is an absolute size in
        MB, not a percentage.
        """
        try:
            stat_data = self._read_proc_stat_fields(pid)
            # RSS is the 24th field in /proc/[pid]/stat, counted in pages.
            rss_pages = int(stat_data[23])
            page_size = os.sysconf("SC_PAGE_SIZE")
            return rss_pages * page_size / (1024 * 1024)  # Convert to MB
        except (FileNotFoundError, IndexError, ValueError, IOError):
            logging.exception("Failed to get memory usage.")
            raise

    def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float:
        """Returns the CPU usage of the process over ``interval`` seconds.

        The value is the CPU time consumed by the process divided by the
        elapsed uptime, in percent. Blocks for ``interval`` seconds.
        """
        try:
            total_start_time = self._get_total_cpu_time(pid)
            uptime_start = self._read_uptime_seconds()

            time.sleep(interval)

            total_end_time = self._get_total_cpu_time(pid)
            uptime_end = self._read_uptime_seconds()

            elapsed = uptime_end - uptime_start
            if elapsed <= 0:
                # No measurable time elapsed; avoid dividing by zero.
                return 0.0
            return (total_end_time - total_start_time) / elapsed * 100
        except (FileNotFoundError, IndexError, ValueError, IOError):
            logging.exception("Failed to get CPU usage.")
            raise

    def _get_total_cpu_time(self, pid: int) -> float:
        """Returns the user + system CPU time of the process in seconds."""
        stats = self._read_proc_stat_fields(pid)
        # utime is the 14th field in /proc/[pid]/stat measured in clock ticks.
        utime = int(stats[13])
        # stime is the 15th field in /proc/[pid]/stat measured in clock ticks.
        stime = int(stats[14])
        return (utime + stime) / os.sysconf("SC_CLK_TCK")
274