/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include "llkd.h"
18
19#include <ctype.h>
20#include <dirent.h> // opendir() and readdir()
21#include <errno.h>
22#include <fcntl.h>
23#include <pthread.h>
24#include <pwd.h> // getpwuid()
25#include <signal.h>
26#include <stdint.h>
27#include <sys/cdefs.h> // ___STRING, __predict_true() and _predict_false()
28#include <sys/mman.h> // mlockall()
29#include <sys/prctl.h>
30#include <sys/stat.h> // lstat()
31#include <sys/syscall.h> // __NR_getdents64
32#include <sys/sysinfo.h> // get_nprocs_conf()
33#include <sys/types.h>
34#include <time.h>
35#include <unistd.h>
36
37#include <chrono>
38#include <ios>
39#include <sstream>
40#include <string>
41#include <unordered_map>
42#include <unordered_set>
43
44#include <android-base/file.h>
45#include <android-base/logging.h>
46#include <android-base/parseint.h>
47#include <android-base/properties.h>
48#include <android-base/strings.h>
49#include <cutils/android_get_control_file.h>
50#include <log/log_main.h>
51
// Number of elements in a statically sized array (not valid for pointers).
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))

// Maximum length of a task's comm name, including the terminating nul.
#define TASK_COMM_LEN 16  // internal kernel, not uapi, from .../linux/include/linux/sched.h

// Bring duration types (milliseconds, seconds, ...) and literals (0ms, 1s)
// into scope for the whole file.
using namespace std::chrono_literals;
using namespace std::chrono;
58
59namespace {
60
// Well known process ids that receive special treatment.
constexpr pid_t kernelPid = 0;
constexpr pid_t initPid = 1;
constexpr pid_t kthreaddPid = 2;

// Root of the proc filesystem; note the trailing slash, pids are appended.
constexpr char procdir[] = "/proc/";

// Configuration
milliseconds llkUpdate;                              // last check ms signature
milliseconds llkCycle;                               // ms to next thread check
bool llkEnable = LLK_ENABLE_DEFAULT;                 // llk daemon enabled
bool llkRunning = false;                             // thread is running
bool llkMlockall = LLK_MLOCKALL_DEFAULT;             // run mlocked
milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;  // default timeout
enum { llkStateD, llkStateZ, llkNumStates };         // state indexes
milliseconds llkStateTimeoutMs[llkNumStates];        // timeout override for each detection state
milliseconds llkCheckMs;                             // checking interval to inspect any
                                                     // persistent live-locked states
bool llkLowRam;                                      // ro.config.low_ram
bool khtEnable = LLK_ENABLE_DEFAULT;                 // [khungtaskd] panic
// [khungtaskd] should have a timeout beyond the granularity of llkTimeoutMs.
// Provides a wide angle of margin b/c khtTimeout is also its granularity.
// The initial value is llkTimeoutMs scaled by (1 + checks) / checks,
// truncated to whole seconds.
seconds khtTimeout = duration_cast<seconds>(llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) /
                                            LLK_CHECKS_PER_TIMEOUT_DEFAULT);

// Blacklist variables, initialized with comma separated lists of high false
// positive and/or dangerous references, e.g. without self restart, for pid,
// ppid, name and uid:

// list of pids, or tids or names to skip. kernel pid (0), init pid (1),
// [kthreadd] pid (2), ourselves, "init", "[kthreadd]", "lmkd", "llkd" or
// combinations of watchdogd in kernel and user space.
std::unordered_set<std::string> llkBlacklistProcess;
// list of parent pids, comm or cmdline names to skip. default:
// kernel pid (0), [kthreadd] (2), or ourselves, enforced and implied
std::unordered_set<std::string> llkBlacklistParent;
// list of uids, and uid names, to skip, default nothing
std::unordered_set<std::string> llkBlacklistUid;
98
99class dir {
100 public:
101 enum level { proc, task, numLevels };
102
103 private:
104 int fd;
105 size_t available_bytes;
106 dirent* next;
107 // each directory level picked to be just north of 4K in size
108 static constexpr size_t buffEntries = 15;
109 static dirent buff[numLevels][buffEntries];
110
111 bool fill(enum level index) {
112 if (index >= numLevels) return false;
113 if (available_bytes != 0) return true;
114 if (__predict_false(fd < 0)) return false;
115 // getdents64 has no libc wrapper
116 auto rc = TEMP_FAILURE_RETRY(syscall(__NR_getdents64, fd, buff[index], sizeof(buff[0]), 0));
117 if (rc <= 0) return false;
118 available_bytes = rc;
119 next = buff[index];
120 return true;
121 }
122
123 public:
124 dir() : fd(-1), available_bytes(0), next(nullptr) {}
125
126 explicit dir(const char* directory)
127 : fd(__predict_true(directory != nullptr)
128 ? ::open(directory, O_CLOEXEC | O_DIRECTORY | O_RDONLY)
129 : -1),
130 available_bytes(0),
131 next(nullptr) {}
132
133 explicit dir(const std::string&& directory)
134 : fd(::open(directory.c_str(), O_CLOEXEC | O_DIRECTORY | O_RDONLY)),
135 available_bytes(0),
136 next(nullptr) {}
137
138 explicit dir(const std::string& directory)
139 : fd(::open(directory.c_str(), O_CLOEXEC | O_DIRECTORY | O_RDONLY)),
140 available_bytes(0),
141 next(nullptr) {}
142
143 // Don't need any copy or move constructors.
144 explicit dir(const dir& c) = delete;
145 explicit dir(dir& c) = delete;
146 explicit dir(dir&& c) = delete;
147
148 ~dir() {
149 if (fd >= 0) {
150 ::close(fd);
151 }
152 }
153
154 operator bool() const { return fd >= 0; }
155
156 void reset(void) {
157 if (fd >= 0) {
158 ::close(fd);
159 fd = -1;
160 available_bytes = 0;
161 next = nullptr;
162 }
163 }
164
165 dir& reset(const char* directory) {
166 reset();
167 // available_bytes will _always_ be zero here as its value is
168 // intimately tied to fd < 0 or not.
169 fd = ::open(directory, O_CLOEXEC | O_DIRECTORY | O_RDONLY);
170 return *this;
171 }
172
173 void rewind(void) {
174 if (fd >= 0) {
175 ::lseek(fd, off_t(0), SEEK_SET);
176 available_bytes = 0;
177 next = nullptr;
178 }
179 }
180
181 dirent* read(enum level index = proc, dirent* def = nullptr) {
182 if (!fill(index)) return def;
183 auto ret = next;
184 available_bytes -= next->d_reclen;
185 next = reinterpret_cast<dirent*>(reinterpret_cast<char*>(next) + next->d_reclen);
186 return ret;
187 }
188} llkTopDirectory;
189
190dirent dir::buff[dir::numLevels][dir::buffEntries];
191
192// helper functions
193
194bool llkIsMissingExeLink(pid_t tid) {
195 char c;
196 // CAP_SYS_PTRACE is required to prevent ret == -1, but ENOENT is signal
197 auto ret = ::readlink((procdir + std::to_string(tid) + "/exe").c_str(), &c, sizeof(c));
198 return (ret == -1) && (errno == ENOENT);
199}
200
201// Common routine where caller accepts empty content as error/passthrough.
202// Reduces the churn of reporting read errors in the callers.
203std::string ReadFile(std::string&& path) {
204 std::string content;
205 if (!android::base::ReadFileToString(path, &content)) {
206 PLOG(DEBUG) << "Read " << path << " failed";
207 content = "";
208 }
209 return content;
210}
211
212std::string llkProcGetName(pid_t tid, const char* node = "/cmdline") {
213 std::string content = ReadFile(procdir + std::to_string(tid) + node);
214 static constexpr char needles[] = " \t\r\n"; // including trailing nul
215 auto pos = content.find_first_of(needles, 0, sizeof(needles));
216 if (pos != std::string::npos) {
217 content.erase(pos);
218 }
219 return content;
220}
221
222uid_t llkProcGetUid(pid_t tid) {
223 // Get the process' uid. The following read from /status is admittedly
224 // racy, prone to corruption due to shape-changes. The consequences are
225 // not catastrophic as we sample a few times before taking action.
226 //
227 // If /loginuid worked on reliably, or on Android (all tasks report -1)...
228 // Android lmkd causes /cgroup to contain memory:/<dom>/uid_<uid>/pid_<pid>
229 // which is tighter, but also not reliable.
230 std::string content = ReadFile(procdir + std::to_string(tid) + "/status");
231 static constexpr char Uid[] = "\nUid:";
232 auto pos = content.find(Uid);
233 if (pos == std::string::npos) {
234 return -1;
235 }
236 pos += ::strlen(Uid);
237 while ((pos < content.size()) && ::isblank(content[pos])) {
238 ++pos;
239 }
240 content.erase(0, pos);
241 for (pos = 0; (pos < content.size()) && ::isdigit(content[pos]); ++pos) {
242 ;
243 }
244 // Content of form 'Uid: 0 0 0 0', newline is error
245 if ((pos >= content.size()) || !::isblank(content[pos])) {
246 return -1;
247 }
248 content.erase(pos);
249 uid_t ret;
250 if (!android::base::ParseInt(content, &ret, uid_t(0))) {
251 return -1;
252 }
253 return ret;
254}
255
256struct proc {
257 pid_t tid; // monitored thread id (in Z or D state).
258 nanoseconds schedUpdate; // /proc/<tid>/sched "se.avg.lastUpdateTime",
259 uint64_t nrSwitches; // /proc/<tid>/sched "nr_switches" for
260 // refined ABA problem detection, determine
261 // forward scheduling progress.
262 milliseconds update; // llkUpdate millisecond signature of last.
263 milliseconds count; // duration in state.
264 pid_t pid; // /proc/<pid> before iterating through
265 // /proc/<pid>/task/<tid> for threads.
266 pid_t ppid; // /proc/<tid>/stat field 4 parent pid.
267 uid_t uid; // /proc/<tid>/status Uid: field.
268 unsigned time; // sum of /proc/<tid>/stat field 14 utime &
269 // 15 stime for coarse ABA problem detection.
270 std::string cmdline; // cached /cmdline content
271 char state; // /proc/<tid>/stat field 3: Z or D
272 // (others we do not monitor: S, R, T or ?)
273 char comm[TASK_COMM_LEN + 3]; // space for adding '[' and ']'
274 bool exeMissingValid; // exeMissing has been cached
275 bool cmdlineValid; // cmdline has been cached
276 bool updated; // cleared before monitoring pass.
277 bool killed; // sent a kill to this thread, next panic...
278
279 void setComm(const char* _comm) { strncpy(comm + 1, _comm, sizeof(comm) - 2); }
280
281 proc(pid_t tid, pid_t pid, pid_t ppid, const char* _comm, int time, char state)
282 : tid(tid),
283 schedUpdate(0),
284 nrSwitches(0),
285 update(llkUpdate),
286 count(0),
287 pid(pid),
288 ppid(ppid),
289 uid(-1),
290 time(time),
291 state(state),
292 exeMissingValid(false),
293 cmdlineValid(false),
294 updated(true),
295 killed(false) {
296 memset(comm, '\0', sizeof(comm));
297 setComm(_comm);
298 }
299
300 const char* getComm(void) {
301 if (comm[1] == '\0') { // comm Valid?
302 strncpy(comm + 1, llkProcGetName(tid, "/comm").c_str(), sizeof(comm) - 2);
303 }
304 if (!exeMissingValid) {
305 if (llkIsMissingExeLink(tid)) {
306 comm[0] = '[';
307 }
308 exeMissingValid = true;
309 }
310 size_t len = strlen(comm + 1);
311 if (__predict_true(len < (sizeof(comm) - 1))) {
312 if (comm[0] == '[') {
313 if ((comm[len] != ']') && __predict_true(len < (sizeof(comm) - 2))) {
314 comm[++len] = ']';
315 comm[++len] = '\0';
316 }
317 } else {
318 if (comm[len] == ']') {
319 comm[len] = '\0';
320 }
321 }
322 }
323 return &comm[comm[0] != '['];
324 }
325
326 const char* getCmdline(void) {
327 if (!cmdlineValid) {
328 cmdline = llkProcGetName(tid);
329 cmdlineValid = true;
330 }
331 return cmdline.c_str();
332 }
333
334 uid_t getUid(void) {
335 if (uid <= 0) { // Churn on root user, because most likely to setuid()
336 uid = llkProcGetUid(tid);
337 }
338 return uid;
339 }
340
341 void reset(void) { // reset cache, if we detected pid rollover
342 uid = -1;
343 state = '?';
344 cmdline = "";
345 comm[0] = '\0';
346 exeMissingValid = false;
347 cmdlineValid = false;
348 }
349};
350
// Every thread seen so far, keyed by thread id; see llkTidLookup(),
// llkTidAlloc() and llkTidRemove().
std::unordered_map<pid_t, proc> tids;
352
353// Check range and setup defaults, in order of propagation:
354// llkTimeoutMs
355// llkCheckMs
356// ...
357// KISS to keep it all self-contained, and called multiple times as parameters
358// are interpreted so that defaults, llkCheckMs and llkCycle make sense.
359void llkValidate() {
360 if (llkTimeoutMs == 0ms) {
361 llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;
362 }
363 llkTimeoutMs = std::max(llkTimeoutMs, LLK_TIMEOUT_MS_MINIMUM);
364 if (llkCheckMs == 0ms) {
365 llkCheckMs = llkTimeoutMs / LLK_CHECKS_PER_TIMEOUT_DEFAULT;
366 }
367 llkCheckMs = std::min(llkCheckMs, llkTimeoutMs);
368
369 for (size_t state = 0; state < ARRAY_SIZE(llkStateTimeoutMs); ++state) {
370 if (llkStateTimeoutMs[state] == 0ms) {
371 llkStateTimeoutMs[state] = llkTimeoutMs;
372 }
373 llkStateTimeoutMs[state] =
374 std::min(std::max(llkStateTimeoutMs[state], LLK_TIMEOUT_MS_MINIMUM), llkTimeoutMs);
375 llkCheckMs = std::min(llkCheckMs, llkStateTimeoutMs[state]);
376 }
377
378 llkCheckMs = std::max(llkCheckMs, LLK_CHECK_MS_MINIMUM);
379 if (llkCycle == 0ms) {
380 llkCycle = llkCheckMs;
381 }
382 llkCycle = std::min(llkCycle, llkCheckMs);
383}
384
// Returns (to - from) in milliseconds. The seconds and nanoseconds fields are
// converted separately, each truncated toward zero, then summed.
milliseconds llkGetTimespecDiffMs(timespec* from, timespec* to) {
    const seconds wholeSeconds(to->tv_sec - from->tv_sec);
    const nanoseconds remainder(to->tv_nsec - from->tv_nsec);
    milliseconds diff = duration_cast<milliseconds>(wholeSeconds);
    diff += duration_cast<milliseconds>(remainder);
    return diff;
}
389
390std::string llkProcGetName(pid_t tid, const char* comm, const char* cmdline) {
391 if ((cmdline != nullptr) && (*cmdline != '\0')) {
392 return cmdline;
393 }
394 if ((comm != nullptr) && (*comm != '\0')) {
395 return comm;
396 }
397
398 // UNLIKELY! Here because killed before we kill it?
399 // Assume change is afoot, do not call llkTidAlloc
400
401 // cmdline ?
402 std::string content = llkProcGetName(tid);
403 if (content.size() != 0) {
404 return content;
405 }
406 // Comm instead?
407 content = llkProcGetName(tid, "/comm");
408 if (llkIsMissingExeLink(tid) && (content.size() != 0)) {
409 return '[' + content + ']';
410 }
411 return content;
412}
413
414int llkKillOneProcess(pid_t pid, char state, pid_t tid, const char* tcomm = nullptr,
415 const char* tcmdline = nullptr, const char* pcomm = nullptr,
416 const char* pcmdline = nullptr) {
417 std::string forTid;
418 if (tid != pid) {
419 forTid = " for '" + llkProcGetName(tid, tcomm, tcmdline) + "' (" + std::to_string(tid) + ")";
420 }
421 LOG(INFO) << "Killing '" << llkProcGetName(pid, pcomm, pcmdline) << "' (" << pid
422 << ") to check forward scheduling progress in " << state << " state" << forTid;
423 // CAP_KILL required
424 errno = 0;
425 auto r = ::kill(pid, SIGKILL);
426 if (r) {
427 PLOG(ERROR) << "kill(" << pid << ")=" << r << ' ';
428 }
429
430 return r;
431}
432
433// Kill one process
434int llkKillOneProcess(pid_t pid, proc* tprocp) {
435 return llkKillOneProcess(pid, tprocp->state, tprocp->tid, tprocp->getComm(),
436 tprocp->getCmdline());
437}
438
439// Kill one process specified by kprocp
440int llkKillOneProcess(proc* kprocp, proc* tprocp) {
441 if (kprocp == nullptr) {
442 return -2;
443 }
444
445 return llkKillOneProcess(kprocp->tid, tprocp->state, tprocp->tid, tprocp->getComm(),
446 tprocp->getCmdline(), kprocp->getComm(), kprocp->getCmdline());
447}
448
449// Acquire file descriptor from environment, or open and cache it.
450// NB: cache is unnecessary in our current context, pedantically
451// required to prevent leakage of file descriptors in the future.
452int llkFileToWriteFd(const std::string& file) {
453 static std::unordered_map<std::string, int> cache;
454 auto search = cache.find(file);
455 if (search != cache.end()) return search->second;
456 auto fd = android_get_control_file(file.c_str());
457 if (fd >= 0) return fd;
458 fd = TEMP_FAILURE_RETRY(::open(file.c_str(), O_WRONLY | O_CLOEXEC));
459 if (fd >= 0) cache.emplace(std::make_pair(file, fd));
460 return fd;
461}
462
463// Wrap android::base::WriteStringToFile to use android_get_control_file.
464bool llkWriteStringToFile(const std::string& string, const std::string& file) {
465 auto fd = llkFileToWriteFd(file);
466 if (fd < 0) return false;
467 return android::base::WriteStringToFd(string, fd);
468}
469
470bool llkWriteStringToFileConfirm(const std::string& string, const std::string& file) {
471 auto fd = llkFileToWriteFd(file);
472 auto ret = (fd < 0) ? false : android::base::WriteStringToFd(string, fd);
473 std::string content;
474 if (!android::base::ReadFileToString(file, &content)) return ret;
475 return android::base::Trim(content) == string;
476}
477
478void llkPanicKernel(bool dump, pid_t tid) __noreturn;
479void llkPanicKernel(bool dump, pid_t tid) {
480 auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
481 if (sysrqTriggerFd < 0) {
482 // DYB
483 llkKillOneProcess(initPid, 'R', tid);
484 // The answer to life, the universe and everything
485 ::exit(42);
486 // NOTREACHED
487 }
488 ::sync();
489 if (dump) {
490 // Show all locks that are held
491 android::base::WriteStringToFd("d", sysrqTriggerFd);
492 // This can trigger hardware watchdog, that is somewhat _ok_.
493 // But useless if pstore configured for <256KB, low ram devices ...
494 if (!llkLowRam) {
495 android::base::WriteStringToFd("t", sysrqTriggerFd);
496 }
497 ::usleep(200000); // let everything settle
498 }
499 android::base::WriteStringToFd("c", sysrqTriggerFd);
500 // NOTREACHED
501 // DYB
502 llkKillOneProcess(initPid, 'R', tid);
503 // I sat at my desk, stared into the garden and thought '42 will do'.
504 // I typed it out. End of story
505 ::exit(42);
506 // NOTREACHED
507}
508
509void llkAlarmHandler(int) {
510 llkPanicKernel(false, ::getpid());
511}
512
513milliseconds GetUintProperty(const std::string& key, milliseconds def) {
514 return milliseconds(android::base::GetUintProperty(key, static_cast<uint64_t>(def.count()),
515 static_cast<uint64_t>(def.max().count())));
516}
517
518seconds GetUintProperty(const std::string& key, seconds def) {
519 return seconds(android::base::GetUintProperty(key, static_cast<uint64_t>(def.count()),
520 static_cast<uint64_t>(def.max().count())));
521}
522
523proc* llkTidLookup(pid_t tid) {
524 auto search = tids.find(tid);
525 if (search == tids.end()) {
526 return nullptr;
527 }
528 return &search->second;
529}
530
531void llkTidRemove(pid_t tid) {
532 tids.erase(tid);
533}
534
535proc* llkTidAlloc(pid_t tid, pid_t pid, pid_t ppid, const char* comm, int time, char state) {
536 auto it = tids.emplace(std::make_pair(tid, proc(tid, pid, ppid, comm, time, state)));
537 return &it.first->second;
538}
539
// Formats a duration as "<seconds>.<milliseconds>s", with the millisecond
// part zero padded to three digits, e.g. 1500ms -> "1.500s".
std::string llkFormat(milliseconds ms) {
    const auto whole = duration_cast<seconds>(ms);
    const auto fraction = (ms - whole).count();
    std::ostringstream out;
    out << whole.count() << '.';
    // Width applies to the next insertion only, so the trailing 's' is not
    // padded.
    out.fill('0');
    out.width(3);
    out << std::right << fraction << 's';
    return out.str();
}
552
// Formats a whole number of seconds as "<count>s".
std::string llkFormat(seconds s) {
    std::string text = std::to_string(s.count());
    text.push_back('s');
    return text;
}
556
// Formats a flag as "true" or "false".
std::string llkFormat(bool flag) {
    if (flag) {
        return "true";
    }
    return "false";
}
560
// Joins the blacklist entries with commas. The entry order follows the set's
// iteration order, which is unspecified.
std::string llkFormat(const std::unordered_set<std::string>& blacklist) {
    std::string ret;
    // Iterate by reference: copying every entry just to append it is wasted
    // work.
    for (const auto& entry : blacklist) {
        if (!ret.empty()) {
            ret += ",";
        }
        ret += entry;
    }
    return ret;
}
571
// Splits s into the set of its non-empty tokens.
//
// We only officially support comma separators, but wetware being what they
// are will take some liberty and I do not believe they should be punished.
// Hence the default delimiters also accept blank, tab and colon.
//
// Empty tokens (empty input, leading/trailing or doubled delimiters) are
// dropped: an empty name can never match anything, and keeping it would make
// an otherwise empty list report a non-zero size().
std::unordered_set<std::string> llkSplit(const std::string& s,
                                         const std::string& delimiters = ", \t:") {
    std::unordered_set<std::string> result;

    size_t base = 0;
    while (base < s.size()) {
        const auto found = s.find_first_of(delimiters, base);
        // Only emplace content; found == base means an empty token.
        if (found != base) {
            // When found is npos the count is clamped to the rest of s.
            result.emplace(s.substr(base, found - base));
        }
        if (found == s.npos) break;
        base = found + 1;
    }
    return result;
}
588
589bool llkSkipName(const std::string& name,
590 const std::unordered_set<std::string>& blacklist = llkBlacklistProcess) {
591 if ((name.size() == 0) || (blacklist.size() == 0)) {
592 return false;
593 }
594
595 return blacklist.find(name) != blacklist.end();
596}
597
598bool llkSkipPid(pid_t pid) {
599 return llkSkipName(std::to_string(pid), llkBlacklistProcess);
600}
601
602bool llkSkipPpid(pid_t ppid) {
603 return llkSkipName(std::to_string(ppid), llkBlacklistParent);
604}
605
606bool llkSkipUid(uid_t uid) {
607 // Match by number?
608 if (llkSkipName(std::to_string(uid), llkBlacklistUid)) {
609 return true;
610 }
611
612 // Match by name?
613 auto pwd = ::getpwuid(uid);
614 return (pwd != nullptr) && __predict_true(pwd->pw_name != nullptr) &&
615 __predict_true(pwd->pw_name[0] != '\0') && llkSkipName(pwd->pw_name, llkBlacklistUid);
616}
617
618bool getValidTidDir(dirent* dp, std::string* piddir) {
619 if (!::isdigit(dp->d_name[0])) {
620 return false;
621 }
622
623 // Corner case can not happen in reality b/c of above ::isdigit check
624 if (__predict_false(dp->d_type != DT_DIR)) {
625 if (__predict_false(dp->d_type == DT_UNKNOWN)) { // can't b/c procfs
626 struct stat st;
627 *piddir = procdir;
628 *piddir += dp->d_name;
629 return (lstat(piddir->c_str(), &st) == 0) && (st.st_mode & S_IFDIR);
630 }
631 return false;
632 }
633
634 *piddir = procdir;
635 *piddir += dp->d_name;
636 return true;
637}
638
// Reports whether a /proc/<tid>/stat state letter is one we monitor:
// 'Z' or 'D'.
bool llkIsMonitorState(char state) {
    switch (state) {
        case 'Z':
        case 'D':
            return true;
        default:
            return false;
    }
}
642
643// returns -1 if not found
644long long getSchedValue(const std::string& schedString, const char* key) {
645 auto pos = schedString.find(key);
646 if (pos == std::string::npos) {
647 return -1;
648 }
649 pos = schedString.find(':', pos);
650 if (__predict_false(pos == std::string::npos)) {
651 return -1;
652 }
653 while ((++pos < schedString.size()) && ::isblank(schedString[pos])) {
654 ;
655 }
656 long long ret;
657 if (!android::base::ParseInt(schedString.substr(pos), &ret, static_cast<long long>(0))) {
658 return -1;
659 }
660 return ret;
661}
662
663// Primary ABA mitigation watching last time schedule activity happened
664void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
665 // Audit finds /proc/<tid>/sched is just over 1K, and
666 // is rarely larger than 2K, even less on Android.
667 // For example, the "se.avg.lastUpdateTime" field we are
668 // interested in typically within the primary set in
669 // the first 1K.
670 //
671 // Proc entries can not be read >1K atomically via libbase,
672 // but if there are problems we assume at least a few
673 // samples of reads occur before we take any real action.
674 std::string schedString = ReadFile(piddir + "/sched");
675 if (schedString.size() == 0) {
676 // /schedstat is not as standardized, but in 3.1+
677 // Android devices, the third field is nr_switches
678 // from /sched:
679 schedString = ReadFile(piddir + "/schedstat");
680 if (schedString.size() == 0) {
681 return;
682 }
683 auto val = static_cast<unsigned long long>(-1);
684 if (((::sscanf(schedString.c_str(), "%*d %*d %llu", &val)) == 1) &&
685 (val != static_cast<unsigned long long>(-1)) && (val != 0) &&
686 (val != procp->nrSwitches)) {
687 procp->nrSwitches = val;
688 procp->count = 0ms;
689 procp->killed = false;
690 }
691 return;
692 }
693
694 auto val = getSchedValue(schedString, "\nse.avg.lastUpdateTime");
695 if (val == -1) {
696 val = getSchedValue(schedString, "\nse.svg.last_update_time");
697 }
698 if (val != -1) {
699 auto schedUpdate = nanoseconds(val);
700 if (schedUpdate != procp->schedUpdate) {
701 procp->schedUpdate = schedUpdate;
702 procp->count = 0ms;
703 procp->killed = false;
704 }
705 }
706
707 val = getSchedValue(schedString, "\nnr_switches");
708 if (val != -1) {
709 if (static_cast<uint64_t>(val) != procp->nrSwitches) {
710 procp->nrSwitches = val;
711 procp->count = 0ms;
712 procp->killed = false;
713 }
714 }
715}
716
// Logs the effective configuration at INFO level as a single message with
// one "property=value" pair per line.
void llkLogConfig(void) {
    LOG(INFO) << "ro.config.low_ram=" << llkFormat(llkLowRam) << "\n"
              << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
              << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
              << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
              << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
              << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
              << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
              << LLK_Z_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateZ]) << "\n"
              << LLK_CHECK_MS_PROPERTY "=" << llkFormat(llkCheckMs) << "\n"
              << LLK_BLACKLIST_PROCESS_PROPERTY "=" << llkFormat(llkBlacklistProcess) << "\n"
              << LLK_BLACKLIST_PARENT_PROPERTY "=" << llkFormat(llkBlacklistParent) << "\n"
              << LLK_BLACKLIST_UID_PROPERTY "=" << llkFormat(llkBlacklistUid);
}
731
732void* llkThread(void* obj) {
733 LOG(INFO) << "started";
734
735 std::string name = std::to_string(::gettid());
736 if (!llkSkipName(name)) {
737 llkBlacklistProcess.emplace(name);
738 }
739 name = static_cast<const char*>(obj);
740 prctl(PR_SET_NAME, name.c_str());
741 if (__predict_false(!llkSkipName(name))) {
742 llkBlacklistProcess.insert(name);
743 }
744 // No longer modifying llkBlacklistProcess.
745 llkRunning = true;
746 llkLogConfig();
747 while (llkRunning) {
748 ::usleep(duration_cast<microseconds>(llkCheck(true)).count());
749 }
750 // NOTREACHED
751 LOG(INFO) << "exiting";
752 return nullptr;
753}
754
755} // namespace
756
// One monitoring pass over every thread under /proc.
//
// Returns the time the caller should wait before calling again.
// Returns milliseconds::max() when the daemon is disabled or when
// checkRunning does not match llkRunning. When called before llkCycle has
// elapsed since the previous pass, returns the remaining time without
// scanning.
//
// For each thread in D or Z state that is not blacklisted, the time spent in
// that state is accumulated. Once it reaches the state's timeout, a kill is
// attempted first; if the thread is still stuck on a later pass, or no kill
// could be sent, the kernel is panicked via llkPanicKernel().
milliseconds llkCheck(bool checkRunning) {
    if (!llkEnable || (checkRunning != llkRunning)) {
        return milliseconds::max();
    }

    // Reset internal watchdog, which is a healthy engineering margin of
    // double the maximum wait or cycle time for the mainloop that calls us.
    //
    // This alarm is effectively the live lock detection of llkd, as
    // we understandably can not monitor ourselves otherwise.
    ::alarm(duration_cast<seconds>(llkTimeoutMs * 2).count());

    // kernel jiffy precision fastest acquisition
    static timespec last;
    timespec now;
    ::clock_gettime(CLOCK_MONOTONIC_COARSE, &now);
    auto ms = llkGetTimespecDiffMs(&last, &now);
    if (ms < llkCycle) {
        // Called early: report how much of the cycle is left.
        return llkCycle - ms;
    }
    last = now;

    LOG(VERBOSE) << "opendir(\"" << procdir << "\")";
    if (__predict_false(!llkTopDirectory)) {
        // gid containing AID_READPROC required
        llkTopDirectory.reset(procdir);
        if (__predict_false(!llkTopDirectory)) {
            // Most likely reason we could be here is a resource limit.
            // Keep our processing down to a minimum, but not so low that
            // we do not recover in a timely manner should the issue be
            // transitory.
            LOG(DEBUG) << "opendir(\"" << procdir << "\") failed";
            return llkTimeoutMs;
        }
    }

    // Mark every known thread as not yet seen; entries still unmarked after
    // the scan are garbage collected below.
    for (auto& it : tids) {
        it.second.updated = false;
    }

    auto prevUpdate = llkUpdate;
    llkUpdate += ms;
    // From here on ms is how far past the cycle time this pass started.
    ms -= llkCycle;
    auto myPid = ::getpid();
    auto myTid = ::gettid();
    for (auto dp = llkTopDirectory.read(); dp != nullptr; dp = llkTopDirectory.read()) {
        std::string piddir;

        if (!getValidTidDir(dp, &piddir)) {
            continue;
        }

        // Get the process tasks
        std::string taskdir = piddir + "/task/";
        int pid = -1;
        LOG(VERBOSE) << "+opendir(\"" << taskdir << "\")";
        dir taskDirectory(taskdir);
        if (__predict_false(!taskDirectory)) {
            LOG(DEBUG) << "+opendir(\"" << taskdir << "\") failed";
        }
        // The first read falls back to the process entry itself (dp) when
        // the task directory yields nothing.
        for (auto tp = taskDirectory.read(dir::task, dp); tp != nullptr;
             tp = taskDirectory.read(dir::task)) {
            if (!getValidTidDir(tp, &piddir)) {
                continue;
            }

            // Get the process stat
            std::string stat = ReadFile(piddir + "/stat");
            if (stat.size() == 0) {
                continue;
            }
            unsigned tid = -1;
            char pdir[TASK_COMM_LEN + 1];
            char state = '?';
            unsigned ppid = -1;
            unsigned utime = -1;
            unsigned stime = -1;
            int dummy;
            pdir[0] = '\0';
            // tid should not change value
            auto match = ::sscanf(
                stat.c_str(),
                "%u (%" ___STRING(
                    TASK_COMM_LEN) "[^)]) %c %u %*d %*d %*d %*d %*d %*d %*d %*d %*d %u %u %d",
                &tid, pdir, &state, &ppid, &utime, &stime, &dummy);
            // The first thread parsed for a process supplies the pid.
            if (pid == -1) {
                pid = tid;
            }
            LOG(VERBOSE) << "match " << match << ' ' << tid << " (" << pdir << ") " << state << ' '
                         << ppid << " ... " << utime << ' ' << stime << ' ' << dummy;
            if (match != 7) {
                continue;
            }

            auto procp = llkTidLookup(tid);
            if (procp == nullptr) {
                procp = llkTidAlloc(tid, pid, ppid, pdir, utime + stime, state);
            } else {
                // comm can change ...
                procp->setComm(pdir);
                procp->updated = true;
                // pid/ppid/tid wrap?
                if (((procp->update != prevUpdate) && (procp->update != llkUpdate)) ||
                    (procp->ppid != ppid) || (procp->pid != pid)) {
                    procp->reset();
                } else if (procp->time != (utime + stime)) {  // secondary ABA.
                    // watching utime+stime granularity jiffy
                    procp->state = '?';
                }
                procp->update = llkUpdate;
                procp->pid = pid;
                procp->ppid = ppid;
                procp->time = utime + stime;
                // A state change restarts the time-in-state accounting.
                if (procp->state != state) {
                    procp->count = 0ms;
                    procp->killed = false;
                    procp->state = state;
                } else {
                    procp->count += llkCycle;
                }
            }

            // Filter checks in intuitive order of CPU cost to evaluate
            // If tid unique continue, if ppid or pid unique break

            if (pid == myPid) {
                break;
            }
            if (!llkIsMonitorState(state)) {
                continue;
            }
            if ((tid == myTid) || llkSkipPid(tid)) {
                continue;
            }
            if (llkSkipPpid(ppid)) {
                break;
            }

            if (llkSkipName(procp->getComm())) {
                continue;
            }
            if (llkSkipName(procp->getCmdline())) {
                break;
            }

            // Track the parent as well so its names can be checked against
            // the parent blacklist.
            auto pprocp = llkTidLookup(ppid);
            if (pprocp == nullptr) {
                pprocp = llkTidAlloc(ppid, ppid, 0, "", 0, '?');
            }
            if ((pprocp != nullptr) && (llkSkipName(pprocp->getComm(), llkBlacklistParent) ||
                                        llkSkipName(pprocp->getCmdline(), llkBlacklistParent))) {
                break;
            }

            if ((llkBlacklistUid.size() != 0) && llkSkipUid(procp->getUid())) {
                continue;
            }

            // ABA mitigation watching last time schedule activity happened
            llkCheckSchedUpdate(procp, piddir);

            // Can only fall through to here if registered D or Z state !!!
            if (procp->count < llkStateTimeoutMs[(state == 'Z') ? llkStateZ : llkStateD]) {
                LOG(VERBOSE) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->"
                             << pid << "->" << tid << ' ' << procp->getComm();
                continue;
            }

            // We have to kill it to determine difference between live lock
            // and persistent state blocked on a resource.  Is there something
            // wrong with a process that has no forward scheduling progress in
            // Z or D?  Yes, generally means improper accounting in the
            // process, but not always ...
            //
            // Whomever we hit with a test kill must accept the Android
            // Aphorism that everything can be burned to the ground and
            // must survive.
            if (procp->killed == false) {
                procp->killed = true;
                // confirm: re-read uid before committing to a panic.
                procp->uid = -1;
                switch (state) {
                    case 'Z':  // kill ppid to free up a Zombie
                        // Killing init will kernel panic without diagnostics
                        // so skip right to controlled kernel panic with
                        // diagnostics.
                        if (ppid == initPid) {
                            break;
                        }
                        LOG(WARNING) << "Z " << llkFormat(procp->count) << ' ' << ppid << "->"
                                     << pid << "->" << tid << ' ' << procp->getComm() << " [kill]";
                        if ((llkKillOneProcess(pprocp, procp) >= 0) ||
                            (llkKillOneProcess(ppid, procp) >= 0)) {
                            continue;
                        }
                        break;

                    case 'D':  // kill tid to free up an uninterruptible D
                        // If ABA is doing its job, we would not need or
                        // want the following.  Test kill is a Hail Mary
                        // to make absolutely sure there is no forward
                        // scheduling progress.  The cost when ABA is
                        // not working is we kill a process that likes to
                        // stay in 'D' state, instead of panicing the
                        // kernel (worse).
                        LOG(WARNING) << "D " << llkFormat(procp->count) << ' ' << pid << "->" << tid
                                     << ' ' << procp->getComm() << " [kill]";
                        if ((llkKillOneProcess(llkTidLookup(pid), procp) >= 0) ||
                            (llkKillOneProcess(pid, 'D', tid) >= 0) ||
                            (llkKillOneProcess(procp, procp) >= 0) ||
                            (llkKillOneProcess(tid, 'D', tid) >= 0)) {
                            continue;
                        }
                        break;
                }
            }
            // We are here because we have confirmed kernel live-lock
            LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
                       << "->" << tid << ' ' << procp->getComm() << " [panic]";
            llkPanicKernel(true, tid);
        }
        LOG(VERBOSE) << "+closedir()";
    }
    // Keep /proc open for the next pass, restarting from its first entry.
    llkTopDirectory.rewind();
    LOG(VERBOSE) << "closedir()";

    // garbage collection of old process references
    for (auto p = tids.begin(); p != tids.end();) {
        if (!p->second.updated) {
            IF_ALOG(LOG_VERBOSE, LOG_TAG) {
                std::string ppidCmdline = llkProcGetName(p->second.ppid, nullptr, nullptr);
                if (ppidCmdline.size()) {
                    ppidCmdline = "(" + ppidCmdline + ")";
                }
                std::string pidCmdline;
                if (p->second.pid != p->second.tid) {
                    pidCmdline = llkProcGetName(p->second.pid, nullptr, p->second.getCmdline());
                    if (pidCmdline.size()) {
                        pidCmdline = "(" + pidCmdline + ")";
                    }
                }
                std::string tidCmdline =
                    llkProcGetName(p->second.tid, p->second.getComm(), p->second.getCmdline());
                if (tidCmdline.size()) {
                    tidCmdline = "(" + tidCmdline + ")";
                }
                LOG(VERBOSE) << "thread " << p->second.ppid << ppidCmdline << "->" << p->second.pid
                             << pidCmdline << "->" << p->second.tid << tidCmdline << " removed";
            }
            p = tids.erase(p);
        } else {
            ++p;
        }
    }
    // Nothing was tracked at all: drop the /proc descriptor so the next pass
    // reopens it.
    if (__predict_false(tids.empty())) {
        llkTopDirectory.reset();
    }

    llkCycle = llkCheckMs;

    // Report how long the pass took, escalating the log severity with
    // duration.
    timespec end;
    ::clock_gettime(CLOCK_MONOTONIC_COARSE, &end);
    auto milli = llkGetTimespecDiffMs(&now, &end);
    LOG((milli > 10s) ? ERROR : (milli > 1s) ? WARNING : VERBOSE) << "sample " << llkFormat(milli);

    // cap to minimum sleep for 1 second since last cycle
    if (llkCycle < (ms + 1s)) {
        return 1s;
    }
    return llkCycle - ms;
}
1028
1029unsigned llkCheckMilliseconds() {
1030 return duration_cast<milliseconds>(llkCheck()).count();
1031}
1032
1033bool llkInit(const char* threadname) {
1034 llkLowRam = android::base::GetBoolProperty("ro.config.low_ram", false);
1035 llkEnable = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, llkEnable);
1036 if (llkEnable && !llkTopDirectory.reset(procdir)) {
1037 // Most likely reason we could be here is llkd was started
1038 // incorrectly without the readproc permissions. Keep our
1039 // processing down to a minimum.
1040 llkEnable = false;
1041 }
1042 khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
1043 llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
1044 // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set
1045 // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
1046 khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
1047 if (khtTimeout == 0s) {
1048 khtTimeout = duration_cast<seconds>(llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) /
1049 LLK_CHECKS_PER_TIMEOUT_DEFAULT);
1050 }
1051 llkTimeoutMs =
1052 khtTimeout * LLK_CHECKS_PER_TIMEOUT_DEFAULT / (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT);
1053 llkTimeoutMs = GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
1054 llkValidate(); // validate llkTimeoutMs, llkCheckMs and llkCycle
1055 llkStateTimeoutMs[llkStateD] = GetUintProperty(LLK_D_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
1056 llkStateTimeoutMs[llkStateZ] = GetUintProperty(LLK_Z_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
1057 llkCheckMs = GetUintProperty(LLK_CHECK_MS_PROPERTY, llkCheckMs);
1058 llkValidate(); // validate all (effectively minus llkTimeoutMs)
1059 std::string defaultBlacklistProcess(
1060 std::to_string(kernelPid) + "," + std::to_string(initPid) + "," +
1061 std::to_string(kthreaddPid) + "," + std::to_string(::getpid()) + "," +
1062 std::to_string(::gettid()) + "," LLK_BLACKLIST_PROCESS_DEFAULT);
1063 if (threadname) {
1064 defaultBlacklistProcess += std::string(",") + threadname;
1065 }
1066 for (int cpu = 1; cpu < get_nprocs_conf(); ++cpu) {
1067 defaultBlacklistProcess += ",[watchdog/" + std::to_string(cpu) + "]";
1068 }
1069 defaultBlacklistProcess =
1070 android::base::GetProperty(LLK_BLACKLIST_PROCESS_PROPERTY, defaultBlacklistProcess);
1071 llkBlacklistProcess = llkSplit(defaultBlacklistProcess);
1072 if (!llkSkipName("[khungtaskd]")) { // ALWAYS ignore as special
1073 llkBlacklistProcess.emplace("[khungtaskd]");
1074 }
1075 llkBlacklistParent = llkSplit(android::base::GetProperty(
1076 LLK_BLACKLIST_PARENT_PROPERTY, std::to_string(kernelPid) + "," + std::to_string(kthreaddPid) +
1077 "," LLK_BLACKLIST_PARENT_DEFAULT));
1078 llkBlacklistUid =
1079 llkSplit(android::base::GetProperty(LLK_BLACKLIST_UID_PROPERTY, LLK_BLACKLIST_UID_DEFAULT));
1080
1081 // internal watchdog
1082 ::signal(SIGALRM, llkAlarmHandler);
1083
1084 // kernel hung task configuration? Otherwise leave it as-is
1085 if (khtEnable) {
1086 // EUID must be AID_ROOT to write to /proc/sys/kernel/ nodes, there
1087 // are no capability overrides. For security reasons we do not want
1088 // to run as AID_ROOT. We may not be able to write them successfully,
1089 // we will try, but the least we can do is read the values back to
1090 // confirm expectations and report whether configured or not.
1091 auto configured = llkWriteStringToFileConfirm(std::to_string(khtTimeout.count()),
1092 "/proc/sys/kernel/hung_task_timeout_secs");
1093 if (configured) {
1094 llkWriteStringToFile("65535", "/proc/sys/kernel/hung_task_warnings");
1095 llkWriteStringToFile("65535", "/proc/sys/kernel/hung_task_check_count");
1096 configured = llkWriteStringToFileConfirm("1", "/proc/sys/kernel/hung_task_panic");
1097 }
1098 if (configured) {
1099 LOG(INFO) << "[khungtaskd] configured";
1100 } else {
1101 LOG(WARNING) << "[khungtaskd] not configurable";
1102 }
1103 }
1104
1105 bool logConfig = true;
1106 if (llkEnable) {
1107 if (llkMlockall &&
1108 // MCL_ONFAULT pins pages as they fault instead of loading
1109 // everything immediately all at once. (Which would be bad,
1110 // because as of this writing, we have a lot of mapped pages we
1111 // never use.) Old kernels will see MCL_ONFAULT and fail with
1112 // EINVAL; we ignore this failure.
1113 //
1114 // N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
1115 // pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
1116 // in pages.
1117
1118 // CAP_IPC_LOCK required
1119 mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
1120 PLOG(WARNING) << "mlockall failed ";
1121 }
1122
1123 if (threadname) {
1124 pthread_attr_t attr;
1125
1126 if (!pthread_attr_init(&attr)) {
1127 sched_param param;
1128
1129 memset(&param, 0, sizeof(param));
1130 pthread_attr_setschedparam(&attr, &param);
1131 pthread_attr_setschedpolicy(&attr, SCHED_BATCH);
1132 if (!pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) {
1133 pthread_t thread;
1134 if (!pthread_create(&thread, &attr, llkThread, const_cast<char*>(threadname))) {
1135 // wait a second for thread to start
1136 for (auto retry = 50; retry && !llkRunning; --retry) {
1137 ::usleep(20000);
1138 }
1139 logConfig = !llkRunning; // printed in llkd context?
1140 } else {
1141 LOG(ERROR) << "failed to spawn llkd thread";
1142 }
1143 } else {
1144 LOG(ERROR) << "failed to detach llkd thread";
1145 }
1146 pthread_attr_destroy(&attr);
1147 } else {
1148 LOG(ERROR) << "failed to allocate attibutes for llkd thread";
1149 }
1150 }
1151 } else {
1152 LOG(DEBUG) << "[khungtaskd] left unconfigured";
1153 }
1154 if (logConfig) {
1155 llkLogConfig();
1156 }
1157
1158 return llkEnable;
1159}