libprocessgroup: Poll on cgroup.events

In killProcessGroup we currently read cgroup.procs to find processes to
kill, send them kill signals until cgroup.procs is empty, then remove
the cgroup directory. The cgroup cannot be removed until all processes
are dead, otherwise we'll get an EBUSY error from the kernel.

There is a race in the kernel where cgroup.procs can read empty even
though the cgroup is pinned by processes which are still exiting, and
can't be removed yet. [1]

Let's use the populated field of cgroup.events instead of an empty
cgroup.procs file to determine when the cgroup is removable. In
addition to functioning like we expect, this is more efficient because
we can poll on cgroup.events instead of retrying kills and rereading
cgroup.procs every 5ms which should help reduce CPU contention and
cgroup lock contention.

It's still possible that it takes longer for a cgroup to become
unpopulated than our timeout allows, in which case we will fail to
remove the cgroup and leak kernel memory. But this change should help
reduce the probability of that happening.

[1] https://lore.kernel.org/all/CABdmKX3SOXpcK85a7cx3iXrwUj=i1yXqEz9i9zNkx8mB=ZXQ8A@mail.gmail.com/

Bug: 301871933
Change-Id: If7dcfb331f47e06994c9ac85ed08bbcce18cdad7
diff --git a/libprocessgroup/processgroup.cpp b/libprocessgroup/processgroup.cpp
index f594f7f..7e27d75 100644
--- a/libprocessgroup/processgroup.cpp
+++ b/libprocessgroup/processgroup.cpp
@@ -22,6 +22,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <inttypes.h>
+#include <poll.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -30,6 +31,7 @@
 #include <unistd.h>
 
 #include <chrono>
+#include <cstring>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -53,7 +55,8 @@
 
 using namespace std::chrono_literals;
 
-#define PROCESSGROUP_CGROUP_PROCS_FILE "/cgroup.procs"
+#define PROCESSGROUP_CGROUP_PROCS_FILE "cgroup.procs"
+#define PROCESSGROUP_CGROUP_EVENTS_FILE "cgroup.events"
 
 bool CgroupsAvailable() {
     static bool cgroups_available = access("/proc/cgroups", F_OK) == 0;
@@ -213,30 +216,21 @@
     return StringPrintf("%s/uid_%u/pid_%d", cgroup, uid, pid);
 }
 
-static int RemoveCgroup(const char* cgroup, uid_t uid, int pid, unsigned int retries) {
-    int ret = 0;
-    auto uid_pid_path = ConvertUidPidToPath(cgroup, uid, pid);
-
-    while (retries--) {
-        ret = rmdir(uid_pid_path.c_str());
-        // If we get an error 2 'No such file or directory' , that means the
-        // cgroup is already removed, treat it as success and return 0 for
-        // idempotency.
-        if (ret < 0 && errno == ENOENT) {
-            ret = 0;
-        }
-        if (!ret || errno != EBUSY || !retries) break;
-        std::this_thread::sleep_for(5ms);
-    }
+static int RemoveCgroup(const char* cgroup, uid_t uid, int pid) {
+    auto path = ConvertUidPidToPath(cgroup, uid, pid);
+    int ret = TEMP_FAILURE_RETRY(rmdir(path.c_str()));
 
     if (!ret && uid >= AID_ISOLATED_START && uid <= AID_ISOLATED_END) {
         // Isolated UIDs are unlikely to be reused soon after removal,
         // so free up the kernel resources for the UID level cgroup.
-        const auto uid_path = ConvertUidToPath(cgroup, uid);
-        ret = rmdir(uid_path.c_str());
-        if (ret < 0 && errno == ENOENT) {
-            ret = 0;
-        }
+        path = ConvertUidToPath(cgroup, uid);
+        ret = TEMP_FAILURE_RETRY(rmdir(path.c_str()));
+    }
+
+    if (ret < 0 && errno == ENOENT) {
+        // This function is idempotent, but still warn here.
+        LOG(WARNING) << "RemoveCgroup: " << path << " does not exist.";
+        ret = 0;
     }
 
     return ret;
@@ -360,38 +354,33 @@
     return false;
 }
 
-// Returns number of processes killed on success
-// Returns 0 if there are no processes in the process cgroup left to kill
-// Returns -1 on error
-static int DoKillProcessGroupOnce(const char* cgroup, uid_t uid, int initialPid, int signal) {
-    // We separate all of the pids in the cgroup into those pids that are also the leaders of
-    // process groups (stored in the pgids set) and those that are not (stored in the pids set).
-    std::set<pid_t> pgids;
-    pgids.emplace(initialPid);
-    std::set<pid_t> pids;
-    int processes = 0;
-
-    std::unique_ptr<FILE, decltype(&fclose)> fd(nullptr, fclose);
+bool sendSignalToProcessGroup(uid_t uid, int initialPid, int signal) {
+    std::set<pid_t> pgids, pids;
 
     if (CgroupsAvailable()) {
-        auto path = ConvertUidPidToPath(cgroup, uid, initialPid) + PROCESSGROUP_CGROUP_PROCS_FILE;
-        fd.reset(fopen(path.c_str(), "re"));
-        if (!fd) {
-            if (errno == ENOENT) {
-                // This happens when the process is already dead or if, as the result of a bug, it
-                // has been migrated to another cgroup. An example of a bug that can cause migration
-                // to another cgroup is using the JoinCgroup action with a cgroup controller that
-                // has been activated in the v2 cgroup hierarchy.
-                goto kill;
-            }
-            PLOG(WARNING) << __func__ << " failed to open process cgroup uid " << uid << " pid "
-                          << initialPid;
-            return -1;
+        std::string hierarchy_root_path, cgroup_v2_path;
+        CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path);
+        cgroup_v2_path = ConvertUidPidToPath(hierarchy_root_path.c_str(), uid, initialPid);
+
+        LOG(VERBOSE) << "Using " << PROCESSGROUP_CGROUP_PROCS_FILE << " to signal (" << signal
+                     << ") " << cgroup_v2_path;
+
+        // We separate all of the pids in the cgroup into those pids that are also the leaders of
+        // process groups (stored in the pgids set) and those that are not (stored in the pids set).
+        const auto procsfilepath = cgroup_v2_path + '/' + PROCESSGROUP_CGROUP_PROCS_FILE;
+        std::unique_ptr<FILE, decltype(&fclose)> fp(fopen(procsfilepath.c_str(), "re"), fclose);
+        if (!fp) {
+            // This should only happen if the cgroup has already been removed with a successful call
+            // to killProcessGroup. Callers should only retry sendSignalToProcessGroup or
+            // killProcessGroup calls if they fail without ENOENT.
+            PLOG(ERROR) << "Failed to open " << procsfilepath;
+            kill(-initialPid, signal);
+            return false;
         }
+
         pid_t pid;
         bool file_is_empty = true;
-        while (fscanf(fd.get(), "%d\n", &pid) == 1 && pid >= 0) {
-            processes++;
+        while (fscanf(fp.get(), "%d\n", &pid) == 1 && pid >= 0) {
             file_is_empty = false;
             if (pid == 0) {
                 // Should never happen...  but if it does, trying to kill this
@@ -421,7 +410,8 @@
         }
     }
 
-kill:
+    pgids.emplace(initialPid);
+
     // Kill all process groups.
     for (const auto pgid : pgids) {
         LOG(VERBOSE) << "Killing process group " << -pgid << " in uid " << uid
@@ -442,101 +432,174 @@
         }
     }
 
-    return (!fd || feof(fd.get())) ? processes : -1;
+    return true;
 }
 
-static int KillProcessGroup(uid_t uid, int initialPid, int signal, int retries) {
+template <typename T>
+static std::chrono::milliseconds toMillisec(T&& duration) {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
+}
+
+enum class populated_status
+{
+    populated,
+    not_populated,
+    error
+};
+
+static populated_status cgroupIsPopulated(int events_fd) {
+    const std::string POPULATED_KEY("populated ");
+    const std::string::size_type MAX_EVENTS_FILE_SIZE = 32;
+
+    std::string buf;
+    buf.resize(MAX_EVENTS_FILE_SIZE);
+    ssize_t len = TEMP_FAILURE_RETRY(pread(events_fd, buf.data(), buf.size(), 0));
+    if (len == -1) {
+        PLOG(ERROR) << "Could not read cgroup.events: ";
+        // Potentially ENODEV if the cgroup has been removed since we opened this file, but that
+        // shouldn't have happened yet.
+        return populated_status::error;
+    }
+
+    if (len == 0) {
+        LOG(ERROR) << "cgroup.events EOF";
+        return populated_status::error;
+    }
+
+    buf.resize(len);
+
+    const std::string::size_type pos = buf.find(POPULATED_KEY);
+    if (pos == std::string::npos) {
+        LOG(ERROR) << "Could not find populated key in cgroup.events";
+        return populated_status::error;
+    }
+
+    if (pos + POPULATED_KEY.size() + 1 > len) {
+        LOG(ERROR) << "Partial read of cgroup.events";
+        return populated_status::error;
+    }
+
+    return buf[pos + POPULATED_KEY.size()] == '1' ?
+        populated_status::populated : populated_status::not_populated;
+}
+
+// The default timeout of 2200ms comes from the default number of retries in a previous
+// implementation of this function. The default retry value was 40 for killing and 400 for cgroup
+// removal with 5ms sleeps between each retry.
+static int KillProcessGroup(
+        uid_t uid, int initialPid, int signal, bool once = false,
+        std::chrono::steady_clock::time_point until = std::chrono::steady_clock::now() + 2200ms) {
     CHECK_GE(uid, 0);
     CHECK_GT(initialPid, 0);
 
+    // Always attempt to send a kill signal to at least the initialPid, at least once, regardless of
+    // whether its cgroup exists or not. This should only be necessary if a bug results in the
+    // migration of the targeted process out of its cgroup, which we will also attempt to kill.
+    const bool signal_ret = sendSignalToProcessGroup(uid, initialPid, signal);
+
+    if (!CgroupsAvailable() || !signal_ret) return signal_ret ? 0 : -1;
+
     std::string hierarchy_root_path;
-    if (CgroupsAvailable()) {
-        CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path);
-    }
-    const char* cgroup = hierarchy_root_path.c_str();
+    CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path);
 
-    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
+    const std::string cgroup_v2_path =
+            ConvertUidPidToPath(hierarchy_root_path.c_str(), uid, initialPid);
 
-    int retry = retries;
-    int processes;
-    while ((processes = DoKillProcessGroupOnce(cgroup, uid, initialPid, signal)) > 0) {
-        LOG(VERBOSE) << "Killed " << processes << " processes for processgroup " << initialPid;
-        if (!CgroupsAvailable()) {
-            // makes no sense to retry, because there are no cgroup_procs file
-            processes = 0;  // no remaining processes
-            break;
-        }
-        if (retry > 0) {
-            std::this_thread::sleep_for(5ms);
-            --retry;
-        } else {
-            break;
-        }
-    }
-
-    if (processes < 0) {
-        PLOG(ERROR) << "Error encountered killing process cgroup uid " << uid << " pid "
-                    << initialPid;
+    const std::string eventsfile = cgroup_v2_path + '/' + PROCESSGROUP_CGROUP_EVENTS_FILE;
+    android::base::unique_fd events_fd(open(eventsfile.c_str(), O_RDONLY));
+    if (events_fd.get() == -1) {
+        PLOG(WARNING) << "Error opening " << eventsfile << " for KillProcessGroup";
         return -1;
     }
 
-    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
-    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+    struct pollfd fds = {
+        .fd = events_fd,
+        .events = POLLPRI,
+    };
 
-    // We only calculate the number of 'processes' when killing the processes.
-    // In the retries == 0 case, we only kill the processes once and therefore
-    // will not have waited then recalculated how many processes are remaining
-    // after the first signals have been sent.
-    // Logging anything regarding the number of 'processes' here does not make sense.
+    const std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
 
-    if (processes == 0) {
-        if (retries > 0) {
-            LOG(INFO) << "Successfully killed process cgroup uid " << uid << " pid " << initialPid
-                      << " in " << static_cast<int>(ms) << "ms";
+    // The primary reason to loop here is to capture any new forks or migrations that could occur
+    // after we send signals to the original set of processes, but before all of those processes
+    // exit and the cgroup becomes unpopulated, or before we remove the cgroup. We try hard to
+    // ensure this completes successfully to avoid permanent memory leaks, but we still place a
+    // large default upper bound on the amount of time we spend in this loop. The amount of CPU
+    // contention, and the amount of work that needs to be done in do_exit for each process
+    // determines how long this will take.
+    int ret = 0;  // initialize: the poll below is skipped if the deadline already passed
+    do {
+        populated_status populated;
+        while ((populated = cgroupIsPopulated(events_fd.get())) == populated_status::populated &&
+               std::chrono::steady_clock::now() < until) {
+
+            sendSignalToProcessGroup(uid, initialPid, signal);
+            if (once) {
+                populated = cgroupIsPopulated(events_fd.get());
+                break;
+            }
+
+            const std::chrono::steady_clock::time_point poll_start =
+                    std::chrono::steady_clock::now();
+
+            if (poll_start < until)
+                ret = TEMP_FAILURE_RETRY(poll(&fds, 1, toMillisec(until - poll_start).count()));
+
+            if (ret == -1) {
+                // Fallback to 5ms sleeps if poll fails
+                PLOG(ERROR) << "Poll on " << eventsfile << " failed";
+                const std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
+                if (now < until)
+                    std::this_thread::sleep_for(std::min(5ms, toMillisec(until - now)));
+            }
+
+            LOG(VERBOSE) << "Waited "
+                         << toMillisec(std::chrono::steady_clock::now() - poll_start).count()
+                         << " ms for " << eventsfile << " poll";
         }
 
-        if (!CgroupsAvailable()) {
-            // nothing to do here, if cgroups isn't available
-            return 0;
+        const std::chrono::milliseconds kill_duration =
+                toMillisec(std::chrono::steady_clock::now() - start);
+
+        if (populated == populated_status::populated) {
+            LOG(WARNING) << "Still waiting on process(es) to exit for cgroup " << cgroup_v2_path
+                         << " after " << kill_duration.count() << " ms";
+            // We'll still try the cgroup removal below which we expect to log an error.
+        } else if (populated == populated_status::not_populated) {
+            LOG(VERBOSE) << "Killed all processes under cgroup " << cgroup_v2_path
+                         << " after " << kill_duration.count() << " ms";
         }
 
-        // 400 retries correspond to 2 secs max timeout
-        int err = RemoveCgroup(cgroup, uid, initialPid, 400);
+        ret = RemoveCgroup(hierarchy_root_path.c_str(), uid, initialPid);
+        if (ret)
+            PLOG(ERROR) << "Unable to remove cgroup " << cgroup_v2_path;
+        else
+            LOG(INFO) << "Removed cgroup " << cgroup_v2_path;
 
         if (isMemoryCgroupSupported() && UsePerAppMemcg()) {
+            // This per-application memcg v1 case should eventually be removed after migration to
+            // memcg v2.
             std::string memcg_apps_path;
             if (CgroupGetMemcgAppsPath(&memcg_apps_path) &&
-                RemoveCgroup(memcg_apps_path.c_str(), uid, initialPid, 400) < 0) {
-                return -1;
+                (ret = RemoveCgroup(memcg_apps_path.c_str(), uid, initialPid)) < 0) {
+                const auto memcg_v1_cgroup_path =
+                        ConvertUidPidToPath(memcg_apps_path.c_str(), uid, initialPid);
+                PLOG(ERROR) << "Unable to remove memcg v1 cgroup " << memcg_v1_cgroup_path;
             }
         }
 
-        return err;
-    } else {
-        if (retries > 0) {
-            LOG(ERROR) << "Failed to kill process cgroup uid " << uid << " pid " << initialPid
-                       << " in " << static_cast<int>(ms) << "ms, " << processes
-                       << " processes remain";
-        }
-        return -1;
-    }
+        if (once) break;
+        if (std::chrono::steady_clock::now() >= until) break;
+    } while (ret && errno == EBUSY);
+
+    return ret;
 }
 
 int killProcessGroup(uid_t uid, int initialPid, int signal) {
-    return KillProcessGroup(uid, initialPid, signal, 40 /*retries*/);
+    return KillProcessGroup(uid, initialPid, signal);
 }
 
 int killProcessGroupOnce(uid_t uid, int initialPid, int signal) {
-    return KillProcessGroup(uid, initialPid, signal, 0 /*retries*/);
-}
-
-int sendSignalToProcessGroup(uid_t uid, int initialPid, int signal) {
-    std::string hierarchy_root_path;
-    if (CgroupsAvailable()) {
-        CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &hierarchy_root_path);
-    }
-    const char* cgroup = hierarchy_root_path.c_str();
-    return DoKillProcessGroupOnce(cgroup, uid, initialPid, signal);
+    return KillProcessGroup(uid, initialPid, signal, true);
 }
 
 static int createProcessGroupInternal(uid_t uid, int initialPid, std::string cgroup,
@@ -576,7 +639,7 @@
         return -errno;
     }
 
-    auto uid_pid_procs_file = uid_pid_path + PROCESSGROUP_CGROUP_PROCS_FILE;
+    auto uid_pid_procs_file = uid_pid_path + '/' + PROCESSGROUP_CGROUP_PROCS_FILE;
 
     if (!WriteStringToFile(std::to_string(initialPid), uid_pid_procs_file)) {
         ret = -errno;