Use eBPF-based time-in-state monitoring for groups of threads

Bug: 169279846

Test: atest libtimeinstate_test
Change-Id: I69de04cd93f065e4e2e4d78b7bb5a35fea8811ca
diff --git a/libs/cputimeinstate/Android.bp b/libs/cputimeinstate/Android.bp
index b1943a4..e3cd085 100644
--- a/libs/cputimeinstate/Android.bp
+++ b/libs/cputimeinstate/Android.bp
@@ -33,5 +33,6 @@
         "-Wall",
         "-Wextra",
     ],
+    require_root: true,
 }
 
diff --git a/libs/cputimeinstate/cputimeinstate.cpp b/libs/cputimeinstate/cputimeinstate.cpp
index 6058430..4209dc5 100644
--- a/libs/cputimeinstate/cputimeinstate.cpp
+++ b/libs/cputimeinstate/cputimeinstate.cpp
@@ -59,6 +59,7 @@
 static unique_fd gTisMapFd;
 static unique_fd gConcurrentMapFd;
 static unique_fd gUidLastUpdateMapFd;
+static unique_fd gPidTisMapFd;
 
 static std::optional<std::vector<uint32_t>> readNumbersFromFile(const std::string &path) {
     std::string data;
@@ -139,6 +140,12 @@
             unique_fd{bpf_obj_get(BPF_FS_PATH "map_time_in_state_uid_last_update_map")};
     if (gUidLastUpdateMapFd < 0) return false;
 
+    gPidTisMapFd = unique_fd{mapRetrieveRO(BPF_FS_PATH "map_time_in_state_pid_time_in_state_map")};
+    if (gPidTisMapFd < 0) return false;
+
+    unique_fd trackedPidMapFd(mapRetrieveWO(BPF_FS_PATH "map_time_in_state_pid_tracked_map"));
+    if (trackedPidMapFd < 0) return false;
+
     gInitialized = true;
     return true;
 }
@@ -222,7 +229,8 @@
     }
 
     gTracking = attachTracepointProgram("sched", "sched_switch") &&
-            attachTracepointProgram("power", "cpu_frequency");
+            attachTracepointProgram("power", "cpu_frequency") &&
+            attachTracepointProgram("sched", "sched_process_free");
     return gTracking;
 }
 
@@ -502,5 +510,106 @@
     return true;
 }
 
+bool startTrackingProcessCpuTimes(pid_t pid) {
+    if (!gInitialized && !initGlobals()) return false;
+
+    unique_fd trackedPidHashMapFd(
+            mapRetrieveWO(BPF_FS_PATH "map_time_in_state_pid_tracked_hash_map"));
+    if (trackedPidHashMapFd < 0) return false;
+
+    unique_fd trackedPidMapFd(mapRetrieveWO(BPF_FS_PATH "map_time_in_state_pid_tracked_map"));
+    if (trackedPidMapFd < 0) return false;
+
+    for (uint32_t index = 0; index < MAX_TRACKED_PIDS; index++) {
+        // Find first available [index, pid] entry in the pid_tracked_hash_map map
+        if (writeToMapEntry(trackedPidHashMapFd, &index, &pid, BPF_NOEXIST) != 0) {
+            if (errno != EEXIST) {
+                return false;
+            }
+            continue; // This index is already taken
+        }
+
+        tracked_pid_t tracked_pid = {.pid = pid, .state = TRACKED_PID_STATE_ACTIVE};
+        if (writeToMapEntry(trackedPidMapFd, &index, &tracked_pid, BPF_ANY) != 0) {
+            return false;
+        }
+        return true;
+    }
+    return false;
+}
+
+// Marks the specified task identified by its PID (aka TID) for CPU time-in-state tracking
+// aggregated with other tasks sharing the same TGID and aggregation key.
+bool startAggregatingTaskCpuTimes(pid_t pid, uint16_t aggregationKey) {
+    if (!gInitialized && !initGlobals()) return false;
+
+    unique_fd taskAggregationMapFd(
+            mapRetrieveWO(BPF_FS_PATH "map_time_in_state_pid_task_aggregation_map"));
+    if (taskAggregationMapFd < 0) return false;
+
+    return writeToMapEntry(taskAggregationMapFd, &pid, &aggregationKey, BPF_ANY) == 0;
+}
+
+// Retrieves the times in ns that each thread spent running at each CPU freq, aggregated by
+// aggregation key.
+// Return contains no value on error, otherwise it contains a map from aggregation keys
+// to vectors of vectors using the format:
+// { aggKey0 -> [[t0_0_0, t0_0_1, ...], [t0_1_0, t0_1_1, ...], ...],
+//   aggKey1 -> [[t1_0_0, t1_0_1, ...], [t1_1_0, t1_1_1, ...], ...], ... }
+// where ti_j_k is the ns tid i spent running on the jth cluster at the cluster's kth lowest freq.
+std::optional<std::unordered_map<uint16_t, std::vector<std::vector<uint64_t>>>>
+getAggregatedTaskCpuFreqTimes(pid_t tgid, const std::vector<uint16_t> &aggregationKeys) {
+    if (!gInitialized && !initGlobals()) return {};
+
+    uint32_t maxFreqCount = 0;
+    std::vector<std::vector<uint64_t>> mapFormat;
+    for (const auto &freqList : gPolicyFreqs) {
+        if (freqList.size() > maxFreqCount) maxFreqCount = freqList.size();
+        mapFormat.emplace_back(freqList.size(), 0);
+    }
+
+    bool dataCollected = false;
+    std::unordered_map<uint16_t, std::vector<std::vector<uint64_t>>> map;
+    std::vector<tis_val_t> vals(gNCpus);
+    for (uint16_t aggregationKey : aggregationKeys) {
+        map.emplace(aggregationKey, mapFormat);
+
+        aggregated_task_tis_key_t key{.tgid = tgid, .aggregation_key = aggregationKey};
+        for (key.bucket = 0; key.bucket <= (maxFreqCount - 1) / FREQS_PER_ENTRY; ++key.bucket) {
+            if (findMapEntry(gPidTisMapFd, &key, vals.data()) != 0) {
+                if (errno != ENOENT) {
+                    return {};
+                }
+                continue;
+            } else {
+                dataCollected = true;
+            }
+
+            // Combine data by aggregating time-in-state data grouped by CPU cluster aka policy.
+            uint32_t offset = key.bucket * FREQS_PER_ENTRY;
+            uint32_t nextOffset = offset + FREQS_PER_ENTRY;
+            for (uint32_t j = 0; j < gNPolicies; ++j) {
+                if (offset >= gPolicyFreqs[j].size()) continue;
+                auto begin = map[key.aggregation_key][j].begin() + offset;
+                auto end = nextOffset < gPolicyFreqs[j].size() ? begin + FREQS_PER_ENTRY
+                                                               : map[key.aggregation_key][j].end();
+                for (const auto &cpu : gPolicyCpus[j]) {
+                    std::transform(begin, end, std::begin(vals[cpu].ar), begin,
+                                   std::plus<uint64_t>());
+                }
+            }
+        }
+    }
+
+    if (!dataCollected) {
+        // Check if eBPF is supported on this device. If it is, gTisMap should not be empty.
+        time_key_t key;
+        if (getFirstMapKey(gTisMapFd, &key) != 0) {
+            return {};
+        }
+    }
+    return map;
+}
+
 } // namespace bpf
 } // namespace android
diff --git a/libs/cputimeinstate/cputimeinstate.h b/libs/cputimeinstate/cputimeinstate.h
index b7600f5..87a328a 100644
--- a/libs/cputimeinstate/cputimeinstate.h
+++ b/libs/cputimeinstate/cputimeinstate.h
@@ -41,5 +41,10 @@
     getUidsUpdatedConcurrentTimes(uint64_t *lastUpdate);
 bool clearUidTimes(unsigned int uid);
 
+bool startTrackingProcessCpuTimes(pid_t pid);
+bool startAggregatingTaskCpuTimes(pid_t pid, uint16_t aggregationKey);
+std::optional<std::unordered_map<uint16_t, std::vector<std::vector<uint64_t>>>>
+getAggregatedTaskCpuFreqTimes(pid_t pid, const std::vector<uint16_t> &aggregationKeys);
+
 } // namespace bpf
 } // namespace android
diff --git a/libs/cputimeinstate/testtimeinstate.cpp b/libs/cputimeinstate/testtimeinstate.cpp
index 0d5f412..519689b 100644
--- a/libs/cputimeinstate/testtimeinstate.cpp
+++ b/libs/cputimeinstate/testtimeinstate.cpp
@@ -19,6 +19,8 @@
 
 #include <sys/sysinfo.h>
 
+#include <pthread.h>
+#include <semaphore.h>
 #include <numeric>
 #include <unordered_map>
 #include <vector>
@@ -504,5 +506,85 @@
     for (size_t i = 0; i < freqs->size(); ++i) EXPECT_EQ((*freqs)[i].size(), (*times)[i].size());
 }
 
+uint64_t timeNanos() {
+    struct timespec spec;
+    clock_gettime(CLOCK_MONOTONIC, &spec);
+    return spec.tv_sec * 1000000000 + spec.tv_nsec;
+}
+
+// Keeps CPU busy with some number crunching
+void useCpu() {
+    long sum = 0;
+    for (int i = 0; i < 100000; i++) {
+        sum *= i;
+    }
+}
+
+sem_t pingsem, pongsem;
+
+void *testThread(void *) {
+    for (int i = 0; i < 10; i++) {
+        sem_wait(&pingsem);
+        useCpu();
+        sem_post(&pongsem);
+    }
+    return nullptr;
+}
+
+TEST(TimeInStateTest, GetAggregatedTaskCpuFreqTimes) {
+    uint64_t startTimeNs = timeNanos();
+
+    sem_init(&pingsem, 0, 1);
+    sem_init(&pongsem, 0, 0);
+
+    pthread_t thread;
+    ASSERT_EQ(pthread_create(&thread, NULL, &testThread, NULL), 0);
+
+    // This process may have been running for some time, so when we start tracking
+    // CPU time, the very first switch may include the accumulated time.
+    // Yield the remainder of this timeslice to the newly created thread.
+    sem_wait(&pongsem);
+    sem_post(&pingsem);
+
+    pid_t tgid = getpid();
+    startTrackingProcessCpuTimes(tgid);
+
+    pid_t tid = pthread_gettid_np(thread);
+    startAggregatingTaskCpuTimes(tid, 42);
+
+    // Play ping-pong with the other thread to ensure that both threads get
+    // some CPU time.
+    for (int i = 0; i < 9; i++) {
+        sem_wait(&pongsem);
+        useCpu();
+        sem_post(&pingsem);
+    }
+
+    pthread_join(thread, NULL);
+
+    std::optional<std::unordered_map<uint16_t, std::vector<std::vector<uint64_t>>>> optionalMap =
+            getAggregatedTaskCpuFreqTimes(tgid, {0, 42});
+    ASSERT_TRUE(optionalMap);
+
+    std::unordered_map<uint16_t, std::vector<std::vector<uint64_t>>> map = *optionalMap;
+    ASSERT_EQ(map.size(), 2u);
+
+    uint64_t testDurationNs = timeNanos() - startTimeNs;
+    for (auto pair : map) {
+        uint16_t aggregationKey = pair.first;
+        ASSERT_TRUE(aggregationKey == 0 || aggregationKey == 42);
+
+        std::vector<std::vector<uint64_t>> timesInState = pair.second;
+        uint64_t totalCpuTime = 0;
+        for (size_t i = 0; i < timesInState.size(); i++) {
+            for (size_t j = 0; j < timesInState[i].size(); j++) {
+                totalCpuTime += timesInState[i][j];
+            }
+        }
+        ASSERT_GT(totalCpuTime, 0ul);
+        ASSERT_LE(totalCpuTime, testDurationNs);
+    }
+}
+
 } // namespace bpf
 } // namespace android