Upload GPU frequency info via statsd

Bug: b/213577484
Change-Id: I6a1035932db3426e69aad81df8b2d03f6b37c9d6
diff --git a/services/gpuservice/gpuwork/Android.bp b/services/gpuservice/gpuwork/Android.bp
index e6bad47..89b31a6 100644
--- a/services/gpuservice/gpuwork/Android.bp
+++ b/services/gpuservice/gpuwork/Android.bp
@@ -26,10 +26,13 @@
     ],
     shared_libs: [
         "libbase",
+        "libbinder",
         "libbpf_bcc",
         "libbpf_android",
         "libcutils",
         "liblog",
+        "libstatslog",
+        "libstatspull",
         "libutils",
     ],
     export_include_dirs: [
@@ -41,6 +44,7 @@
     export_shared_lib_headers: [
         "libbase",
         "libbpf_android",
+        "libstatspull",
     ],
     cppflags: [
         "-Wall",
diff --git a/services/gpuservice/gpuwork/GpuWork.cpp b/services/gpuservice/gpuwork/GpuWork.cpp
index e2b7d34..e7b1cd4 100644
--- a/services/gpuservice/gpuwork/GpuWork.cpp
+++ b/services/gpuservice/gpuwork/GpuWork.cpp
@@ -21,16 +21,22 @@
 #include "gpuwork/GpuWork.h"
 
 #include <android-base/stringprintf.h>
+#include <binder/PermissionCache.h>
 #include <bpf/WaitForProgsLoaded.h>
 #include <libbpf.h>
 #include <libbpf_android.h>
 #include <log/log.h>
+#include <random>
+#include <stats_event.h>
+#include <statslog.h>
 #include <unistd.h>
 #include <utils/Timers.h>
 #include <utils/Trace.h>
 
+#include <bit>
 #include <chrono>
 #include <cstdint>
+#include <limits>
 #include <map>
 #include <mutex>
 #include <unordered_map>
@@ -58,6 +64,37 @@
     return true;
 }
 
+// Saturating conversion to int32_t. The primary template is deleted so that calls with
+// any source type other than the specializations below fail to compile.
+template <typename SourceType>
+inline int32_t cast_int32(SourceType) = delete;
+
+// Bit-for-bit conversion to int32_t. Deleted by default; see the specialization below.
+template <typename SourceType>
+inline int32_t bitcast_int32(SourceType) = delete;
+
+template <>
+inline int32_t bitcast_int32<uint32_t>(uint32_t source) {
+    int32_t reinterpreted;
+    memcpy(&reinterpreted, &source, sizeof(reinterpreted));
+    return reinterpreted;
+}
+
+// Values above INT32_MAX are clamped to INT32_MAX.
+template <>
+inline int32_t cast_int32<uint64_t>(uint64_t source) {
+    constexpr uint64_t kMax = std::numeric_limits<int32_t>::max();
+    return static_cast<int32_t>(source > kMax ? kMax : source);
+}
+
+// Values outside [INT32_MIN, INT32_MAX] are clamped to the nearest bound.
+template <>
+inline int32_t cast_int32<long long>(long long source) {
+    constexpr long long kMin = std::numeric_limits<int32_t>::min();
+    constexpr long long kMax = std::numeric_limits<int32_t>::max();
+    return static_cast<int32_t>(source < kMin ? kMin : (source > kMax ? kMax : source));
+}
+
 } // namespace
 
 using base::StringAppendF;
@@ -76,6 +113,13 @@
         mMapClearerThread.join();
     }
 
+    {
+        std::scoped_lock<std::mutex> lock(mMutex);
+        if (mStatsdRegistered) {
+            AStatsManager_clearPullAtomCallback(android::util::GPU_FREQ_TIME_IN_STATE_PER_UID);
+        }
+    }
+
     bpf_detach_tracepoint("power", "gpu_work_period");
 }
 
@@ -83,6 +127,8 @@
     // Make sure BPF programs are loaded.
     bpf::waitForProgsLoaded();
 
+    waitForPermissions();
+
     // Get the BPF maps before trying to attach the BPF program; if we can't get
     // the maps then there is no point in attaching the BPF program.
     {
@@ -95,6 +141,8 @@
         if (!getBpfMap("/sys/fs/bpf/map_gpu_work_gpu_work_global_data", &mGpuWorkGlobalDataMap)) {
             return;
         }
+
+        mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
     }
 
     // Attach the tracepoint ONLY if we got the map above.
@@ -108,6 +156,13 @@
 
     mMapClearerThread.swap(thread);
 
+    {
+        std::lock_guard<std::mutex> lock(mMutex);
+        AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_FREQ_TIME_IN_STATE_PER_UID},
+                                          nullptr, GpuWork::pullAtomCallback, this);
+        mStatsdRegistered = true;
+    }
+
     ALOGI("Initialized!");
 
     mInitialized.store(true);
@@ -215,6 +270,127 @@
     return true;
 }
 
+// statsd pull callback; |cookie| is the GpuWork given to AStatsManager_setPullAtomCallback.
+AStatsManager_PullAtomCallbackReturn GpuWork::pullAtomCallback(int32_t atomTag,
+                                                               AStatsEventList* data,
+                                                               void* cookie) {
+    ATRACE_CALL();
+    // Skip, rather than dereference, a null cookie; also skip atoms we did not register for.
+    auto* gpuWork = static_cast<GpuWork*>(cookie);
+    if (gpuWork == nullptr || atomTag != android::util::GPU_FREQ_TIME_IN_STATE_PER_UID) {
+        return AStatsManager_PULL_SKIP;
+    }
+    return gpuWork->pullFrequencyAtoms(data);
+}
+
+// Pulls GPU_FREQ_TIME_IN_STATE_PER_UID atoms: snapshots |mGpuWorkMap|, appends stats for at
+// most |kNumSampledUids| randomly chosen UIDs to |data|, then clears the map. Returns
+// AStatsManager_PULL_SKIP if |data| is null or the BPF state is not initialized/valid.
+AStatsManager_PullAtomCallbackReturn GpuWork::pullFrequencyAtoms(AStatsEventList* data) {
+    ATRACE_CALL();
+
+    if (!data || !mInitialized.load()) {
+        return AStatsManager_PULL_SKIP;
+    }
+
+    std::lock_guard<std::mutex> lock(mMutex);
+
+    if (!mGpuWorkMap.isValid()) {
+        return AStatsManager_PULL_SKIP;
+    }
+
+    std::unordered_map<Uid, UidTrackingInfo> uidInfos;
+
+    // Iteration of BPF hash maps can be unreliable (no data races, but elements may be repeated),
+    // as the map is typically being modified by other threads. The buckets are all preallocated.
+    // Our eBPF program only updates entries (in-place) or adds entries. |GpuWork| only iterates
+    // or clears the map while holding |mMutex|. Given this, we should be able to iterate over all
+    // elements reliably. In the worst case, we might see elements more than once.
+
+    // Note that userspace reads of BPF maps make a copy of the value, and thus the returned value
+    // is not being concurrently accessed by the BPF program (no atomic reads needed below).
+
+    const auto iterateResult = mGpuWorkMap.iterateWithValue(
+            [&uidInfos](const Uid& key, const UidTrackingInfo& value,
+                        const android::bpf::BpfMap<Uid, UidTrackingInfo>&) -> base::Result<void> {
+                uidInfos[key] = value;
+                return {};
+            });
+    if (!iterateResult.ok()) {
+        // Best effort: still report whatever was read rather than dropping the whole pull.
+        ALOGW("pullFrequencyAtoms: failed to iterate the GPU work map");
+    }
+
+    ALOGI("pullFrequencyAtoms: uidInfos.size() == %zu", uidInfos.size());
+
+    // Get a list of just the UIDs; the order does not matter.
+    std::vector<Uid> uids;
+    uids.reserve(uidInfos.size());
+    for (const auto& pair : uidInfos) {
+        uids.push_back(pair.first);
+    }
+
+    // If we have more than |kNumSampledUids| UIDs, choose |kNumSampledUids| random UIDs and swap
+    // them to the front, then drop the rest. For indices 0..i..n-1 the inclusive ranges are:
+    // - [0, i-1] == the randomly chosen elements.
+    // - [i, n-1] == the remaining unchosen elements.
+    constexpr size_t kSampleCount = kNumSampledUids;
+    if (uids.size() > kSampleCount) {
+        // Only seed a random engine when sampling is actually needed.
+        std::default_random_engine random_engine(std::random_device{}());
+        for (size_t i = 0; i < kSampleCount; ++i) {
+            std::uniform_int_distribution<size_t> uniform_dist(i, uids.size() - 1);
+            std::swap(uids[i], uids[uniform_dist(random_engine)]);
+        }
+        uids.resize(kSampleCount);
+    }
+
+    ALOGI("pullFrequencyAtoms: uids.size() == %zu", uids.size());
+
+    // Seconds the stats were collected for, i.e. time since |mPreviousMapClearTimePoint|.
+    const int32_t duration = cast_int32(std::chrono::duration_cast<std::chrono::seconds>(
+            std::chrono::steady_clock::now() - mPreviousMapClearTimePoint).count());
+
+    for (const Uid uid : uids) {
+        const UidTrackingInfo& info = uidInfos[uid];
+        ALOGI("pullFrequencyAtoms: adding stats for UID %" PRIu32, uid);
+        android::util::addAStatsEvent(data, int32_t{android::util::GPU_FREQ_TIME_IN_STATE_PER_UID},
+                                      // uid
+                                      bitcast_int32(uid),
+                                      // time_duration_seconds
+                                      int32_t{duration},
+                                      // max_freq_mhz
+                                      int32_t{1000},
+                                      // freq_0_mhz_time_millis
+                                      cast_int32(info.frequency_times_ns[0] / 1000000),
+                                      // freq_50_mhz_time_millis
+                                      cast_int32(info.frequency_times_ns[1] / 1000000),
+                                      // ... etc. ...
+                                      cast_int32(info.frequency_times_ns[2] / 1000000),
+                                      cast_int32(info.frequency_times_ns[3] / 1000000),
+                                      cast_int32(info.frequency_times_ns[4] / 1000000),
+                                      cast_int32(info.frequency_times_ns[5] / 1000000),
+                                      cast_int32(info.frequency_times_ns[6] / 1000000),
+                                      cast_int32(info.frequency_times_ns[7] / 1000000),
+                                      cast_int32(info.frequency_times_ns[8] / 1000000),
+                                      cast_int32(info.frequency_times_ns[9] / 1000000),
+                                      cast_int32(info.frequency_times_ns[10] / 1000000),
+                                      cast_int32(info.frequency_times_ns[11] / 1000000),
+                                      cast_int32(info.frequency_times_ns[12] / 1000000),
+                                      cast_int32(info.frequency_times_ns[13] / 1000000),
+                                      cast_int32(info.frequency_times_ns[14] / 1000000),
+                                      cast_int32(info.frequency_times_ns[15] / 1000000),
+                                      cast_int32(info.frequency_times_ns[16] / 1000000),
+                                      cast_int32(info.frequency_times_ns[17] / 1000000),
+                                      cast_int32(info.frequency_times_ns[18] / 1000000),
+                                      cast_int32(info.frequency_times_ns[19] / 1000000),
+                                      // freq_1000_mhz_time_millis
+                                      cast_int32(info.frequency_times_ns[20] / 1000000));
+    }
+    clearMap();  // Starts a new collection window (resets |mPreviousMapClearTimePoint|).
+    return AStatsManager_PULL_SUCCESS;
+}
+
 void GpuWork::periodicallyClearMap() {
     std::unique_lock<std::mutex> lock(mMutex);
 
@@ -264,6 +440,21 @@
         return;
     }
 
+    clearMap();
+}
+
+void GpuWork::clearMap() {
+    if (!mInitialized.load() || !mGpuWorkMap.isValid() || !mGpuWorkGlobalDataMap.isValid()) {
+        ALOGW("Map clearing could not occur because we are not initialized properly");
+        return;
+    }
+
+    base::Result<GlobalData> globalData = mGpuWorkGlobalDataMap.readValue(0);
+    if (!globalData.ok()) {
+        ALOGW("Could not read BPF global data map entry");
+        return;
+    }
+
     // Iterating BPF maps to delete keys is tricky. If we just repeatedly call
     // |getFirstKey()| and delete that, we may loop forever (or for a long time)
     // because our BPF program might be repeatedly re-adding UID keys. Also,
@@ -290,6 +481,23 @@
     // |writeValue|.
     globalData.value().num_map_entries = 0;
     mGpuWorkGlobalDataMap.writeValue(0, globalData.value(), BPF_ANY);
+
+    // Update |mPreviousMapClearTimePoint| so we know when we started collecting
+    // the stats.
+    mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
+}
+
+// Polls every second until REGISTER_STATS_PULL_ATOM is granted; warns and returns on timeout.
+void GpuWork::waitForPermissions() {
+    const String16 permission(kPermissionRegisterStatsPullAtom);
+    // NOTE(review): PermissionCache may always allow our own pid; confirm this really waits.
+    for (int failures = 0; !PermissionCache::checkPermission(permission, getpid(), getuid());) {
+        if (++failures > kPermissionsWaitTimeoutSeconds) {
+            ALOGW("Timed out waiting for android.permission.REGISTER_STATS_PULL_ATOM");
+            return;
+        }
+        sleep(1);  // Not granted yet; retry.
+    }
+}
 
 } // namespace gpuwork
diff --git a/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h b/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
index 0ef10d0..b6f493d 100644
--- a/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
+++ b/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <bpf/BpfMap.h>
+#include <stats_pull_atom_callback.h>
 #include <utils/Mutex.h>
 #include <utils/String16.h>
 #include <utils/Vector.h>
@@ -49,6 +50,13 @@
     static bool attachTracepoint(const char* program_path, const char* tracepoint_group,
                                  const char* tracepoint_name);
 
+    // Native atom puller callback registered in statsd.
+    static AStatsManager_PullAtomCallbackReturn pullAtomCallback(int32_t atomTag,
+                                                                 AStatsEventList* data,
+                                                                 void* cookie);
+
+    AStatsManager_PullAtomCallbackReturn pullFrequencyAtoms(AStatsEventList* data);
+
     // Periodically calls |clearMapIfNeeded| to clear the |mGpuWorkMap| map, if
     // needed.
     //
@@ -61,6 +69,14 @@
     // it.
     void clearMapIfNeeded() REQUIRES(mMutex);
 
+    // Clears the |mGpuWorkMap| map.
+    void clearMap() REQUIRES(mMutex);
+
+    // Waits for required permissions to become set. This seems to be needed
+    // because platform service permissions might not be set when a service
+    // first starts. See b/214085769.
+    void waitForPermissions();
+
     // Indicates whether our eBPF components have been initialized.
     std::atomic<bool> mInitialized = false;
 
@@ -89,6 +105,22 @@
     // The wait duration for the map clearer thread; the thread checks the map
     // every ~1 hour.
     static constexpr uint32_t kMapClearerWaitDurationSeconds = 60 * 60;
+
+    // Whether our |pullAtomCallback| function is registered.
+    bool mStatsdRegistered GUARDED_BY(mMutex) = false;
+
+    // The number of randomly chosen (i.e. sampled) UIDs to log stats for.
+    static constexpr int kNumSampledUids = 10;
+
+    // The previous time point at which |mGpuWorkMap| was cleared.
+    std::chrono::steady_clock::time_point mPreviousMapClearTimePoint GUARDED_BY(mMutex);
+
+    // Permission to register a statsd puller.
+    static constexpr char16_t kPermissionRegisterStatsPullAtom[] =
+            u"android.permission.REGISTER_STATS_PULL_ATOM";
+
+    // Time limit for waiting for permissions.
+    static constexpr int kPermissionsWaitTimeoutSeconds = 30;
 };
 
 } // namespace gpuwork