GpuWork: fix gpu_work_period tracepoint

The tracepoint design was too fine-grained, too complex, and was missing
a gpu_id field.

- Fix the gpu_work_period tracepoint struct (a driver-side sketch follows
  the list below):

  - Make it more coarse-grained.

  - Add a gpu_id field.

  - Remove the frequency and other fields.

- Update the documentation comment, adding more detail.

- Update the eBPF program and map structs to account for the tracepoint
  changes.

  - Aggregate the `total_active_duration_ns` and
    `total_inactive_duration_ns`.

  - Collect data for each (gpu_id, uid) pair.

- Update GpuWork class to account for the tracepoint changes.

  - Remove references to "frequency"; we now collect "gpu work".

  - Rework dumping code; dump active and inactive time.

  - Rework stats upload:

    - Log active and inactive time.

    - Log per (gpu_id, uid) pair.

    - Reduce the amount logged if we observe a large number of GPUs.

    - Remove UIDs with a very low GPU time.
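
For reference, here is a hypothetical sketch of a matching driver-side
tracepoint definition, using the kernel's TRACE_EVENT macro (shorthand for
the DECLARE_EVENT_CLASS/DEFINE_EVENT macros mentioned in the documentation
comment). The field names and types mirror the GpuWorkPeriodEvent struct in
gpu_work.c; the surrounding trace header boilerplate (TRACE_SYSTEM, include
guards) is assumed, and the exact kernel-side definition is out of scope for
this change:

  TRACE_EVENT(gpu_work_period,

      TP_PROTO(u32 gpu_id, u32 uid, u64 start_time_ns, u64 end_time_ns,
               u64 total_active_duration_ns),

      TP_ARGS(gpu_id, uid, start_time_ns, end_time_ns,
              total_active_duration_ns),

      /* Field order matches the offsets asserted in gpu_work.c. */
      TP_STRUCT__entry(
          __field(u32, gpu_id)
          __field(u32, uid)
          __field(u64, start_time_ns)
          __field(u64, end_time_ns)
          __field(u64, total_active_duration_ns)
      ),

      TP_fast_assign(
          __entry->gpu_id = gpu_id;
          __entry->uid = uid;
          __entry->start_time_ns = start_time_ns;
          __entry->end_time_ns = end_time_ns;
          __entry->total_active_duration_ns = total_active_duration_ns;
      ),

      TP_printk("gpu_id=%u uid=%u start_time_ns=%llu end_time_ns=%llu "
                "total_active_duration_ns=%llu",
                __entry->gpu_id, __entry->uid,
                __entry->start_time_ns, __entry->end_time_ns,
                __entry->total_active_duration_ns)
  );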

Bug: b/221096545
Change-Id: I33d1c55f886913dad7b563e7b9a25cc1af0308f2
diff --git a/services/gpuservice/gpuwork/GpuWork.cpp b/services/gpuservice/gpuwork/GpuWork.cpp
index 67ce9f2..974ae38 100644
--- a/services/gpuservice/gpuwork/GpuWork.cpp
+++ b/services/gpuservice/gpuwork/GpuWork.cpp
@@ -39,17 +39,30 @@
 #include <map>
 #include <mutex>
+#include <set>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "gpuwork/gpu_work.h"
 
-#define MS_IN_NS (1000000)
+#define ONE_MS_IN_NS (1000000)
 
 namespace android {
 namespace gpuwork {
 
 namespace {
 
+bool lessThanGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
+    return std::tie(l.gpu_id, l.uid) < std::tie(r.gpu_id, r.uid);
+}
+
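+// A simple shift-and-add hash combiner for |GpuIdUid|; collisions are
+// tolerable here, as they only affect bucket placement in the unordered_map
+// used when pulling atoms.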
+size_t hashGpuIdUid(const android::gpuwork::GpuIdUid& gpuIdUid) {
+    return static_cast<size_t>((gpuIdUid.gpu_id << 5U) + gpuIdUid.uid);
+}
+
+bool equalGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
+    return std::tie(l.gpu_id, l.uid) == std::tie(r.gpu_id, r.uid);
+}
+
 // Gets a BPF map from |mapPath|.
 template <class Key, class Value>
 bool getBpfMap(const char* mapPath, bpf::BpfMap<Key, Value>* out) {
@@ -76,24 +89,6 @@
     return result;
 }
 
-template <>
-inline int32_t cast_int32<uint64_t>(uint64_t source) {
-    if (source > std::numeric_limits<int32_t>::max()) {
-        return std::numeric_limits<int32_t>::max();
-    }
-    return static_cast<int32_t>(source);
-}
-
-template <>
-inline int32_t cast_int32<long long>(long long source) {
-    if (source > std::numeric_limits<int32_t>::max()) {
-        return std::numeric_limits<int32_t>::max();
-    } else if (source < std::numeric_limits<int32_t>::min()) {
-        return std::numeric_limits<int32_t>::min();
-    }
-    return static_cast<int32_t>(source);
-}
-
 } // namespace
 
 using base::StringAppendF;
@@ -115,7 +110,7 @@
     {
         std::scoped_lock<std::mutex> lock(mMutex);
         if (mStatsdRegistered) {
-            AStatsManager_clearPullAtomCallback(android::util::GPU_FREQ_TIME_IN_STATE_PER_UID);
+            AStatsManager_clearPullAtomCallback(android::util::GPU_WORK_PER_UID);
         }
     }
 
@@ -144,7 +139,7 @@
         mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
     }
 
-    // Attach the tracepoint ONLY if we got the map above.
+    // Attach the tracepoint.
     if (!attachTracepoint("/sys/fs/bpf/prog_gpu_work_tracepoint_power_gpu_work_period", "power",
                           "gpu_work_period")) {
         return;
@@ -157,8 +152,8 @@
 
     {
         std::lock_guard<std::mutex> lock(mMutex);
-        AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_FREQ_TIME_IN_STATE_PER_UID},
-                                          nullptr, GpuWork::pullAtomCallback, this);
+        AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_WORK_PER_UID}, nullptr,
+                                          GpuWork::pullAtomCallback, this);
         mStatsdRegistered = true;
     }
 
@@ -169,18 +164,18 @@
 
 void GpuWork::dump(const Vector<String16>& /* args */, std::string* result) {
     if (!mInitialized.load()) {
-        result->append("GPU time in state information is not available.\n");
+        result->append("GPU work information is not available.\n");
         return;
     }
 
-    // Ordered map ensures output data is sorted by UID.
-    std::map<Uid, UidTrackingInfo> dumpMap;
+    // Ordered map ensures output data is sorted.
+    std::map<GpuIdUid, UidTrackingInfo, decltype(lessThanGpuIdUid)*> dumpMap(&lessThanGpuIdUid);
 
     {
         std::lock_guard<std::mutex> lock(mMutex);
 
         if (!mGpuWorkMap.isValid()) {
-            result->append("GPU time in state map is not available.\n");
+            result->append("GPU work map is not available.\n");
             return;
         }
 
@@ -189,56 +184,42 @@
         // threads. The buckets are all preallocated. Our eBPF program only updates
         // entries (in-place) or adds entries. |GpuWork| only iterates or clears the
         // map while holding |mMutex|. Given this, we should be able to iterate over
-        // all elements reliably. In the worst case, we might see elements more than
-        // once.
+        // all elements reliably. Nevertheless, we copy into a map to avoid
+        // duplicates.
 
         // Note that userspace reads of BPF maps make a copy of the value, and
         // thus the returned value is not being concurrently accessed by the BPF
         // program (no atomic reads needed below).
 
-        mGpuWorkMap.iterateWithValue([&dumpMap](const Uid& key, const UidTrackingInfo& value,
-                                                const android::bpf::BpfMap<Uid, UidTrackingInfo>&)
-                                             -> base::Result<void> {
-            dumpMap[key] = value;
-            return {};
-        });
+        mGpuWorkMap.iterateWithValue(
+                [&dumpMap](const GpuIdUid& key, const UidTrackingInfo& value,
+                           const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
+                        -> base::Result<void> {
+                    dumpMap[key] = value;
+                    return {};
+                });
     }
 
-    // Find the largest frequency where some UID has spent time in that frequency.
-    size_t largestFrequencyWithTime = 0;
-    for (const auto& uidToUidInfo : dumpMap) {
-        for (size_t i = largestFrequencyWithTime + 1; i < kNumTrackedFrequencies; ++i) {
-            if (uidToUidInfo.second.frequency_times_ns[i] > 0) {
-                largestFrequencyWithTime = i;
-            }
-        }
-    }
-
-    // Dump time in state information.
+    // Dump work information.
     // E.g.
-    // uid/freq: 0MHz 50MHz 100MHz ...
-    // 1000: 0 0 0 0 ...
-    // 1003: 0 0 3456 0 ...
-    // [errors:3]1006: 0 0 3456 0 ...
+    // GPU work information.
+    // gpu_id uid total_active_duration_ns total_inactive_duration_ns
+    // 0 1000 0 0
+    // 0 1003 1234 123
+    // [errors:3]0 1006 4567 456
 
     // Header.
-    result->append("GPU time in frequency state in ms.\n");
-    result->append("uid/freq: 0MHz");
-    for (size_t i = 1; i <= largestFrequencyWithTime; ++i) {
-        StringAppendF(result, " %zuMHz", i * 50);
-    }
-    result->append("\n");
+    result->append("GPU work information.\ngpu_id uid total_active_duration_ns "
+                   "total_inactive_duration_ns\n");
 
-    for (const auto& uidToUidInfo : dumpMap) {
-        if (uidToUidInfo.second.error_count) {
-            StringAppendF(result, "[errors:%" PRIu32 "]", uidToUidInfo.second.error_count);
+    for (const auto& idToUidInfo : dumpMap) {
+        if (idToUidInfo.second.error_count) {
+            StringAppendF(result, "[errors:%" PRIu32 "]", idToUidInfo.second.error_count);
         }
-        StringAppendF(result, "%" PRIu32 ":", uidToUidInfo.first);
-        for (size_t i = 0; i <= largestFrequencyWithTime; ++i) {
-            StringAppendF(result, " %" PRIu64,
-                          uidToUidInfo.second.frequency_times_ns[i] / MS_IN_NS);
-        }
-        result->append("\n");
+        StringAppendF(result, "%" PRIu32 " %" PRIu32 " %" PRIu64 " %" PRIu64 "\n",
+                      idToUidInfo.first.gpu_id, idToUidInfo.first.uid,
+                      idToUidInfo.second.total_active_duration_ns,
+                      idToUidInfo.second.total_inactive_duration_ns);
     }
 }
 
@@ -275,14 +256,14 @@
     ATRACE_CALL();
 
     GpuWork* gpuWork = reinterpret_cast<GpuWork*>(cookie);
-    if (atomTag == android::util::GPU_FREQ_TIME_IN_STATE_PER_UID) {
-        return gpuWork->pullFrequencyAtoms(data);
+    if (atomTag == android::util::GPU_WORK_PER_UID) {
+        return gpuWork->pullWorkAtoms(data);
     }
 
     return AStatsManager_PULL_SKIP;
 }
 
-AStatsManager_PullAtomCallbackReturn GpuWork::pullFrequencyAtoms(AStatsEventList* data) {
+AStatsManager_PullAtomCallbackReturn GpuWork::pullWorkAtoms(AStatsEventList* data) {
     ATRACE_CALL();
 
     if (!data || !mInitialized.load()) {
@@ -295,96 +276,153 @@
         return AStatsManager_PULL_SKIP;
     }
 
-    std::unordered_map<Uid, UidTrackingInfo> uidInfos;
+    std::unordered_map<GpuIdUid, UidTrackingInfo, decltype(hashGpuIdUid)*, decltype(equalGpuIdUid)*>
+            workMap(32, &hashGpuIdUid, &equalGpuIdUid);
 
     // Iteration of BPF hash maps can be unreliable (no data races, but elements
     // may be repeated), as the map is typically being modified by other
     // threads. The buckets are all preallocated. Our eBPF program only updates
     // entries (in-place) or adds entries. |GpuWork| only iterates or clears the
     // map while holding |mMutex|. Given this, we should be able to iterate over
-    // all elements reliably. In the worst case, we might see elements more than
-    // once.
+    // all elements reliably. Nevertheless, we copy into a map to avoid
+    // duplicates.
 
     // Note that userspace reads of BPF maps make a copy of the value, and thus
     // the returned value is not being concurrently accessed by the BPF program
     // (no atomic reads needed below).
 
-    mGpuWorkMap.iterateWithValue(
-            [&uidInfos](const Uid& key, const UidTrackingInfo& value,
-                        const android::bpf::BpfMap<Uid, UidTrackingInfo>&) -> base::Result<void> {
-                uidInfos[key] = value;
-                return {};
-            });
-
-    ALOGI("pullFrequencyAtoms: uidInfos.size() == %zu", uidInfos.size());
+    mGpuWorkMap.iterateWithValue([&workMap](const GpuIdUid& key, const UidTrackingInfo& value,
+                                            const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
+                                         -> base::Result<void> {
+        workMap[key] = value;
+        return {};
+    });
 
     // Get a list of just the UIDs; the order does not matter.
     std::vector<Uid> uids;
-    for (const auto& pair : uidInfos) {
-        uids.push_back(pair.first);
+    // Get a list of the GPU IDs, in order.
+    std::set<uint32_t> gpuIds;
+    {
+        // To avoid adding duplicate UIDs.
+        std::unordered_set<Uid> addedUids;
+
+        for (const auto& workInfo : workMap) {
+            if (addedUids.insert(workInfo.first.uid).second) {
+                // Insertion was successful.
+                uids.push_back(workInfo.first.uid);
+            }
+            gpuIds.insert(workInfo.first.gpu_id);
+        }
     }
 
+    ALOGI("pullWorkAtoms: uids.size() == %zu", uids.size());
+    ALOGI("pullWorkAtoms: gpuIds.size() == %zu", gpuIds.size());
+
+    if (gpuIds.size() > kNumGpusHardLimit) {
+        // If we observe a very high number of GPUs then something has probably
+        // gone wrong, so don't log any atoms.
+        return AStatsManager_PULL_SKIP;
+    }
+
+    size_t numSampledUids = kNumSampledUids;
+
+    if (gpuIds.size() > kNumGpusSoftLimit) {
+        // If we observe a high number of GPUs then we just sample 1 UID.
+        numSampledUids = 1;
+    }
+
+    // Remove all UIDs that do not have at least |kMinGpuTimeNanoseconds| of
+    // GPU time on at least one GPU.
+    {
+        auto uidIt = uids.begin();
+        while (uidIt != uids.end()) {
+            bool hasEnoughGpuTime = false;
+            for (uint32_t gpuId : gpuIds) {
+                auto infoIt = workMap.find(GpuIdUid{gpuId, *uidIt});
+                if (infoIt == workMap.end()) {
+                    continue;
+                }
+                if (infoIt->second.total_active_duration_ns +
+                            infoIt->second.total_inactive_duration_ns >=
+                    kMinGpuTimeNanoseconds) {
+                    hasEnoughGpuTime = true;
+                    break;
+                }
+            }
+            if (hasEnoughGpuTime) {
+                ++uidIt;
+            } else {
+                uidIt = uids.erase(uidIt);
+            }
+        }
+    }
+
+    ALOGI("pullWorkAtoms: after removing uids with very low GPU time: uids.size() == %zu",
+          uids.size());
+
     std::random_device device;
     std::default_random_engine random_engine(device());
 
-    // If we have more than |kNumSampledUids| UIDs, choose |kNumSampledUids|
+    // If we have more than |numSampledUids| UIDs, choose |numSampledUids|
     // random UIDs. We swap them to the front of the list. Given the list
     // indices 0..i..n-1, we have the following inclusive-inclusive ranges:
     // - [0, i-1] == the randomly chosen elements.
     // - [i, n-1] == the remaining unchosen elements.
-    if (uids.size() > kNumSampledUids) {
-        for (size_t i = 0; i < kNumSampledUids; ++i) {
+    if (uids.size() > numSampledUids) {
+        for (size_t i = 0; i < numSampledUids; ++i) {
             std::uniform_int_distribution<size_t> uniform_dist(i, uids.size() - 1);
             size_t random_index = uniform_dist(random_engine);
             std::swap(uids[i], uids[random_index]);
         }
-        // Only keep the front |kNumSampledUids| elements.
-        uids.resize(kNumSampledUids);
+        // Only keep the front |numSampledUids| elements.
+        uids.resize(numSampledUids);
     }
 
-    ALOGI("pullFrequencyAtoms: uids.size() == %zu", uids.size());
+    ALOGI("pullWorkAtoms: after random selection: uids.size() == %zu", uids.size());
 
     auto now = std::chrono::steady_clock::now();
-
-    int32_t duration = cast_int32(
+    long long duration =
             std::chrono::duration_cast<std::chrono::seconds>(now - mPreviousMapClearTimePoint)
-                    .count());
+                    .count();
+    if (duration > std::numeric_limits<int32_t>::max() || duration < 0) {
+        // This is essentially impossible. If it does somehow happen, give up,
+        // but still clear the map.
+        clearMap();
+        return AStatsManager_PULL_SKIP;
+    }
 
-    for (const Uid uid : uids) {
-        const UidTrackingInfo& info = uidInfos[uid];
-        ALOGI("pullFrequencyAtoms: adding stats for UID %" PRIu32, uid);
-        android::util::addAStatsEvent(data, int32_t{android::util::GPU_FREQ_TIME_IN_STATE_PER_UID},
-                                      // uid
-                                      bitcast_int32(uid),
-                                      // time_duration_seconds
-                                      int32_t{duration},
-                                      // max_freq_mhz
-                                      int32_t{1000},
-                                      // freq_0_mhz_time_millis
-                                      cast_int32(info.frequency_times_ns[0] / 1000000),
-                                      // freq_50_mhz_time_millis
-                                      cast_int32(info.frequency_times_ns[1] / 1000000),
-                                      // ... etc. ...
-                                      cast_int32(info.frequency_times_ns[2] / 1000000),
-                                      cast_int32(info.frequency_times_ns[3] / 1000000),
-                                      cast_int32(info.frequency_times_ns[4] / 1000000),
-                                      cast_int32(info.frequency_times_ns[5] / 1000000),
-                                      cast_int32(info.frequency_times_ns[6] / 1000000),
-                                      cast_int32(info.frequency_times_ns[7] / 1000000),
-                                      cast_int32(info.frequency_times_ns[8] / 1000000),
-                                      cast_int32(info.frequency_times_ns[9] / 1000000),
-                                      cast_int32(info.frequency_times_ns[10] / 1000000),
-                                      cast_int32(info.frequency_times_ns[11] / 1000000),
-                                      cast_int32(info.frequency_times_ns[12] / 1000000),
-                                      cast_int32(info.frequency_times_ns[13] / 1000000),
-                                      cast_int32(info.frequency_times_ns[14] / 1000000),
-                                      cast_int32(info.frequency_times_ns[15] / 1000000),
-                                      cast_int32(info.frequency_times_ns[16] / 1000000),
-                                      cast_int32(info.frequency_times_ns[17] / 1000000),
-                                      cast_int32(info.frequency_times_ns[18] / 1000000),
-                                      cast_int32(info.frequency_times_ns[19] / 1000000),
-                                      // freq_1000_mhz_time_millis
-                                      cast_int32(info.frequency_times_ns[20] / 1000000));
+    // Log an atom for each (gpu id, uid) pair for which we have data.
+    for (uint32_t gpuId : gpuIds) {
+        for (Uid uid : uids) {
+            auto it = workMap.find(GpuIdUid{gpuId, uid});
+            if (it == workMap.end()) {
+                continue;
+            }
+            const UidTrackingInfo& info = it->second;
+
+            uint64_t total_active_duration_ms = info.total_active_duration_ns / ONE_MS_IN_NS;
+            uint64_t total_inactive_duration_ms = info.total_inactive_duration_ns / ONE_MS_IN_NS;
+
+            // Skip this atom if any numbers are out of range. |duration| is
+            // already checked above.
+            if (total_active_duration_ms > std::numeric_limits<int32_t>::max() ||
+                total_inactive_duration_ms > std::numeric_limits<int32_t>::max()) {
+                continue;
+            }
+
+            ALOGI("pullWorkAtoms: adding stats for GPU ID %" PRIu32 "; UID %" PRIu32, gpuId, uid);
+            android::util::addAStatsEvent(data, int32_t{android::util::GPU_WORK_PER_UID},
+                                          // uid
+                                          bitcast_int32(uid),
+                                          // gpu_id
+                                          bitcast_int32(gpuId),
+                                          // time_duration_seconds
+                                          static_cast<int32_t>(duration),
+                                          // total_active_duration_millis
+                                          static_cast<int32_t>(total_active_duration_ms),
+                                          // total_inactive_duration_millis
+                                          static_cast<int32_t>(total_inactive_duration_ms));
+        }
     }
     clearMap();
     return AStatsManager_PULL_SUCCESS;
@@ -435,7 +473,7 @@
     uint64_t numEntries = globalData.value().num_map_entries;
 
     // If the map is <=75% full, we do nothing.
-    if (numEntries <= (kMaxTrackedUids / 4) * 3) {
+    if (numEntries <= (kMaxTrackedGpuIdUids / 4) * 3) {
         return;
     }
 
@@ -456,22 +494,22 @@
 
     // Iterating BPF maps to delete keys is tricky. If we just repeatedly call
     // |getFirstKey()| and delete that, we may loop forever (or for a long time)
-    // because our BPF program might be repeatedly re-adding UID keys. Also,
-    // even if we limit the number of elements we try to delete, we might only
-    // delete new entries, leaving old entries in the map. If we delete a key A
-    // and then call |getNextKey(A)|, the first key in the map is returned, so
-    // we have the same issue.
+    // because our BPF program might be repeatedly re-adding keys. Also, even if
+    // we limit the number of elements we try to delete, we might only delete
+    // new entries, leaving old entries in the map. If we delete a key A and
+    // then call |getNextKey(A)|, the first key in the map is returned, so we
+    // have the same issue.
     //
     // Thus, we instead get the next key and then delete the previous key. We
     // also limit the number of deletions we try, just in case.
 
-    base::Result<Uid> key = mGpuWorkMap.getFirstKey();
+    base::Result<GpuIdUid> key = mGpuWorkMap.getFirstKey();
 
-    for (size_t i = 0; i < kMaxTrackedUids; ++i) {
+    for (size_t i = 0; i < kMaxTrackedGpuIdUids; ++i) {
         if (!key.ok()) {
             break;
         }
-        base::Result<Uid> previousKey = key;
+        base::Result<GpuIdUid> previousKey = key;
         key = mGpuWorkMap.getNextKey(previousKey.value());
         mGpuWorkMap.deleteValue(previousKey.value());
     }
diff --git a/services/gpuservice/gpuwork/bpfprogs/gpu_work.c b/services/gpuservice/gpuwork/bpfprogs/gpu_work.c
index a0e1b22..c8e79a2 100644
--- a/services/gpuservice/gpuwork/bpfprogs/gpu_work.c
+++ b/services/gpuservice/gpuwork/bpfprogs/gpu_work.c
@@ -27,66 +27,124 @@
 #endif
 
 #define S_IN_NS (1000000000)
-#define MHZ_IN_KHZS (1000)
+#define SMALL_TIME_GAP_LIMIT_NS (S_IN_NS)
 
-typedef uint32_t Uid;
-
-// A map from UID to |UidTrackingInfo|.
-DEFINE_BPF_MAP_GRW(gpu_work_map, HASH, Uid, UidTrackingInfo, kMaxTrackedUids, AID_GRAPHICS);
+// A map from GpuIdUid (GPU ID and application UID) to |UidTrackingInfo|.
+DEFINE_BPF_MAP_GRW(gpu_work_map, HASH, GpuIdUid, UidTrackingInfo, kMaxTrackedGpuIdUids,
+                   AID_GRAPHICS);
 
 // A map containing a single entry of |GlobalData|.
 DEFINE_BPF_MAP_GRW(gpu_work_global_data, ARRAY, uint32_t, GlobalData, 1, AID_GRAPHICS);
 
-// GpuUidWorkPeriodEvent defines the structure of a kernel tracepoint under the
-// tracepoint system (also referred to as the group) "power" and name
-// "gpu_work_period". In summary, the kernel tracepoint should be
-// "power/gpu_work_period", available at:
+// Defines the structure of the kernel tracepoint:
 //
 //  /sys/kernel/tracing/events/power/gpu_work_period/
 //
-// GpuUidWorkPeriodEvent defines a non-overlapping, non-zero period of time when
-// work was running on the GPU for a given application (identified by its UID;
-// the persistent, unique ID of the application) from |start_time_ns| to
-// |end_time_ns|. Drivers should issue this tracepoint as soon as possible
-// (within 1 second) after |end_time_ns|. For a given UID, periods must not
-// overlap, but periods from different UIDs can overlap (and should overlap, if
-// and only if that is how the work was executed). The period includes
-// information such as |frequency_khz|, the frequency that the GPU was running
-// at during the period, and |includes_compute_work|, whether the work included
-// compute shader work (not just graphics work). GPUs may have multiple
-// frequencies that can be adjusted, but the driver should report the frequency
-// that most closely estimates power usage (e.g. the frequency of shader cores,
-// not a scheduling unit).
+// Drivers must define an appropriate gpu_work_period kernel tracepoint (for
+// example, using the DECLARE_EVENT_CLASS and DEFINE_EVENT macros) such that the
+// arguments/fields match the fields of |GpuWorkPeriodEvent|, excluding the
+// initial "common" field. Drivers must invoke the tracepoint (also referred to
+// as emitting the event) as described below. Note that the description below
+// assumes a single physical GPU and its driver; for devices with multiple GPUs,
+// each GPU and its driver should emit events independently, using a different
+// value for |gpu_id| per GPU.
 //
-// If any information changes while work from the UID is running on the GPU
-// (e.g. the GPU frequency changes, or the work starts/stops including compute
-// work) then the driver must conceptually end the period, issue the tracepoint,
-// start tracking a new period, and eventually issue a second tracepoint when
-// the work completes or when the information changes again. In this case, the
-// |end_time_ns| of the first period must equal the |start_time_ns| of the
-// second period. The driver may also end and start a new period (without a
-// gap), even if no information changes. For example, this might be convenient
-// if there is a collection of work from a UID running on the GPU for a long
-// time; ending and starting a period as individual parts of the work complete
-// allows the consumer of the tracepoint to be updated about the ongoing work.
+// |GpuWorkPeriodEvent| defines a non-overlapping, non-zero period of time from
+// |start_time_ns| (inclusive) until |end_time_ns| (exclusive) for a given
+// |uid|, and includes details of how much work the GPU was performing for |uid|
+// during the period. When GPU work for a given |uid| runs on the GPU, the
+// driver must track one or more periods that cover the time where the work was
+// running, and emit events soon after. The driver should try to emit the event
+// for a period at most 1 second after |end_time_ns|, and must emit the event at
+// most 2 seconds after |end_time_ns|. A period's duration (|end_time_ns| -
+// |start_time_ns|) must be at most 1 second. Periods for different |uid|s can
+// overlap, but periods for the same |uid| must not overlap. The driver must
+// emit events for the same |uid| in strictly increasing order of
+// |start_time_ns|, such that it is guaranteed that the tracepoint call for a
+// period for |uid| has returned before the tracepoint call for the next period
+// for |uid| is made. Note that synchronization may be necessary if the driver
+// emits events for the same |uid| from different threads/contexts. Note that
+// |end_time_ns| for a period for a |uid| may equal the |start_time_ns| of the
+// next period for |uid|. The driver should try to avoid emitting a large number
+// of events in a short time period (e.g. 1000 events per second) for a given
+// |uid|.
 //
-// For a given UID, the tracepoints must be emitted in order; that is, the
-// |start_time_ns| of each subsequent tracepoint for a given UID must be
-// monotonically increasing.
+// The |total_active_duration_ns| must be set to the approximate total amount of
+// time the GPU spent running work for |uid| within the period, without
+// "double-counting" parallel GPU work on the same GPU for the same |uid|. "GPU
+// work" should correspond to the "GPU slices" shown in the AGI (Android GPU
+// Inspector) tool, and so should include work such as fragment and non-fragment
+// work/shaders running on the shader cores of the GPU. For example, given the
+// following:
+//  - A period has:
+//    - |start_time_ns|: 100,000,000 ns
+//    - |end_time_ns|:   800,000,000 ns
+//  - Some GPU vertex work (A):
+//    - started at:      200,000,000 ns
+//    - ended at:        400,000,000 ns
+//  - Some GPU fragment work (B):
+//    - started at:      300,000,000 ns
+//    - ended at:        500,000,000 ns
+//  - Some GPU fragment work (C):
+//    - started at:      300,000,000 ns
+//    - ended at:        400,000,000 ns
+//  - Some GPU fragment work (D):
+//    - started at:      600,000,000 ns
+//    - ended at:        700,000,000 ns
+//
+// The |total_active_duration_ns| would be 400,000,000 ns, because GPU work for
+// |uid| was executing:
+//  - from 200,000,000 ns to 500,000,000 ns, giving a duration of 300,000,000 ns
+//    (encompassing GPU work A, B, and C)
+//  - from 600,000,000 ns to 700,000,000 ns, giving a duration of 100,000,000 ns
+//    (GPU work D)
+//
+// Thus, the |total_active_duration_ns| is the sum of the (non-overlapping)
+// durations. Drivers may not have efficient access to the exact start and end
+// times of all GPU work, as shown above, but drivers should try to
+// approximate/aggregate the value of |total_active_duration_ns| as accurately
+// as possible within the limitations of the hardware, without double-counting
+// parallel GPU work for the same |uid|. The |total_active_duration_ns| value
+// must be less than or equal to the period duration (|end_time_ns| -
+// |start_time_ns|); if the aggregation approach might violate this requirement
+// then the driver must clamp |total_active_duration_ns| to be at most the
+// period duration.
+//
+// Note that the above description allows for a certain amount of flexibility in
+// how the driver tracks periods and emits the events. We list a few examples of
+// how drivers might implement the above:
+//
+// - 1: The driver could track periods for all |uid| values at fixed intervals
+//   of 1 second. Thus, every period duration would be exactly 1 second, and
+//   periods from different |uid|s that overlap would have the same
+//   |start_time_ns| and |end_time_ns| values.
+//
+// - 2: The driver could track periods with many different durations (up to 1
+//   second), as needed in order to cover the GPU work for each |uid|.
+//   Overlapping periods for different |uid|s may have very different durations,
+//   as well as different |start_time_ns| and |end_time_ns| values.
+//
+// - 3: The driver could track fine-grained periods with different durations
+//   that precisely cover the time where GPU work is running for each |uid|.
+//   Thus, |total_active_duration_ns| would always equal the period duration.
+//   For example, if a game was running at 60 frames per second, the driver
+//   would most likely emit _at least_ 60 events per second (probably more, as
+//   there would likely be multiple "chunks" of GPU work per frame, with gaps
+//   between each chunk). However, the driver may sometimes need to resort to
+//   more coarse-grained periods to avoid emitting thousands of events per
+//   second for a |uid|, where |total_active_duration_ns| would then be less
+//   than the period duration.
 typedef struct {
     // Actual fields start at offset 8.
-    uint64_t reserved;
+    uint64_t common;
 
-    // The UID of the process (i.e. persistent, unique ID of the Android app)
-    // that submitted work to the GPU.
+    // A value that uniquely identifies the GPU within the system.
+    uint32_t gpu_id;
+
+    // The UID of the application (i.e. persistent, unique ID of the Android
+    // app) that submitted work to the GPU.
     uint32_t uid;
 
-    // The GPU frequency during the period. GPUs may have multiple frequencies
-    // that can be adjusted, but the driver should report the frequency that
-    // would most closely estimate power usage (e.g. the frequency of shader
-    // cores, not a scheduling unit).
-    uint32_t frequency_khz;
-
     // The start time of the period in nanoseconds. The clock must be
     // CLOCK_MONOTONIC, as returned by the ktime_get_ns(void) function.
     uint64_t start_time_ns;
@@ -95,54 +153,44 @@
     // CLOCK_MONOTONIC, as returned by the ktime_get_ns(void) function.
     uint64_t end_time_ns;
 
-    // Flags about the work. Reserved for future use.
-    uint64_t flags;
+    // The amount of time the GPU was running GPU work for |uid| during the
+    // period, in nanoseconds, without double-counting parallel GPU work for the
+    // same |uid|. For example, this might include the amount of time the GPU
+    // spent performing shader work (vertex work, fragment work, etc.) for
+    // |uid|.
+    uint64_t total_active_duration_ns;
 
-    // The maximum GPU frequency allowed during the period according to the
-    // thermal throttling policy. Must be 0 if no thermal throttling was
-    // enforced during the period. The value must relate to |frequency_khz|; in
-    // other words, if |frequency_khz| is the frequency of the GPU shader cores,
-    // then |thermally_throttled_max_frequency_khz| must be the maximum allowed
-    // frequency of the GPU shader cores (not the maximum allowed frequency of
-    // some other part of the GPU).
-    //
-    // Note: Unlike with other fields of this struct, if the
-    // |thermally_throttled_max_frequency_khz| value conceptually changes while
-    // work from a UID is running on the GPU then the GPU driver does NOT need
-    // to accurately report this by ending the period and starting to track a
-    // new period; instead, the GPU driver may report any one of the
-    // |thermally_throttled_max_frequency_khz| values that was enforced during
-    // the period. The motivation for this relaxation is that we assume the
-    // maximum frequency will not be changing rapidly, and so capturing the
-    // exact point at which the change occurs is unnecessary.
-    uint32_t thermally_throttled_max_frequency_khz;
+} GpuWorkPeriodEvent;
 
-} GpuUidWorkPeriodEvent;
-
-_Static_assert(offsetof(GpuUidWorkPeriodEvent, uid) == 8 &&
-                       offsetof(GpuUidWorkPeriodEvent, frequency_khz) == 12 &&
-                       offsetof(GpuUidWorkPeriodEvent, start_time_ns) == 16 &&
-                       offsetof(GpuUidWorkPeriodEvent, end_time_ns) == 24 &&
-                       offsetof(GpuUidWorkPeriodEvent, flags) == 32 &&
-                       offsetof(GpuUidWorkPeriodEvent, thermally_throttled_max_frequency_khz) == 40,
-               "Field offsets of struct GpuUidWorkPeriodEvent must not be changed because they "
+_Static_assert(offsetof(GpuWorkPeriodEvent, gpu_id) == 8 &&
+                       offsetof(GpuWorkPeriodEvent, uid) == 12 &&
+                       offsetof(GpuWorkPeriodEvent, start_time_ns) == 16 &&
+                       offsetof(GpuWorkPeriodEvent, end_time_ns) == 24 &&
+                       offsetof(GpuWorkPeriodEvent, total_active_duration_ns) == 32,
+               "Field offsets of struct GpuWorkPeriodEvent must not be changed because they "
                "must match the tracepoint field offsets found via adb shell cat "
                "/sys/kernel/tracing/events/power/gpu_work_period/format");
 
 DEFINE_BPF_PROG("tracepoint/power/gpu_work_period", AID_ROOT, AID_GRAPHICS, tp_gpu_work_period)
-(GpuUidWorkPeriodEvent* const args) {
-    // Note: In BPF programs, |__sync_fetch_and_add| is translated to an atomic
+(GpuWorkPeriodEvent* const period) {
+    // Note: In eBPF programs, |__sync_fetch_and_add| is translated to an atomic
     // add.
 
-    const uint32_t uid = args->uid;
+    // Return 1 to avoid blocking simpleperf from receiving events.
+    const int ALLOW = 1;
 
-    // Get |UidTrackingInfo| for |uid|.
-    UidTrackingInfo* uid_tracking_info = bpf_gpu_work_map_lookup_elem(&uid);
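+    // Zero-initialize the key so that any padding bytes are zeroed, since BPF
+    // map keys are hashed and compared as raw bytes.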
+    GpuIdUid gpu_id_and_uid;
+    __builtin_memset(&gpu_id_and_uid, 0, sizeof(gpu_id_and_uid));
+    gpu_id_and_uid.gpu_id = period->gpu_id;
+    gpu_id_and_uid.uid = period->uid;
+
+    // Get |UidTrackingInfo|.
+    UidTrackingInfo* uid_tracking_info = bpf_gpu_work_map_lookup_elem(&gpu_id_and_uid);
     if (!uid_tracking_info) {
         // There was no existing entry, so we add a new one.
         UidTrackingInfo initial_info;
         __builtin_memset(&initial_info, 0, sizeof(initial_info));
-        if (0 == bpf_gpu_work_map_update_elem(&uid, &initial_info, BPF_NOEXIST)) {
+        if (0 == bpf_gpu_work_map_update_elem(&gpu_id_and_uid, &initial_info, BPF_NOEXIST)) {
             // We added an entry to the map, so we increment our entry counter in
             // |GlobalData|.
             const uint32_t zero = 0;
@@ -154,66 +202,80 @@
                 __sync_fetch_and_add(&global_data->num_map_entries, 1);
             }
         }
-        uid_tracking_info = bpf_gpu_work_map_lookup_elem(&uid);
+        uid_tracking_info = bpf_gpu_work_map_lookup_elem(&gpu_id_and_uid);
         if (!uid_tracking_info) {
             // This should never happen, unless entries are getting deleted at
             // this moment. If so, we just give up.
-            return 0;
+            return ALLOW;
         }
     }
 
-    // The period duration must be non-zero.
-    if (args->start_time_ns >= args->end_time_ns) {
+    if (
+            // The period duration must be non-zero.
+            period->start_time_ns >= period->end_time_ns ||
+            // The period duration must be at most 1 second.
+            (period->end_time_ns - period->start_time_ns) > S_IN_NS) {
         __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
-        return 0;
+        return ALLOW;
     }
 
-    // The frequency must not be 0.
-    if (args->frequency_khz == 0) {
-        __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
-        return 0;
+    // If |total_active_duration_ns| is 0 then no GPU work occurred and there is
+    // nothing to do.
+    if (period->total_active_duration_ns == 0) {
+        return ALLOW;
     }
 
-    // Calculate the frequency index: see |UidTrackingInfo.frequency_times_ns|.
-    // Round to the nearest 50MHz bucket.
-    uint32_t frequency_index =
-            ((args->frequency_khz / MHZ_IN_KHZS) + (kFrequencyGranularityMhz / 2)) /
-            kFrequencyGranularityMhz;
-    if (frequency_index >= kNumTrackedFrequencies) {
-        frequency_index = kNumTrackedFrequencies - 1;
-    }
+    // Update |uid_tracking_info->total_active_duration_ns|.
+    __sync_fetch_and_add(&uid_tracking_info->total_active_duration_ns,
+                         period->total_active_duration_ns);
 
-    // Never round down to 0MHz, as this is a special bucket (see below) and not
-    // an actual operating point.
-    if (frequency_index == 0) {
-        frequency_index = 1;
-    }
-
-    // Update time in state.
-    __sync_fetch_and_add(&uid_tracking_info->frequency_times_ns[frequency_index],
-                         args->end_time_ns - args->start_time_ns);
-
-    if (uid_tracking_info->previous_end_time_ns > args->start_time_ns) {
-        // This must not happen because per-UID periods must not overlap and
-        // must be emitted in order.
+    // |small_gap_time_ns| is the time gap between the current and previous
+    // active period, which could be 0. If the gap is more than
+    // |SMALL_TIME_GAP_LIMIT_NS| then |small_gap_time_ns| will be set to 0
+    // because we want to estimate the small gaps between "continuous" GPU work.
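+    // E.g. if the previous active period ended at t=100ms and the current
+    // period starts at t=100.2ms, the 0.2ms gap is counted as inactive time,
+    // but a 5 second gap would be ignored.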
+    uint64_t small_gap_time_ns = 0;
+    if (uid_tracking_info->previous_active_end_time_ns > period->start_time_ns) {
+        // The current period appears to have occurred before the previous
+        // active period, which must not happen because per-UID periods must not
+        // overlap and must be emitted in strictly increasing order of
+        // |start_time_ns|.
         __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
     } else {
-        // The period appears to have been emitted after the previous, as
-        // expected, so we can calculate the gap between this and the previous
-        // period.
-        const uint64_t gap_time = args->start_time_ns - uid_tracking_info->previous_end_time_ns;
+        // The current period appears to have been emitted after the previous
+        // active period, as expected, so we can calculate the gap between the
+        // current and previous active period.
+        small_gap_time_ns = period->start_time_ns - uid_tracking_info->previous_active_end_time_ns;
 
-        // Update |previous_end_time_ns|.
-        uid_tracking_info->previous_end_time_ns = args->end_time_ns;
+        // Update |previous_active_end_time_ns|.
+        uid_tracking_info->previous_active_end_time_ns = period->end_time_ns;
 
-        // Update the special 0MHz frequency time, which stores the gaps between
-        // periods, but only if the gap is < 1 second.
-        if (gap_time > 0 && gap_time < S_IN_NS) {
-            __sync_fetch_and_add(&uid_tracking_info->frequency_times_ns[0], gap_time);
+        // We want to estimate the small gaps between "continuous" GPU work; if
+        // the gap is more than |SMALL_TIME_GAP_LIMIT_NS| then we don't consider
+        // this "continuous" GPU work.
+        if (small_gap_time_ns > SMALL_TIME_GAP_LIMIT_NS) {
+            small_gap_time_ns = 0;
         }
     }
 
-    return 0;
+    uint64_t period_total_inactive_time_ns = 0;
+    const uint64_t period_duration_ns = period->end_time_ns - period->start_time_ns;
+    // |period->total_active_duration_ns| is the active time within the period duration, so
+    // it must not be larger than |period_duration_ns|.
+    if (period->total_active_duration_ns > period_duration_ns) {
+        __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
+    } else {
+        period_total_inactive_time_ns = period_duration_ns - period->total_active_duration_ns;
+    }
+
+    // Update |uid_tracking_info->total_inactive_duration_ns| by adding the
+    // inactive time from this period, plus the small gap between the current
+    // and previous active period. Either or both of these values could be 0.
+    if (small_gap_time_ns > 0 || period_total_inactive_time_ns > 0) {
+        __sync_fetch_and_add(&uid_tracking_info->total_inactive_duration_ns,
+                             small_gap_time_ns + period_total_inactive_time_ns);
+    }
+
+    return ALLOW;
 }
 
 LICENSE("Apache 2.0");
diff --git a/services/gpuservice/gpuwork/bpfprogs/include/gpuwork/gpu_work.h b/services/gpuservice/gpuwork/bpfprogs/include/gpuwork/gpu_work.h
index 4fe8464..57338f4 100644
--- a/services/gpuservice/gpuwork/bpfprogs/include/gpuwork/gpu_work.h
+++ b/services/gpuservice/gpuwork/bpfprogs/include/gpuwork/gpu_work.h
@@ -25,18 +25,28 @@
 namespace gpuwork {
 #endif
 
+typedef struct {
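+    // A value that uniquely identifies the GPU within the system.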
+    uint32_t gpu_id;
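+    // The UID of the application that submitted work to the GPU.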
+    uint32_t uid;
+} GpuIdUid;
+
 typedef struct {
-    // The end time of the previous period from this UID in nanoseconds.
-    uint64_t previous_end_time_ns;
+    // The end time of the previous period where the GPU was active for the UID,
+    // in nanoseconds.
+    uint64_t previous_active_end_time_ns;
 
-    // The time spent at each GPU frequency while running GPU work from the UID,
-    // in nanoseconds. Array index i stores the time for frequency i*50 MHz. So
-    // index 0 is 0Mhz, index 1 is 50MHz, index 2 is 100MHz, etc., up to index
-    // |kNumTrackedFrequencies|.
-    uint64_t frequency_times_ns[21];
+    // The total amount of time the GPU has spent running work for the UID, in
+    // nanoseconds.
+    uint64_t total_active_duration_ns;
 
-    // The number of times we received |GpuUidWorkPeriodEvent| events in an
-    // unexpected order. See |GpuUidWorkPeriodEvent|.
+    // The total duration of the "gaps" between "continuous" GPU work for the
+    // UID, in nanoseconds. This is an estimate; large gaps between GPU work
+    // for this UID are ignored.
+    uint64_t total_inactive_duration_ns;
+
+    // The number of errors detected due to |GpuWorkPeriodEvent| events for the
+    // UID violating the specification in some way. E.g. periods with a zero or
+    // negative duration.
     uint32_t error_count;
 
 } UidTrackingInfo;
@@ -48,14 +58,10 @@
     uint64_t num_map_entries;
 } GlobalData;
 
-static const uint32_t kMaxTrackedUids = 512;
-static const uint32_t kFrequencyGranularityMhz = 50;
-static const uint32_t kNumTrackedFrequencies = 21;
+// The maximum number of tracked GPU ID and UID pairs (|GpuIdUid|).
+static const uint32_t kMaxTrackedGpuIdUids = 512;
 
 #ifdef __cplusplus
-static_assert(kNumTrackedFrequencies ==
-              std::extent<decltype(UidTrackingInfo::frequency_times_ns)>::value);
-
 } // namespace gpuwork
 } // namespace android
 #endif
diff --git a/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h b/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
index b6f493d..acecd56 100644
--- a/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
+++ b/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
@@ -41,7 +41,7 @@
 
     void initialize();
 
-    // Dumps the GPU time in frequency state information.
+    // Dumps the GPU work information.
     void dump(const Vector<String16>& args, std::string* result);
 
 private:
@@ -55,7 +55,7 @@
                                                                  AStatsEventList* data,
                                                                  void* cookie);
 
-    AStatsManager_PullAtomCallbackReturn pullFrequencyAtoms(AStatsEventList* data);
+    AStatsManager_PullAtomCallbackReturn pullWorkAtoms(AStatsEventList* data);
 
     // Periodically calls |clearMapIfNeeded| to clear the |mGpuWorkMap| map, if
     // needed.
@@ -88,7 +88,7 @@
     std::mutex mMutex;
 
     // BPF map for per-UID GPU work.
-    bpf::BpfMap<Uid, UidTrackingInfo> mGpuWorkMap GUARDED_BY(mMutex);
+    bpf::BpfMap<GpuIdUid, UidTrackingInfo> mGpuWorkMap GUARDED_BY(mMutex);
 
     // BPF map containing a single element for global data.
     bpf::BpfMap<uint32_t, GlobalData> mGpuWorkGlobalDataMap GUARDED_BY(mMutex);
@@ -110,7 +110,18 @@
     bool mStatsdRegistered GUARDED_BY(mMutex) = false;
 
     // The number of randomly chosen (i.e. sampled) UIDs to log stats for.
-    static constexpr int kNumSampledUids = 10;
+    static constexpr size_t kNumSampledUids = 10;
+
+    // A "large" number of GPUs. If we observe more GPUs than this limit then
+    // we reduce the amount of stats we log.
+    static constexpr size_t kNumGpusSoftLimit = 4;
+
+    // A "very large" number of GPUs. If we observe more GPUs than this limit
+    // then we don't log any stats.
+    static constexpr size_t kNumGpusHardLimit = 32;
+
+    // The minimum GPU time needed to actually log stats for a UID.
+    static constexpr uint64_t kMinGpuTimeNanoseconds = 30ULL * 1000000000ULL; // 30 seconds.
 
     // The previous time point at which |mGpuWorkMap| was cleared.
     std::chrono::steady_clock::time_point mPreviousMapClearTimePoint GUARDED_BY(mMutex);