GpuWork: fix gpu_work_period tracepoint

The tracepoint design was too fine-grained, too complex, and was missing
a gpu_id field.

- Fix the gpu_work_period tracepoint struct:
  - More coarse-grained.
  - Add gpu_id field.
  - Remove frequency and other fields.
- Update the documentation comment, adding more detail.
- Update the eBPF program and map structs to account for the tracepoint
  changes.
  - Aggregate the `total_active_duration_ns` and
    `total_inactive_duration_ns` (see the sketch after this list).
  - Collect data for each (gpu_id, uid) pair.
- Update the GpuWork class to account for the tracepoint changes.
  - Remove references to "frequency"; we now collect "GPU work".
  - Rework the dumping code; dump active and inactive time.
  - Rework stats upload:
    - Log active and inactive time.
    - Log per (gpu_id, uid) pair.
    - Reduce the amount logged if we observe a large number of GPUs.
    - Remove UIDs with very low GPU time.
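
A minimal userspace sketch of the per-(gpu_id, uid) aggregation described
above, with illustrative names only (the real aggregation runs in the eBPF
program below, which additionally folds small gaps between consecutive
periods into the inactive total):

    #include <cstdint>
    #include <map>
    #include <utility>

    // Sketch only: the real code keys a BPF hash map by (gpu_id, uid) and
    // updates it with atomic adds from the eBPF program.
    struct Totals {
        uint64_t activeNs = 0;    // total_active_duration_ns
        uint64_t inactiveNs = 0;  // total_inactive_duration_ns
    };

    using GpuIdUidKey = std::pair<uint32_t, uint32_t>;  // (gpu_id, uid)

    void aggregatePeriod(std::map<GpuIdUidKey, Totals>& totals, uint32_t gpuId,
                         uint32_t uid, uint64_t startNs, uint64_t endNs,
                         uint64_t activeNs) {
        Totals& t = totals[{gpuId, uid}];
        t.activeNs += activeNs;
        // Inactive time within a period is the part that was not active.
        if (endNs > startNs && activeNs <= endNs - startNs) {
            t.inactiveNs += (endNs - startNs) - activeNs;
        }
    }
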
Bug: b/221096545
Change-Id: I33d1c55f886913dad7b563e7b9a25cc1af0308f2
diff --git a/services/gpuservice/gpuwork/GpuWork.cpp b/services/gpuservice/gpuwork/GpuWork.cpp
index 67ce9f2..974ae38 100644
--- a/services/gpuservice/gpuwork/GpuWork.cpp
+++ b/services/gpuservice/gpuwork/GpuWork.cpp
@@ -39,17 +39,30 @@
#include <map>
#include <mutex>
+#include <set>
#include <unordered_map>
+#include <unordered_set>
#include <vector>
#include "gpuwork/gpu_work.h"
-#define MS_IN_NS (1000000)
+#define ONE_MS_IN_NS (1000000)
namespace android {
namespace gpuwork {
namespace {
+bool lessThanGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
+ return std::tie(l.gpu_id, l.uid) < std::tie(r.gpu_id, r.uid);
+}
+
+size_t hashGpuIdUid(const android::gpuwork::GpuIdUid& gpuIdUid) {
+ return static_cast<size_t>((gpuIdUid.gpu_id << 5U) + gpuIdUid.uid);
+}
+
+bool equalGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
+ return std::tie(l.gpu_id, l.uid) == std::tie(r.gpu_id, r.uid);
+}
+
// Gets a BPF map from |mapPath|.
template <class Key, class Value>
bool getBpfMap(const char* mapPath, bpf::BpfMap<Key, Value>* out) {
@@ -76,24 +89,6 @@
return result;
}
-template <>
-inline int32_t cast_int32<uint64_t>(uint64_t source) {
- if (source > std::numeric_limits<int32_t>::max()) {
- return std::numeric_limits<int32_t>::max();
- }
- return static_cast<int32_t>(source);
-}
-
-template <>
-inline int32_t cast_int32<long long>(long long source) {
- if (source > std::numeric_limits<int32_t>::max()) {
- return std::numeric_limits<int32_t>::max();
- } else if (source < std::numeric_limits<int32_t>::min()) {
- return std::numeric_limits<int32_t>::min();
- }
- return static_cast<int32_t>(source);
-}
-
} // namespace
using base::StringAppendF;
@@ -115,7 +110,7 @@
{
std::scoped_lock<std::mutex> lock(mMutex);
if (mStatsdRegistered) {
- AStatsManager_clearPullAtomCallback(android::util::GPU_FREQ_TIME_IN_STATE_PER_UID);
+ AStatsManager_clearPullAtomCallback(android::util::GPU_WORK_PER_UID);
}
}
@@ -144,7 +139,7 @@
mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
}
- // Attach the tracepoint ONLY if we got the map above.
+ // Attach the tracepoint.
if (!attachTracepoint("/sys/fs/bpf/prog_gpu_work_tracepoint_power_gpu_work_period", "power",
"gpu_work_period")) {
return;
@@ -157,8 +152,8 @@
{
std::lock_guard<std::mutex> lock(mMutex);
- AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_FREQ_TIME_IN_STATE_PER_UID},
- nullptr, GpuWork::pullAtomCallback, this);
+ AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_WORK_PER_UID}, nullptr,
+ GpuWork::pullAtomCallback, this);
mStatsdRegistered = true;
}
@@ -169,18 +164,18 @@
void GpuWork::dump(const Vector<String16>& /* args */, std::string* result) {
if (!mInitialized.load()) {
- result->append("GPU time in state information is not available.\n");
+ result->append("GPU work information is not available.\n");
return;
}
- // Ordered map ensures output data is sorted by UID.
- std::map<Uid, UidTrackingInfo> dumpMap;
+ // Ordered map ensures output data is sorted.
+ std::map<GpuIdUid, UidTrackingInfo, decltype(lessThanGpuIdUid)*> dumpMap(&lessThanGpuIdUid);
{
std::lock_guard<std::mutex> lock(mMutex);
if (!mGpuWorkMap.isValid()) {
- result->append("GPU time in state map is not available.\n");
+ result->append("GPU work map is not available.\n");
return;
}
@@ -189,56 +184,42 @@
// threads. The buckets are all preallocated. Our eBPF program only updates
// entries (in-place) or adds entries. |GpuWork| only iterates or clears the
// map while holding |mMutex|. Given this, we should be able to iterate over
- // all elements reliably. In the worst case, we might see elements more than
- // once.
+ // all elements reliably. Nevertheless, we copy into a map to avoid
+ // duplicates.
// Note that userspace reads of BPF maps make a copy of the value, and
// thus the returned value is not being concurrently accessed by the BPF
// program (no atomic reads needed below).
- mGpuWorkMap.iterateWithValue([&dumpMap](const Uid& key, const UidTrackingInfo& value,
- const android::bpf::BpfMap<Uid, UidTrackingInfo>&)
- -> base::Result<void> {
- dumpMap[key] = value;
- return {};
- });
+ mGpuWorkMap.iterateWithValue(
+ [&dumpMap](const GpuIdUid& key, const UidTrackingInfo& value,
+ const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
+ -> base::Result<void> {
+ dumpMap[key] = value;
+ return {};
+ });
}
- // Find the largest frequency where some UID has spent time in that frequency.
- size_t largestFrequencyWithTime = 0;
- for (const auto& uidToUidInfo : dumpMap) {
- for (size_t i = largestFrequencyWithTime + 1; i < kNumTrackedFrequencies; ++i) {
- if (uidToUidInfo.second.frequency_times_ns[i] > 0) {
- largestFrequencyWithTime = i;
- }
- }
- }
-
- // Dump time in state information.
+ // Dump work information.
// E.g.
- // uid/freq: 0MHz 50MHz 100MHz ...
- // 1000: 0 0 0 0 ...
- // 1003: 0 0 3456 0 ...
- // [errors:3]1006: 0 0 3456 0 ...
+ // GPU work information.
+ // gpu_id uid total_active_duration_ns total_inactive_duration_ns
+ // 0 1000 0 0
+ // 0 1003 1234 123
+ // [errors:3]0 1006 4567 456
// Header.
- result->append("GPU time in frequency state in ms.\n");
- result->append("uid/freq: 0MHz");
- for (size_t i = 1; i <= largestFrequencyWithTime; ++i) {
- StringAppendF(result, " %zuMHz", i * 50);
- }
- result->append("\n");
+ result->append("GPU work information.\ngpu_id uid total_active_duration_ns "
+ "total_inactive_duration_ns\n");
- for (const auto& uidToUidInfo : dumpMap) {
- if (uidToUidInfo.second.error_count) {
- StringAppendF(result, "[errors:%" PRIu32 "]", uidToUidInfo.second.error_count);
+ for (const auto& idToUidInfo : dumpMap) {
+ if (idToUidInfo.second.error_count) {
+ StringAppendF(result, "[errors:%" PRIu32 "]", idToUidInfo.second.error_count);
}
- StringAppendF(result, "%" PRIu32 ":", uidToUidInfo.first);
- for (size_t i = 0; i <= largestFrequencyWithTime; ++i) {
- StringAppendF(result, " %" PRIu64,
- uidToUidInfo.second.frequency_times_ns[i] / MS_IN_NS);
- }
- result->append("\n");
+ StringAppendF(result, "%" PRIu32 " %" PRIu32 " %" PRIu64 " %" PRIu64 "\n",
+ idToUidInfo.first.gpu_id, idToUidInfo.first.uid,
+ idToUidInfo.second.total_active_duration_ns,
+ idToUidInfo.second.total_inactive_duration_ns);
}
}
@@ -275,14 +256,14 @@
ATRACE_CALL();
GpuWork* gpuWork = reinterpret_cast<GpuWork*>(cookie);
- if (atomTag == android::util::GPU_FREQ_TIME_IN_STATE_PER_UID) {
- return gpuWork->pullFrequencyAtoms(data);
+ if (atomTag == android::util::GPU_WORK_PER_UID) {
+ return gpuWork->pullWorkAtoms(data);
}
return AStatsManager_PULL_SKIP;
}
-AStatsManager_PullAtomCallbackReturn GpuWork::pullFrequencyAtoms(AStatsEventList* data) {
+AStatsManager_PullAtomCallbackReturn GpuWork::pullWorkAtoms(AStatsEventList* data) {
ATRACE_CALL();
if (!data || !mInitialized.load()) {
@@ -295,96 +276,153 @@
return AStatsManager_PULL_SKIP;
}
- std::unordered_map<Uid, UidTrackingInfo> uidInfos;
+ std::unordered_map<GpuIdUid, UidTrackingInfo, decltype(hashGpuIdUid)*, decltype(equalGpuIdUid)*>
+ workMap(32, &hashGpuIdUid, &equalGpuIdUid);
// Iteration of BPF hash maps can be unreliable (no data races, but elements
// may be repeated), as the map is typically being modified by other
// threads. The buckets are all preallocated. Our eBPF program only updates
// entries (in-place) or adds entries. |GpuWork| only iterates or clears the
// map while holding |mMutex|. Given this, we should be able to iterate over
- // all elements reliably. In the worst case, we might see elements more than
- // once.
+ // all elements reliably. Nevertheless, we copy into a map to avoid
+ // duplicates.
// Note that userspace reads of BPF maps make a copy of the value, and thus
// the returned value is not being concurrently accessed by the BPF program
// (no atomic reads needed below).
- mGpuWorkMap.iterateWithValue(
- [&uidInfos](const Uid& key, const UidTrackingInfo& value,
- const android::bpf::BpfMap<Uid, UidTrackingInfo>&) -> base::Result<void> {
- uidInfos[key] = value;
- return {};
- });
-
- ALOGI("pullFrequencyAtoms: uidInfos.size() == %zu", uidInfos.size());
+ mGpuWorkMap.iterateWithValue([&workMap](const GpuIdUid& key, const UidTrackingInfo& value,
+ const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
+ -> base::Result<void> {
+ workMap[key] = value;
+ return {};
+ });
// Get a list of just the UIDs; the order does not matter.
std::vector<Uid> uids;
- for (const auto& pair : uidInfos) {
- uids.push_back(pair.first);
+ // Get a list of the GPU IDs, in order.
+ std::set<uint32_t> gpuIds;
+ {
+ // To avoid adding duplicate UIDs.
+ std::unordered_set<Uid> addedUids;
+
+ for (const auto& workInfo : workMap) {
+ if (addedUids.insert(workInfo.first.uid).second) {
+ // Insertion was successful.
+ uids.push_back(workInfo.first.uid);
+ }
+ gpuIds.insert(workInfo.first.gpu_id);
+ }
}
+ ALOGI("pullWorkAtoms: uids.size() == %zu", uids.size());
+ ALOGI("pullWorkAtoms: gpuIds.size() == %zu", gpuIds.size());
+
+ if (gpuIds.size() > kNumGpusHardLimit) {
+ // If we observe a very high number of GPUs then something has probably
+ // gone wrong, so don't log any atoms.
+ return AStatsManager_PULL_SKIP;
+ }
+
+ size_t numSampledUids = kNumSampledUids;
+
+ if (gpuIds.size() > kNumGpusSoftLimit) {
+ // If we observe a high number of GPUs then we just sample 1 UID.
+ numSampledUids = 1;
+ }
+
+ // Remove all UIDs that do not have at least |kMinGpuTimeNanoseconds| on at
+ // least one GPU.
+ {
+ auto uidIt = uids.begin();
+ while (uidIt != uids.end()) {
+ bool hasEnoughGpuTime = false;
+ for (uint32_t gpuId : gpuIds) {
+ auto infoIt = workMap.find(GpuIdUid{gpuId, *uidIt});
+ if (infoIt == workMap.end()) {
+ continue;
+ }
+ if (infoIt->second.total_active_duration_ns +
+ infoIt->second.total_inactive_duration_ns >=
+ kMinGpuTimeNanoseconds) {
+ hasEnoughGpuTime = true;
+ break;
+ }
+ }
+ if (hasEnoughGpuTime) {
+ ++uidIt;
+ } else {
+ uidIt = uids.erase(uidIt);
+ }
+ }
+ }
+
+ ALOGI("pullWorkAtoms: after removing uids with very low GPU time: uids.size() == %zu",
+ uids.size());
+
std::random_device device;
std::default_random_engine random_engine(device());
- // If we have more than |kNumSampledUids| UIDs, choose |kNumSampledUids|
+ // If we have more than |numSampledUids| UIDs, choose |numSampledUids|
// random UIDs. We swap them to the front of the list. Given the list
// indices 0..i..n-1, we have the following inclusive-inclusive ranges:
// - [0, i-1] == the randomly chosen elements.
// - [i, n-1] == the remaining unchosen elements.
- if (uids.size() > kNumSampledUids) {
- for (size_t i = 0; i < kNumSampledUids; ++i) {
+ if (uids.size() > numSampledUids) {
+ for (size_t i = 0; i < numSampledUids; ++i) {
std::uniform_int_distribution<size_t> uniform_dist(i, uids.size() - 1);
size_t random_index = uniform_dist(random_engine);
std::swap(uids[i], uids[random_index]);
}
- // Only keep the front |kNumSampledUids| elements.
- uids.resize(kNumSampledUids);
+ // Only keep the front |numSampledUids| elements.
+ uids.resize(numSampledUids);
}
- ALOGI("pullFrequencyAtoms: uids.size() == %zu", uids.size());
+ ALOGI("pullWorkAtoms: after random selection: uids.size() == %zu", uids.size());
auto now = std::chrono::steady_clock::now();
-
- int32_t duration = cast_int32(
+ long long duration =
std::chrono::duration_cast<std::chrono::seconds>(now - mPreviousMapClearTimePoint)
- .count());
+ .count();
+ if (duration > std::numeric_limits<int32_t>::max() || duration < 0) {
+ // This is essentially impossible. If it does somehow happen, give up,
+ // but still clear the map.
+ clearMap();
+ return AStatsManager_PULL_SKIP;
+ }
- for (const Uid uid : uids) {
- const UidTrackingInfo& info = uidInfos[uid];
- ALOGI("pullFrequencyAtoms: adding stats for UID %" PRIu32, uid);
- android::util::addAStatsEvent(data, int32_t{android::util::GPU_FREQ_TIME_IN_STATE_PER_UID},
- // uid
- bitcast_int32(uid),
- // time_duration_seconds
- int32_t{duration},
- // max_freq_mhz
- int32_t{1000},
- // freq_0_mhz_time_millis
- cast_int32(info.frequency_times_ns[0] / 1000000),
- // freq_50_mhz_time_millis
- cast_int32(info.frequency_times_ns[1] / 1000000),
- // ... etc. ...
- cast_int32(info.frequency_times_ns[2] / 1000000),
- cast_int32(info.frequency_times_ns[3] / 1000000),
- cast_int32(info.frequency_times_ns[4] / 1000000),
- cast_int32(info.frequency_times_ns[5] / 1000000),
- cast_int32(info.frequency_times_ns[6] / 1000000),
- cast_int32(info.frequency_times_ns[7] / 1000000),
- cast_int32(info.frequency_times_ns[8] / 1000000),
- cast_int32(info.frequency_times_ns[9] / 1000000),
- cast_int32(info.frequency_times_ns[10] / 1000000),
- cast_int32(info.frequency_times_ns[11] / 1000000),
- cast_int32(info.frequency_times_ns[12] / 1000000),
- cast_int32(info.frequency_times_ns[13] / 1000000),
- cast_int32(info.frequency_times_ns[14] / 1000000),
- cast_int32(info.frequency_times_ns[15] / 1000000),
- cast_int32(info.frequency_times_ns[16] / 1000000),
- cast_int32(info.frequency_times_ns[17] / 1000000),
- cast_int32(info.frequency_times_ns[18] / 1000000),
- cast_int32(info.frequency_times_ns[19] / 1000000),
- // freq_1000_mhz_time_millis
- cast_int32(info.frequency_times_ns[20] / 1000000));
+ // Log an atom for each (gpu id, uid) pair for which we have data.
+ for (uint32_t gpuId : gpuIds) {
+ for (Uid uid : uids) {
+ auto it = workMap.find(GpuIdUid{gpuId, uid});
+ if (it == workMap.end()) {
+ continue;
+ }
+ const UidTrackingInfo& info = it->second;
+
+ uint64_t total_active_duration_ms = info.total_active_duration_ns / ONE_MS_IN_NS;
+ uint64_t total_inactive_duration_ms = info.total_inactive_duration_ns / ONE_MS_IN_NS;
+
+ // Skip this atom if any numbers are out of range. |duration| is
+ // already checked above.
+ if (total_active_duration_ms > std::numeric_limits<int32_t>::max() ||
+ total_inactive_duration_ms > std::numeric_limits<int32_t>::max()) {
+ continue;
+ }
+
+ ALOGI("pullWorkAtoms: adding stats for GPU ID %" PRIu32 "; UID %" PRIu32, gpuId, uid);
+ android::util::addAStatsEvent(data, int32_t{android::util::GPU_WORK_PER_UID},
+ // uid
+ bitcast_int32(uid),
+ // gpu_id
+ bitcast_int32(gpuId),
+ // time_duration_seconds
+ static_cast<int32_t>(duration),
+ // total_active_duration_millis
+ static_cast<int32_t>(total_active_duration_ms),
+ // total_inactive_duration_millis
+ static_cast<int32_t>(total_inactive_duration_ms));
+ }
}
clearMap();
return AStatsManager_PULL_SUCCESS;
@@ -435,7 +473,7 @@
uint64_t numEntries = globalData.value().num_map_entries;
// If the map is <=75% full, we do nothing.
- if (numEntries <= (kMaxTrackedUids / 4) * 3) {
+ if (numEntries <= (kMaxTrackedGpuIdUids / 4) * 3) {
return;
}
@@ -456,22 +494,22 @@
// Iterating BPF maps to delete keys is tricky. If we just repeatedly call
// |getFirstKey()| and delete that, we may loop forever (or for a long time)
- // because our BPF program might be repeatedly re-adding UID keys. Also,
- // even if we limit the number of elements we try to delete, we might only
- // delete new entries, leaving old entries in the map. If we delete a key A
- // and then call |getNextKey(A)|, the first key in the map is returned, so
- // we have the same issue.
+ // because our BPF program might be repeatedly re-adding keys. Also, even if
+ // we limit the number of elements we try to delete, we might only delete
+ // new entries, leaving old entries in the map. If we delete a key A and
+ // then call |getNextKey(A)|, the first key in the map is returned, so we
+ // have the same issue.
//
// Thus, we instead get the next key and then delete the previous key. We
// also limit the number of deletions we try, just in case.
- base::Result<Uid> key = mGpuWorkMap.getFirstKey();
+ base::Result<GpuIdUid> key = mGpuWorkMap.getFirstKey();
- for (size_t i = 0; i < kMaxTrackedUids; ++i) {
+ for (size_t i = 0; i < kMaxTrackedGpuIdUids; ++i) {
if (!key.ok()) {
break;
}
- base::Result<Uid> previousKey = key;
+ base::Result<GpuIdUid> previousKey = key;
key = mGpuWorkMap.getNextKey(previousKey.value());
mGpuWorkMap.deleteValue(previousKey.value());
}
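
The random selection in pullWorkAtoms() above is a partial Fisher-Yates
shuffle. A self-contained sketch of the same technique (the function name
and types are illustrative, not part of the change):

    #include <cstddef>
    #include <cstdint>
    #include <random>
    #include <utility>
    #include <vector>

    // Move a uniformly random sample of |k| elements to the front of |v|
    // (partial Fisher-Yates shuffle), then drop the unchosen tail.
    void sampleFront(std::vector<uint32_t>& v, size_t k,
                     std::default_random_engine& rng) {
        if (v.size() <= k) return;
        for (size_t i = 0; i < k; ++i) {
            // Pick uniformly from the not-yet-chosen range [i, v.size() - 1].
            std::uniform_int_distribution<size_t> dist(i, v.size() - 1);
            std::swap(v[i], v[dist(rng)]);
        }
        v.resize(k);
    }

Each of the first |k| slots is filled by a draw that is uniform over the
remaining candidates, which yields a uniform k-sample without shuffling the
whole vector.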
diff --git a/services/gpuservice/gpuwork/bpfprogs/gpu_work.c b/services/gpuservice/gpuwork/bpfprogs/gpu_work.c
index a0e1b22..c8e79a2 100644
--- a/services/gpuservice/gpuwork/bpfprogs/gpu_work.c
+++ b/services/gpuservice/gpuwork/bpfprogs/gpu_work.c
@@ -27,66 +27,124 @@
#endif
#define S_IN_NS (1000000000)
-#define MHZ_IN_KHZS (1000)
+#define SMALL_TIME_GAP_LIMIT_NS (S_IN_NS)
-typedef uint32_t Uid;
-
-// A map from UID to |UidTrackingInfo|.
-DEFINE_BPF_MAP_GRW(gpu_work_map, HASH, Uid, UidTrackingInfo, kMaxTrackedUids, AID_GRAPHICS);
+// A map from GpuIdUid (GPU ID and application UID) to |UidTrackingInfo|.
+DEFINE_BPF_MAP_GRW(gpu_work_map, HASH, GpuIdUid, UidTrackingInfo, kMaxTrackedGpuIdUids,
+ AID_GRAPHICS);
// A map containing a single entry of |GlobalData|.
DEFINE_BPF_MAP_GRW(gpu_work_global_data, ARRAY, uint32_t, GlobalData, 1, AID_GRAPHICS);
-// GpuUidWorkPeriodEvent defines the structure of a kernel tracepoint under the
-// tracepoint system (also referred to as the group) "power" and name
-// "gpu_work_period". In summary, the kernel tracepoint should be
-// "power/gpu_work_period", available at:
+// Defines the structure of the kernel tracepoint:
//
// /sys/kernel/tracing/events/power/gpu_work_period/
//
-// GpuUidWorkPeriodEvent defines a non-overlapping, non-zero period of time when
-// work was running on the GPU for a given application (identified by its UID;
-// the persistent, unique ID of the application) from |start_time_ns| to
-// |end_time_ns|. Drivers should issue this tracepoint as soon as possible
-// (within 1 second) after |end_time_ns|. For a given UID, periods must not
-// overlap, but periods from different UIDs can overlap (and should overlap, if
-// and only if that is how the work was executed). The period includes
-// information such as |frequency_khz|, the frequency that the GPU was running
-// at during the period, and |includes_compute_work|, whether the work included
-// compute shader work (not just graphics work). GPUs may have multiple
-// frequencies that can be adjusted, but the driver should report the frequency
-// that most closely estimates power usage (e.g. the frequency of shader cores,
-// not a scheduling unit).
+// Drivers must define an appropriate gpu_work_period kernel tracepoint (for
+// example, using the DECLARE_EVENT_CLASS and DEFINE_EVENT macros) such that the
+// arguments/fields match the fields of |GpuWorkPeriodEvent|, excluding the
+// initial "common" field. Drivers must invoke the tracepoint (also referred to
+// as emitting the event) as described below. Note that the description below
+// assumes a single physical GPU and its driver; for devices with multiple GPUs,
+// each GPU and its driver should emit events independently, using a different
+// value for |gpu_id| per GPU.
//
-// If any information changes while work from the UID is running on the GPU
-// (e.g. the GPU frequency changes, or the work starts/stops including compute
-// work) then the driver must conceptually end the period, issue the tracepoint,
-// start tracking a new period, and eventually issue a second tracepoint when
-// the work completes or when the information changes again. In this case, the
-// |end_time_ns| of the first period must equal the |start_time_ns| of the
-// second period. The driver may also end and start a new period (without a
-// gap), even if no information changes. For example, this might be convenient
-// if there is a collection of work from a UID running on the GPU for a long
-// time; ending and starting a period as individual parts of the work complete
-// allows the consumer of the tracepoint to be updated about the ongoing work.
+// |GpuWorkPeriodEvent| defines a non-overlapping, non-zero period of time from
+// |start_time_ns| (inclusive) until |end_time_ns| (exclusive) for a given
+// |uid|, and includes details of how much work the GPU was performing for |uid|
+// during the period. When GPU work for a given |uid| runs on the GPU, the
+// driver must track one or more periods that cover the time where the work was
+// running, and emit events soon after. The driver should try to emit the event
+// for a period at most 1 second after |end_time_ns|, and must emit the event at
+// most 2 seconds after |end_time_ns|. A period's duration (|end_time_ns| -
+// |start_time_ns|) must be at most 1 second. Periods for different |uid|s can
+// overlap, but periods for the same |uid| must not overlap. The driver must
+// emit events for the same |uid| in strictly increasing order of
+// |start_time_ns|, such that it is guaranteed that the tracepoint call for a
+// period for |uid| has returned before the tracepoint call for the next period
+// for |uid| is made. Note that synchronization may be necessary if the driver
+// emits events for the same |uid| from different threads/contexts. Note that
+// |end_time_ns| for a period for a |uid| may equal the |start_time_ns| of the
+// next period for |uid|. The driver should try to avoid emitting a large number
+// of events in a short time period (e.g. 1000 events per second) for a given
+// |uid|.
//
-// For a given UID, the tracepoints must be emitted in order; that is, the
-// |start_time_ns| of each subsequent tracepoint for a given UID must be
-// monotonically increasing.
+// The |total_active_duration_ns| must be set to the approximate total amount of
+// time the GPU spent running work for |uid| within the period, without
+// "double-counting" parallel GPU work on the same GPU for the same |uid|. "GPU
+// work" should correspond to the "GPU slices" shown in the AGI (Android GPU
+// Inspector) tool, and so should include work such as fragment and non-fragment
+// work/shaders running on the shader cores of the GPU. For example, given the
+// following:
+// - A period has:
+// - |start_time_ns|: 100,000,000 ns
+// - |end_time_ns|: 800,000,000 ns
+// - Some GPU vertex work (A):
+// - started at: 200,000,000 ns
+// - ended at: 400,000,000 ns
+// - Some GPU fragment work (B):
+// - started at: 300,000,000 ns
+// - ended at: 500,000,000 ns
+// - Some GPU fragment work (C):
+// - started at: 300,000,000 ns
+// - ended at: 400,000,000 ns
+// - Some GPU fragment work (D):
+// - started at: 600,000,000 ns
+// - ended at: 700,000,000 ns
+//
+// The |total_active_duration_ns| would be 400,000,000 ns, because GPU work for
+// |uid| was executing:
+// - from 200,000,000 ns to 500,000,000 ns, giving a duration of 300,000,000 ns
+// (encompassing GPU work A, B, and C)
+// - from 600,000,000 ns to 700,000,000 ns, giving a duration of 100,000,000 ns
+// (GPU work D)
+//
+// Thus, the |total_active_duration_ns| is the sum of the (non-overlapping)
+// durations. Drivers may not have efficient access to the exact start and end
+// times of all GPU work, as shown above, but drivers should try to
+// approximate/aggregate the value of |total_active_duration_ns| as accurately
+// as possible within the limitations of the hardware, without double-counting
+// parallel GPU work for the same |uid|. The |total_active_duration_ns| value
+// must be less than or equal to the period duration (|end_time_ns| -
+// |start_time_ns|); if the aggregation approach might violate this requirement
+// then the driver must clamp |total_active_duration_ns| to be at most the
+// period duration.
+//
+// Note that the above description allows for a certain amount of flexibility in
+// how the driver tracks periods and emits the events. We list a few examples of
+// how drivers might implement the above:
+//
+// - 1: The driver could track periods for all |uid| values at fixed intervals
+// of 1 second. Thus, every period duration would be exactly 1 second, and
+// periods from different |uid|s that overlap would have the same
+// |start_time_ns| and |end_time_ns| values.
+//
+// - 2: The driver could track periods with many different durations (up to 1
+// second), as needed in order to cover the GPU work for each |uid|.
+// Overlapping periods for different |uid|s may have very different durations,
+// as well as different |start_time_ns| and |end_time_ns| values.
+//
+// - 3: The driver could track fine-grained periods with different durations
+// that precisely cover the time where GPU work is running for each |uid|.
+// Thus, |total_active_duration_ns| would always equal the period duration.
+// For example, if a game was running at 60 frames per second, the driver
+// would most likely emit _at least_ 60 events per second (probably more, as
+// there would likely be multiple "chunks" of GPU work per frame, with gaps
+// between each chunk). However, the driver may sometimes need to resort to
+// more coarse-grained periods to avoid emitting thousands of events per
+// second for a |uid|, where |total_active_duration_ns| would then be less
+// than the period duration.
typedef struct {
// Actual fields start at offset 8.
- uint64_t reserved;
+ uint64_t common;
- // The UID of the process (i.e. persistent, unique ID of the Android app)
- // that submitted work to the GPU.
+ // A value that uniquely identifies the GPU within the system.
+ uint32_t gpu_id;
+
+ // The UID of the application (i.e. persistent, unique ID of the Android
+ // app) that submitted work to the GPU.
uint32_t uid;
- // The GPU frequency during the period. GPUs may have multiple frequencies
- // that can be adjusted, but the driver should report the frequency that
- // would most closely estimate power usage (e.g. the frequency of shader
- // cores, not a scheduling unit).
- uint32_t frequency_khz;
-
// The start time of the period in nanoseconds. The clock must be
// CLOCK_MONOTONIC, as returned by the ktime_get_ns(void) function.
uint64_t start_time_ns;
@@ -95,54 +153,44 @@
// CLOCK_MONOTONIC, as returned by the ktime_get_ns(void) function.
uint64_t end_time_ns;
- // Flags about the work. Reserved for future use.
- uint64_t flags;
+ // The amount of time the GPU was running GPU work for |uid| during the
+ // period, in nanoseconds, without double-counting parallel GPU work for the
+ // same |uid|. For example, this might include the amount of time the GPU
+ // spent performing shader work (vertex work, fragment work, etc.) for
+ // |uid|.
+ uint64_t total_active_duration_ns;
- // The maximum GPU frequency allowed during the period according to the
- // thermal throttling policy. Must be 0 if no thermal throttling was
- // enforced during the period. The value must relate to |frequency_khz|; in
- // other words, if |frequency_khz| is the frequency of the GPU shader cores,
- // then |thermally_throttled_max_frequency_khz| must be the maximum allowed
- // frequency of the GPU shader cores (not the maximum allowed frequency of
- // some other part of the GPU).
- //
- // Note: Unlike with other fields of this struct, if the
- // |thermally_throttled_max_frequency_khz| value conceptually changes while
- // work from a UID is running on the GPU then the GPU driver does NOT need
- // to accurately report this by ending the period and starting to track a
- // new period; instead, the GPU driver may report any one of the
- // |thermally_throttled_max_frequency_khz| values that was enforced during
- // the period. The motivation for this relaxation is that we assume the
- // maximum frequency will not be changing rapidly, and so capturing the
- // exact point at which the change occurs is unnecessary.
- uint32_t thermally_throttled_max_frequency_khz;
+} GpuWorkPeriodEvent;
-} GpuUidWorkPeriodEvent;
-
-_Static_assert(offsetof(GpuUidWorkPeriodEvent, uid) == 8 &&
- offsetof(GpuUidWorkPeriodEvent, frequency_khz) == 12 &&
- offsetof(GpuUidWorkPeriodEvent, start_time_ns) == 16 &&
- offsetof(GpuUidWorkPeriodEvent, end_time_ns) == 24 &&
- offsetof(GpuUidWorkPeriodEvent, flags) == 32 &&
- offsetof(GpuUidWorkPeriodEvent, thermally_throttled_max_frequency_khz) == 40,
- "Field offsets of struct GpuUidWorkPeriodEvent must not be changed because they "
+_Static_assert(offsetof(GpuWorkPeriodEvent, gpu_id) == 8 &&
+ offsetof(GpuWorkPeriodEvent, uid) == 12 &&
+ offsetof(GpuWorkPeriodEvent, start_time_ns) == 16 &&
+ offsetof(GpuWorkPeriodEvent, end_time_ns) == 24 &&
+ offsetof(GpuWorkPeriodEvent, total_active_duration_ns) == 32,
+ "Field offsets of struct GpuWorkPeriodEvent must not be changed because they "
"must match the tracepoint field offsets found via adb shell cat "
"/sys/kernel/tracing/events/power/gpu_work_period/format");
DEFINE_BPF_PROG("tracepoint/power/gpu_work_period", AID_ROOT, AID_GRAPHICS, tp_gpu_work_period)
-(GpuUidWorkPeriodEvent* const args) {
- // Note: In BPF programs, |__sync_fetch_and_add| is translated to an atomic
+(GpuWorkPeriodEvent* const period) {
+ // Note: In eBPF programs, |__sync_fetch_and_add| is translated to an atomic
// add.
- const uint32_t uid = args->uid;
+ // Return 1 to avoid blocking simpleperf from receiving events.
+ const int ALLOW = 1;
- // Get |UidTrackingInfo| for |uid|.
- UidTrackingInfo* uid_tracking_info = bpf_gpu_work_map_lookup_elem(&uid);
+ GpuIdUid gpu_id_and_uid;
+ __builtin_memset(&gpu_id_and_uid, 0, sizeof(gpu_id_and_uid));
+ gpu_id_and_uid.gpu_id = period->gpu_id;
+ gpu_id_and_uid.uid = period->uid;
+
+ // Get |UidTrackingInfo|.
+ UidTrackingInfo* uid_tracking_info = bpf_gpu_work_map_lookup_elem(&gpu_id_and_uid);
if (!uid_tracking_info) {
// There was no existing entry, so we add a new one.
UidTrackingInfo initial_info;
__builtin_memset(&initial_info, 0, sizeof(initial_info));
- if (0 == bpf_gpu_work_map_update_elem(&uid, &initial_info, BPF_NOEXIST)) {
+ if (0 == bpf_gpu_work_map_update_elem(&gpu_id_and_uid, &initial_info, BPF_NOEXIST)) {
// We added an entry to the map, so we increment our entry counter in
// |GlobalData|.
const uint32_t zero = 0;
@@ -154,66 +202,80 @@
__sync_fetch_and_add(&global_data->num_map_entries, 1);
}
}
- uid_tracking_info = bpf_gpu_work_map_lookup_elem(&uid);
+ uid_tracking_info = bpf_gpu_work_map_lookup_elem(&gpu_id_and_uid);
if (!uid_tracking_info) {
// This should never happen, unless entries are getting deleted at
// this moment. If so, we just give up.
- return 0;
+ return ALLOW;
}
}
- // The period duration must be non-zero.
- if (args->start_time_ns >= args->end_time_ns) {
+ if (
+ // The period duration must be non-zero.
+ period->start_time_ns >= period->end_time_ns ||
+ // The period duration must be at most 1 second.
+ (period->end_time_ns - period->start_time_ns) > S_IN_NS) {
__sync_fetch_and_add(&uid_tracking_info->error_count, 1);
- return 0;
+ return ALLOW;
}
- // The frequency must not be 0.
- if (args->frequency_khz == 0) {
- __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
- return 0;
+ // If |total_active_duration_ns| is 0 then no GPU work occurred and there is
+ // nothing to do.
+ if (period->total_active_duration_ns == 0) {
+ return ALLOW;
}
- // Calculate the frequency index: see |UidTrackingInfo.frequency_times_ns|.
- // Round to the nearest 50MHz bucket.
- uint32_t frequency_index =
- ((args->frequency_khz / MHZ_IN_KHZS) + (kFrequencyGranularityMhz / 2)) /
- kFrequencyGranularityMhz;
- if (frequency_index >= kNumTrackedFrequencies) {
- frequency_index = kNumTrackedFrequencies - 1;
- }
+ // Update |uid_tracking_info->total_active_duration_ns|.
+ __sync_fetch_and_add(&uid_tracking_info->total_active_duration_ns,
+ period->total_active_duration_ns);
- // Never round down to 0MHz, as this is a special bucket (see below) and not
- // an actual operating point.
- if (frequency_index == 0) {
- frequency_index = 1;
- }
-
- // Update time in state.
- __sync_fetch_and_add(&uid_tracking_info->frequency_times_ns[frequency_index],
- args->end_time_ns - args->start_time_ns);
-
- if (uid_tracking_info->previous_end_time_ns > args->start_time_ns) {
- // This must not happen because per-UID periods must not overlap and
- // must be emitted in order.
+ // |small_gap_time_ns| is the time gap between the current and previous
+ // active period, which could be 0. If the gap is more than
+ // |SMALL_TIME_GAP_LIMIT_NS| then |small_gap_time_ns| will be set to 0
+ // because we want to estimate the small gaps between "continuous" GPU work.
+ uint64_t small_gap_time_ns = 0;
+ if (uid_tracking_info->previous_active_end_time_ns > period->start_time_ns) {
+ // The current period appears to have occurred before the previous
+ // active period, which must not happen because per-UID periods must not
+ // overlap and must be emitted in strictly increasing order of
+ // |start_time_ns|.
__sync_fetch_and_add(&uid_tracking_info->error_count, 1);
} else {
- // The period appears to have been emitted after the previous, as
- // expected, so we can calculate the gap between this and the previous
- // period.
- const uint64_t gap_time = args->start_time_ns - uid_tracking_info->previous_end_time_ns;
+ // The current period appears to have been emitted after the previous
+ // active period, as expected, so we can calculate the gap between the
+ // current and previous active period.
+ small_gap_time_ns = period->start_time_ns - uid_tracking_info->previous_active_end_time_ns;
- // Update |previous_end_time_ns|.
- uid_tracking_info->previous_end_time_ns = args->end_time_ns;
+ // Update |previous_active_end_time_ns|.
+ uid_tracking_info->previous_active_end_time_ns = period->end_time_ns;
- // Update the special 0MHz frequency time, which stores the gaps between
- // periods, but only if the gap is < 1 second.
- if (gap_time > 0 && gap_time < S_IN_NS) {
- __sync_fetch_and_add(&uid_tracking_info->frequency_times_ns[0], gap_time);
+ // We want to estimate the small gaps between "continuous" GPU work; if
+ // the gap is more than |SMALL_TIME_GAP_LIMIT_NS| then we don't consider
+ // this "continuous" GPU work.
+ if (small_gap_time_ns > SMALL_TIME_GAP_LIMIT_NS) {
+ small_gap_time_ns = 0;
}
}
- return 0;
+ uint64_t period_total_inactive_time_ns = 0;
+ const uint64_t period_duration_ns = period->end_time_ns - period->start_time_ns;
+ // |period->total_active_duration_ns| is the active time within the period duration, so
+ // it must not be larger than |period_duration_ns|.
+ if (period->total_active_duration_ns > period_duration_ns) {
+ __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
+ } else {
+ period_total_inactive_time_ns = period_duration_ns - period->total_active_duration_ns;
+ }
+
+ // Update |uid_tracking_info->total_inactive_duration_ns| by adding the
+ // inactive time from this period, plus the small gap between the current
+ // and previous active period. Either or both of these values could be 0.
+ if (small_gap_time_ns > 0 || period_total_inactive_time_ns > 0) {
+ __sync_fetch_and_add(&uid_tracking_info->total_inactive_duration_ns,
+ small_gap_time_ns + period_total_inactive_time_ns);
+ }
+
+ return ALLOW;
}
LICENSE("Apache 2.0");
diff --git a/services/gpuservice/gpuwork/bpfprogs/include/gpuwork/gpu_work.h b/services/gpuservice/gpuwork/bpfprogs/include/gpuwork/gpu_work.h
index 4fe8464..57338f4 100644
--- a/services/gpuservice/gpuwork/bpfprogs/include/gpuwork/gpu_work.h
+++ b/services/gpuservice/gpuwork/bpfprogs/include/gpuwork/gpu_work.h
@@ -25,18 +25,28 @@
namespace gpuwork {
#endif
+typedef struct {
+ uint32_t gpu_id;
+ uint32_t uid;
+} GpuIdUid;
+
typedef struct {
- // The end time of the previous period from this UID in nanoseconds.
- uint64_t previous_end_time_ns;
+ // The end time of the previous period where the GPU was active for the UID,
+ // in nanoseconds.
+ uint64_t previous_active_end_time_ns;
- // The time spent at each GPU frequency while running GPU work from the UID,
- // in nanoseconds. Array index i stores the time for frequency i*50 MHz. So
- // index 0 is 0Mhz, index 1 is 50MHz, index 2 is 100MHz, etc., up to index
- // |kNumTrackedFrequencies|.
- uint64_t frequency_times_ns[21];
+ // The total amount of time the GPU has spent running work for the UID, in
+ // nanoseconds.
+ uint64_t total_active_duration_ns;
- // The number of times we received |GpuUidWorkPeriodEvent| events in an
- // unexpected order. See |GpuUidWorkPeriodEvent|.
+ // The total amount of time of the "gaps" between "continuous" GPU work for
+ // the UID, in nanoseconds. This is estimated by ignoring large gaps between
+ // GPU work for this UID.
+ uint64_t total_inactive_duration_ns;
+
+ // The number of errors detected due to |GpuWorkPeriodEvent| events for the
+ // UID violating the specification in some way. E.g. periods with a zero or
+ // negative duration.
uint32_t error_count;
} UidTrackingInfo;
@@ -48,14 +58,10 @@
uint64_t num_map_entries;
} GlobalData;
-static const uint32_t kMaxTrackedUids = 512;
-static const uint32_t kFrequencyGranularityMhz = 50;
-static const uint32_t kNumTrackedFrequencies = 21;
+// The maximum number of tracked GPU ID and UID pairs (|GpuIdUid|).
+static const uint32_t kMaxTrackedGpuIdUids = 512;
#ifdef __cplusplus
-static_assert(kNumTrackedFrequencies ==
- std::extent<decltype(UidTrackingInfo::frequency_times_ns)>::value);
-
} // namespace gpuwork
} // namespace android
#endif
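
The worked example (A-D) in the gpu_work.c comment above is equivalent to
summing the union of the per-work [start, end) intervals. A sketch of that
computation, which a driver with exact per-work timestamps could use to
derive |total_active_duration_ns| (drivers without such timestamps would
approximate instead):

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Total time covered by the union of [start, end) intervals: sort,
    // merge overlapping or touching runs, and sum the merged durations.
    uint64_t totalActiveDurationNs(
            std::vector<std::pair<uint64_t, uint64_t>> work) {
        std::sort(work.begin(), work.end());
        uint64_t total = 0;
        bool open = false;
        uint64_t curStart = 0;
        uint64_t curEnd = 0;
        for (const auto& [start, end] : work) {
            if (!open || start > curEnd) {
                if (open) total += curEnd - curStart;  // close previous run
                curStart = start;
                curEnd = end;
                open = true;
            } else {
                curEnd = std::max(curEnd, end);  // extend current run
            }
        }
        if (open) total += curEnd - curStart;
        return total;
    }

    // The A-D example: {{200000000, 400000000}, {300000000, 500000000},
    // {300000000, 400000000}, {600000000, 700000000}} yields 400000000 ns,
    // matching the comment above.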
diff --git a/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h b/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
index b6f493d..acecd56 100644
--- a/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
+++ b/services/gpuservice/gpuwork/include/gpuwork/GpuWork.h
@@ -41,7 +41,7 @@
void initialize();
- // Dumps the GPU time in frequency state information.
+ // Dumps the GPU work information.
void dump(const Vector<String16>& args, std::string* result);
private:
@@ -55,7 +55,7 @@
AStatsEventList* data,
void* cookie);
- AStatsManager_PullAtomCallbackReturn pullFrequencyAtoms(AStatsEventList* data);
+ AStatsManager_PullAtomCallbackReturn pullWorkAtoms(AStatsEventList* data);
// Periodically calls |clearMapIfNeeded| to clear the |mGpuWorkMap| map, if
// needed.
@@ -88,7 +88,7 @@
std::mutex mMutex;
// BPF map for per-UID GPU work.
- bpf::BpfMap<Uid, UidTrackingInfo> mGpuWorkMap GUARDED_BY(mMutex);
+ bpf::BpfMap<GpuIdUid, UidTrackingInfo> mGpuWorkMap GUARDED_BY(mMutex);
// BPF map containing a single element for global data.
bpf::BpfMap<uint32_t, GlobalData> mGpuWorkGlobalDataMap GUARDED_BY(mMutex);
@@ -110,7 +110,18 @@
bool mStatsdRegistered GUARDED_BY(mMutex) = false;
// The number of randomly chosen (i.e. sampled) UIDs to log stats for.
- static constexpr int kNumSampledUids = 10;
+ static constexpr size_t kNumSampledUids = 10;
+
+ // A "large" number of GPUs. If we observe more GPUs than this limit then
+ // we reduce the amount of stats we log.
+ static constexpr size_t kNumGpusSoftLimit = 4;
+
+ // A "very large" number of GPUs. If we observe more GPUs than this limit
+ // then we don't log any stats.
+ static constexpr size_t kNumGpusHardLimit = 32;
+
+ // The minimum GPU time needed to actually log stats for a UID.
+ static constexpr uint64_t kMinGpuTimeNanoseconds = 30ULL * 1000000000ULL; // 30 seconds.
// The previous time point at which |mGpuWorkMap| was cleared.
std::chrono::steady_clock::time_point mPreviousMapClearTimePoint GUARDED_BY(mMutex);
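
A compact model of how the constants above combine in pullWorkAtoms() (the
helper name is hypothetical; this just restates the policy):

    #include <cstddef>
    #include <optional>

    // How many UIDs to sample, given the number of distinct GPU IDs seen.
    // Returns nullopt when no atoms should be logged at all.
    std::optional<size_t> numUidsToSample(size_t numGpus) {
        constexpr size_t kNumSampledUids = 10;
        constexpr size_t kNumGpusSoftLimit = 4;
        constexpr size_t kNumGpusHardLimit = 32;
        if (numGpus > kNumGpusHardLimit) return std::nullopt;  // log nothing
        if (numGpus > kNumGpusSoftLimit) return 1;  // many GPUs: 1 UID only
        return kNumSampledUids;  // normal case
    }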