Add more statistics on failure cases for value metrics
This adds to PulledAtomStats and creates a new stats atom AtomMetricStats.
Test: none
Bug: 123572169
Change-Id: Ifea682005210a6342a027643e162f8ce7785cb06
diff --git a/cmds/statsd/src/external/StatsCompanionServicePuller.cpp b/cmds/statsd/src/external/StatsCompanionServicePuller.cpp
index 4e4b8f3..f37d2be 100644
--- a/cmds/statsd/src/external/StatsCompanionServicePuller.cpp
+++ b/cmds/statsd/src/external/StatsCompanionServicePuller.cpp
@@ -55,6 +55,10 @@
Status status = statsCompanionServiceCopy->pullData(mTagId, &returned_value);
if (!status.isOk()) {
ALOGW("StatsCompanionServicePuller::pull failed for %d", mTagId);
+ StatsdStats::getInstance().noteStatsCompanionPullFailed(mTagId);
+ if (status.exceptionCode() == Status::Exception::EX_TRANSACTION_FAILED) {
+ StatsdStats::getInstance().noteStatsCompanionPullBinderTransactionFailed(mTagId);
+ }
return false;
}
data->clear();
diff --git a/cmds/statsd/src/external/StatsPullerManager.cpp b/cmds/statsd/src/external/StatsPullerManager.cpp
index ba7bcc4..a6ba2ca 100644
--- a/cmds/statsd/src/external/StatsPullerManager.cpp
+++ b/cmds/statsd/src/external/StatsPullerManager.cpp
@@ -231,6 +231,9 @@
if (kAllPullAtomInfo.find(tagId) != kAllPullAtomInfo.end()) {
bool ret = kAllPullAtomInfo.find(tagId)->second.puller->Pull(data);
VLOG("pulled %d items", (int)data->size());
+ if (!ret) {
+ StatsdStats::getInstance().notePullFailed(tagId);
+ }
return ret;
} else {
VLOG("Unknown tagId %d", tagId);
diff --git a/cmds/statsd/src/guardrail/StatsdStats.cpp b/cmds/statsd/src/guardrail/StatsdStats.cpp
index f4d0144..37ccad5 100644
--- a/cmds/statsd/src/guardrail/StatsdStats.cpp
+++ b/cmds/statsd/src/guardrail/StatsdStats.cpp
@@ -403,6 +403,60 @@
mSystemServerRestartSec.push_back(timeSec);
}
+void StatsdStats::notePullFailed(int atomId) {
+ lock_guard<std::mutex> lock(mLock);
+ mPulledAtomStats[atomId].pullFailed++;
+}
+
+void StatsdStats::noteStatsCompanionPullFailed(int atomId) {
+ lock_guard<std::mutex> lock(mLock);
+ mPulledAtomStats[atomId].statsCompanionPullFailed++;
+}
+
+void StatsdStats::noteStatsCompanionPullBinderTransactionFailed(int atomId) {
+ lock_guard<std::mutex> lock(mLock);
+ mPulledAtomStats[atomId].statsCompanionPullBinderTransactionFailed++;
+}
+
+void StatsdStats::noteEmptyData(int atomId) {
+ lock_guard<std::mutex> lock(mLock);
+ mPulledAtomStats[atomId].emptyData++;
+}
+
+void StatsdStats::noteHardDimensionLimitReached(int metricId) {
+ lock_guard<std::mutex> lock(mLock);
+ getAtomMetricStats(metricId).hardDimensionLimitReached++;
+}
+
+void StatsdStats::noteLateLogEventSkipped(int metricId) {
+ lock_guard<std::mutex> lock(mLock);
+ getAtomMetricStats(metricId).lateLogEventSkipped++;
+}
+
+void StatsdStats::noteSkippedForwardBuckets(int metricId) {
+ lock_guard<std::mutex> lock(mLock);
+ getAtomMetricStats(metricId).skippedForwardBuckets++;
+}
+
+void StatsdStats::noteBadValueType(int metricId) {
+ lock_guard<std::mutex> lock(mLock);
+ getAtomMetricStats(metricId).badValueType++;
+}
+
+void StatsdStats::noteConditionChangeInNextBucket(int metricId) {
+ lock_guard<std::mutex> lock(mLock);
+ getAtomMetricStats(metricId).conditionChangeInNextBucket++;
+}
+
+StatsdStats::AtomMetricStats& StatsdStats::getAtomMetricStats(int metricId) {
+ auto atomMetricStatsIter = mAtomMetricStats.find(metricId);
+ if (atomMetricStatsIter != mAtomMetricStats.end()) {
+ return atomMetricStatsIter->second;
+ }
+ auto emplaceResult = mAtomMetricStats.emplace(metricId, AtomMetricStats());
+ return emplaceResult.first->second;
+}
+
void StatsdStats::reset() {
lock_guard<std::mutex> lock(mLock);
resetInternalLocked();
@@ -442,6 +496,7 @@
pullStats.second.pullTimeout = 0;
pullStats.second.pullExceedMaxDelay = 0;
}
+ mAtomMetricStats.clear();
}
string buildTimeString(int64_t timeSec) {
@@ -713,6 +768,10 @@
android::os::statsd::writePullerStatsToStream(pair, &proto);
}
+ for (const auto& pair : mAtomMetricStats) {
+ android::os::statsd::writeAtomMetricStatsToStream(pair, &proto);
+ }
+
if (mAnomalyAlarmRegisteredStats > 0) {
uint64_t token = proto.start(FIELD_TYPE_MESSAGE | FIELD_ID_ANOMALY_ALARM_STATS);
proto.write(FIELD_TYPE_INT32 | FIELD_ID_ANOMALY_ALARMS_REGISTERED,
diff --git a/cmds/statsd/src/guardrail/StatsdStats.h b/cmds/statsd/src/guardrail/StatsdStats.h
index dc647f8..01e9ca1 100644
--- a/cmds/statsd/src/guardrail/StatsdStats.h
+++ b/cmds/statsd/src/guardrail/StatsdStats.h
@@ -318,6 +318,53 @@
void noteLogLost(int32_t wallClockTimeSec, int32_t count, int lastError);
/**
+ * Records that the pull of an atom has failed
+ */
+ void notePullFailed(int atomId);
+
+ /**
+ * Records that the pull of StatsCompanionService atom has failed
+ */
+ void noteStatsCompanionPullFailed(int atomId);
+
+ /**
+ * Records that the pull of a StatsCompanionService atom has failed due to a failed binder
+ * transaction. This can happen when StatsCompanionService returns too
+ * much data (the max Binder parcel size is 1MB)
+ */
+ void noteStatsCompanionPullBinderTransactionFailed(int atomId);
+
+ /**
+ * A pull with no data occurred
+ */
+ void noteEmptyData(int atomId);
+
+ /**
+ * Hard limit was reached in the cardinality of an atom
+ */
+ void noteHardDimensionLimitReached(int atomId);
+
+ /**
+ * A log event was too late, arrived in the wrong bucket and was skipped
+ */
+ void noteLateLogEventSkipped(int atomId);
+
+ /**
+ * Buckets were skipped as time elapsed without any data for them
+ */
+ void noteSkippedForwardBuckets(int atomId);
+
+ /**
+ * An unsupported value type was received
+ */
+ void noteBadValueType(int atomId);
+
+ /**
+ * A condition change was too late, arrived in the wrong bucket and was skipped
+ */
+ void noteConditionChangeInNextBucket(int atomId);
+
+ /**
* Reset the historical stats. Including all stats in icebox, and the tracked stats about
* metrics, matchers, and atoms. The active configs will be kept and StatsdStats will continue
* to collect stats after reset() has been called.
@@ -349,8 +396,20 @@
long dataError = 0;
long pullTimeout = 0;
long pullExceedMaxDelay = 0;
+ long pullFailed = 0;
+ long statsCompanionPullFailed = 0;
+ long statsCompanionPullBinderTransactionFailed = 0;
+ long emptyData = 0;
} PulledAtomStats;
+ typedef struct {
+ long hardDimensionLimitReached = 0;
+ long lateLogEventSkipped = 0;
+ long skippedForwardBuckets = 0;
+ long badValueType = 0;
+ long conditionChangeInNextBucket = 0;
+ } AtomMetricStats;
+
private:
StatsdStats();
@@ -378,6 +437,9 @@
// Maps PullAtomId to its stats. The size is capped by the puller atom counts.
std::map<int, PulledAtomStats> mPulledAtomStats;
+ // Maps metric ID to its stats. The size is capped by the number of metrics.
+ std::map<int, AtomMetricStats> mAtomMetricStats;
+
struct LogLossStats {
LogLossStats(int32_t sec, int32_t count, int32_t error)
: mWallClockSec(sec), mCount(count), mLastError(error) {
@@ -414,6 +476,12 @@
void addToIceBoxLocked(std::shared_ptr<ConfigStats>& stats);
+ /**
+ * Get a reference to AtomMetricStats for a metric. If none exists, create it. The reference
+ * will live as long as `this`.
+ */
+ StatsdStats::AtomMetricStats& getAtomMetricStats(int metricId);
+
FRIEND_TEST(StatsdStatsTest, TestValidConfigAdd);
FRIEND_TEST(StatsdStatsTest, TestInvalidConfigAdd);
FRIEND_TEST(StatsdStatsTest, TestConfigRemove);
diff --git a/cmds/statsd/src/metrics/ValueMetricProducer.cpp b/cmds/statsd/src/metrics/ValueMetricProducer.cpp
index 5645461..6aa8e84 100644
--- a/cmds/statsd/src/metrics/ValueMetricProducer.cpp
+++ b/cmds/statsd/src/metrics/ValueMetricProducer.cpp
@@ -322,6 +322,7 @@
if (eventTimeNs < mCurrentBucketStartTimeNs) {
VLOG("Skip event due to late arrival: %lld vs %lld", (long long)eventTimeNs,
(long long)mCurrentBucketStartTimeNs);
+ StatsdStats::getInstance().noteConditionChangeInNextBucket(mMetricId);
return;
}
@@ -359,6 +360,12 @@
}
StatsdStats::getInstance().notePullDelay(mPullTagId, pullDelayNs);
+ if (timestampNs < mCurrentBucketStartTimeNs) {
+ // The data will be skipped in onMatchedLogEventInternalLocked, but we don't want to report
+ // for every event, just the pull
+ StatsdStats::getInstance().noteLateLogEventSkipped(mMetricId);
+ }
+
for (const auto& data : allData) {
// make a copy before doing and changes
LogEvent localCopy = data->makeCopy();
@@ -380,6 +387,7 @@
if (mCondition) {
if (allData.size() == 0) {
VLOG("Data pulled is empty");
+ StatsdStats::getInstance().noteEmptyData(mPullTagId);
return;
}
// For scheduled pulled data, the effective event time is snap to the nearest
@@ -394,6 +402,7 @@
if (bucketEndTime < mCurrentBucketStartTimeNs) {
VLOG("Skip bucket end pull due to late arrival: %lld vs %lld", (long long)bucketEndTime,
(long long)mCurrentBucketStartTimeNs);
+ StatsdStats::getInstance().noteLateLogEventSkipped(mMetricId);
return;
}
for (const auto& data : allData) {
@@ -442,6 +451,7 @@
if (newTupleCount > mDimensionHardLimit) {
ALOGE("ValueMetric %lld dropping data for dimension key %s", (long long)mMetricId,
newKey.toString().c_str());
+ StatsdStats::getInstance().noteHardDimensionLimitReached(mMetricId);
return true;
}
}
@@ -539,6 +549,7 @@
Value value;
if (!getDoubleOrLong(event, matcher, value)) {
VLOG("Failed to get value %d from event %s", i, event.ToString().c_str());
+ StatsdStats::getInstance().noteBadValueType(mMetricId);
return;
}
interval.seenNewData = true;
@@ -656,6 +667,7 @@
if (numBucketsForward > 1) {
VLOG("Skipping forward %lld buckets", (long long)numBucketsForward);
+ StatsdStats::getInstance().noteSkippedForwardBuckets(mMetricId);
// take base again in future good bucket.
resetBase();
}
diff --git a/cmds/statsd/src/stats_log.proto b/cmds/statsd/src/stats_log.proto
index e8de875..cca09ac 100644
--- a/cmds/statsd/src/stats_log.proto
+++ b/cmds/statsd/src/stats_log.proto
@@ -403,9 +403,23 @@
optional int64 data_error = 9;
optional int64 pull_timeout = 10;
optional int64 pull_exceed_max_delay = 11;
+ optional int64 pull_failed = 12;
+ optional int64 stats_companion_pull_failed = 13;
+ optional int64 stats_companion_pull_binder_transaction_failed = 14;
+ optional int64 empty_data = 15;
}
repeated PulledAtomStats pulled_atom_stats = 10;
+ message AtomMetricStats {
+ optional int64 metric_id = 1;
+ optional int64 hard_dimension_limit_reached = 2;
+ optional int64 late_log_event_skipped = 3;
+ optional int64 skipped_forward_buckets = 4;
+ optional int64 bad_value_type = 5;
+ optional int64 condition_change_in_next_bucket = 6;
+ }
+ repeated AtomMetricStats atom_metric_stats = 17;
+
message LoggerErrorStats {
optional int32 logger_disconnection_sec = 1;
optional int32 error_code = 2;
diff --git a/cmds/statsd/src/stats_log_util.cpp b/cmds/statsd/src/stats_log_util.cpp
index 7de0bb3..9c9985e 100644
--- a/cmds/statsd/src/stats_log_util.cpp
+++ b/cmds/statsd/src/stats_log_util.cpp
@@ -66,6 +66,18 @@
const int FIELD_ID_DATA_ERROR = 9;
const int FIELD_ID_PULL_TIMEOUT = 10;
const int FIELD_ID_PULL_EXCEED_MAX_DELAY = 11;
+const int FIELD_ID_PULL_FAILED = 12;
+const int FIELD_ID_STATS_COMPANION_FAILED = 13;
+const int FIELD_ID_STATS_COMPANION_BINDER_TRANSACTION_FAILED = 14;
+const int FIELD_ID_EMPTY_DATA = 15;
+// for AtomMetricStats proto
+const int FIELD_ID_ATOM_METRIC_STATS = 17;
+const int FIELD_ID_METRIC_ID = 1;
+const int FIELD_ID_HARD_DIMENSION_LIMIT_REACHED = 2;
+const int FIELD_ID_LATE_LOG_EVENT_SKIPPED = 3;
+const int FIELD_ID_SKIPPED_FORWARD_BUCKETS = 4;
+const int FIELD_ID_BAD_VALUE_TYPE = 5;
+const int FIELD_ID_CONDITION_CHANGE_IN_NEXT_BUCKET = 6;
namespace {
@@ -456,6 +468,32 @@
(long long)pair.second.pullTimeout);
protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_PULL_EXCEED_MAX_DELAY,
(long long)pair.second.pullExceedMaxDelay);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_PULL_FAILED,
+ (long long)pair.second.pullFailed);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_STATS_COMPANION_FAILED,
+ (long long)pair.second.statsCompanionPullFailed);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_STATS_COMPANION_BINDER_TRANSACTION_FAILED,
+ (long long)pair.second.statsCompanionPullBinderTransactionFailed);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_EMPTY_DATA,
+ (long long)pair.second.emptyData);
+ protoOutput->end(token);
+}
+
+void writeAtomMetricStatsToStream(const std::pair<int, StatsdStats::AtomMetricStats> &pair,
+ util::ProtoOutputStream *protoOutput) {
+ uint64_t token = protoOutput->start(FIELD_TYPE_MESSAGE | FIELD_ID_ATOM_METRIC_STATS |
+ FIELD_COUNT_REPEATED);
+ protoOutput->write(FIELD_TYPE_INT32 | FIELD_ID_METRIC_ID, (int32_t)pair.first);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_HARD_DIMENSION_LIMIT_REACHED,
+ (long long)pair.second.hardDimensionLimitReached);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_LATE_LOG_EVENT_SKIPPED,
+ (long long)pair.second.lateLogEventSkipped);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_SKIPPED_FORWARD_BUCKETS,
+ (long long)pair.second.skippedForwardBuckets);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_BAD_VALUE_TYPE,
+ (long long)pair.second.badValueType);
+ protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_CONDITION_CHANGE_IN_NEXT_BUCKET,
+ (long long)pair.second.conditionChangeInNextBucket);
protoOutput->end(token);
}
diff --git a/cmds/statsd/src/stats_log_util.h b/cmds/statsd/src/stats_log_util.h
index 61f31eb..dcea0e6 100644
--- a/cmds/statsd/src/stats_log_util.h
+++ b/cmds/statsd/src/stats_log_util.h
@@ -73,6 +73,10 @@
void writePullerStatsToStream(const std::pair<int, StatsdStats::PulledAtomStats>& pair,
util::ProtoOutputStream* protoOutput);
+// Helper function to write AtomMetricStats to ProtoOutputStream
+void writeAtomMetricStatsToStream(const std::pair<int, StatsdStats::AtomMetricStats> &pair,
+ util::ProtoOutputStream *protoOutput);
+
template<class T>
bool parseProtoOutputStream(util::ProtoOutputStream& protoOutput, T* message) {
std::string pbBytes;