Add more statistics on failure cases for value metrics

This adds to PulledAtomStats and creates a new stats atom AtomMetricStats.

Test: none
Bug: 123572169
Change-Id: Ifea682005210a6342a027643e162f8ce7785cb06
diff --git a/cmds/statsd/src/external/StatsCompanionServicePuller.cpp b/cmds/statsd/src/external/StatsCompanionServicePuller.cpp
index 4e4b8f3..f37d2be 100644
--- a/cmds/statsd/src/external/StatsCompanionServicePuller.cpp
+++ b/cmds/statsd/src/external/StatsCompanionServicePuller.cpp
@@ -55,6 +55,10 @@
         Status status = statsCompanionServiceCopy->pullData(mTagId, &returned_value);
         if (!status.isOk()) {
             ALOGW("StatsCompanionServicePuller::pull failed for %d", mTagId);
+            StatsdStats::getInstance().noteStatsCompanionPullFailed(mTagId);
+            if (status.exceptionCode() == Status::Exception::EX_TRANSACTION_FAILED) {
+                StatsdStats::getInstance().noteStatsCompanionPullBinderTransactionFailed(mTagId);
+            }
             return false;
         }
         data->clear();
diff --git a/cmds/statsd/src/external/StatsPullerManager.cpp b/cmds/statsd/src/external/StatsPullerManager.cpp
index ba7bcc4..a6ba2ca 100644
--- a/cmds/statsd/src/external/StatsPullerManager.cpp
+++ b/cmds/statsd/src/external/StatsPullerManager.cpp
@@ -231,6 +231,9 @@
     if (kAllPullAtomInfo.find(tagId) != kAllPullAtomInfo.end()) {
         bool ret = kAllPullAtomInfo.find(tagId)->second.puller->Pull(data);
         VLOG("pulled %d items", (int)data->size());
+        if (!ret) {
+            StatsdStats::getInstance().notePullFailed(tagId);
+        }
         return ret;
     } else {
         VLOG("Unknown tagId %d", tagId);
diff --git a/cmds/statsd/src/guardrail/StatsdStats.cpp b/cmds/statsd/src/guardrail/StatsdStats.cpp
index f4d0144..37ccad5 100644
--- a/cmds/statsd/src/guardrail/StatsdStats.cpp
+++ b/cmds/statsd/src/guardrail/StatsdStats.cpp
@@ -403,6 +403,60 @@
     mSystemServerRestartSec.push_back(timeSec);
 }
 
+void StatsdStats::notePullFailed(int atomId) {
+    lock_guard<std::mutex> lock(mLock);
+    mPulledAtomStats[atomId].pullFailed++;
+}
+
+void StatsdStats::noteStatsCompanionPullFailed(int atomId) {
+    lock_guard<std::mutex> lock(mLock);
+    mPulledAtomStats[atomId].statsCompanionPullFailed++;
+}
+
+void StatsdStats::noteStatsCompanionPullBinderTransactionFailed(int atomId) {
+    lock_guard<std::mutex> lock(mLock);
+    mPulledAtomStats[atomId].statsCompanionPullBinderTransactionFailed++;
+}
+
+void StatsdStats::noteEmptyData(int atomId) {
+    lock_guard<std::mutex> lock(mLock);
+    mPulledAtomStats[atomId].emptyData++;
+}
+
+void StatsdStats::noteHardDimensionLimitReached(int metricId) {
+    lock_guard<std::mutex> lock(mLock);
+    getAtomMetricStats(metricId).hardDimensionLimitReached++;
+}
+
+void StatsdStats::noteLateLogEventSkipped(int metricId) {
+    lock_guard<std::mutex> lock(mLock);
+    getAtomMetricStats(metricId).lateLogEventSkipped++;
+}
+
+void StatsdStats::noteSkippedForwardBuckets(int metricId) {
+    lock_guard<std::mutex> lock(mLock);
+    getAtomMetricStats(metricId).skippedForwardBuckets++;
+}
+
+void StatsdStats::noteBadValueType(int metricId) {
+    lock_guard<std::mutex> lock(mLock);
+    getAtomMetricStats(metricId).badValueType++;
+}
+
+void StatsdStats::noteConditionChangeInNextBucket(int metricId) {
+    lock_guard<std::mutex> lock(mLock);
+    getAtomMetricStats(metricId).conditionChangeInNextBucket++;
+}
+
+StatsdStats::AtomMetricStats& StatsdStats::getAtomMetricStats(int metricId) {
+    auto atomMetricStatsIter = mAtomMetricStats.find(metricId);
+    if (atomMetricStatsIter != mAtomMetricStats.end()) {
+        return atomMetricStatsIter->second;
+    }
+    auto emplaceResult = mAtomMetricStats.emplace(metricId, AtomMetricStats());
+    return emplaceResult.first->second;
+}
+
 void StatsdStats::reset() {
     lock_guard<std::mutex> lock(mLock);
     resetInternalLocked();
@@ -442,6 +496,7 @@
         pullStats.second.pullTimeout = 0;
         pullStats.second.pullExceedMaxDelay = 0;
     }
+    mAtomMetricStats.clear();
 }
 
 string buildTimeString(int64_t timeSec) {
@@ -713,6 +768,10 @@
         android::os::statsd::writePullerStatsToStream(pair, &proto);
     }
 
+    for (const auto& pair : mAtomMetricStats) {
+        android::os::statsd::writeAtomMetricStatsToStream(pair, &proto);
+    }
+
     if (mAnomalyAlarmRegisteredStats > 0) {
         uint64_t token = proto.start(FIELD_TYPE_MESSAGE | FIELD_ID_ANOMALY_ALARM_STATS);
         proto.write(FIELD_TYPE_INT32 | FIELD_ID_ANOMALY_ALARMS_REGISTERED,
diff --git a/cmds/statsd/src/guardrail/StatsdStats.h b/cmds/statsd/src/guardrail/StatsdStats.h
index dc647f8..01e9ca1 100644
--- a/cmds/statsd/src/guardrail/StatsdStats.h
+++ b/cmds/statsd/src/guardrail/StatsdStats.h
@@ -318,6 +318,53 @@
     void noteLogLost(int32_t wallClockTimeSec, int32_t count, int lastError);
 
     /**
+     * Records that the pull of an atom has failed
+     */
+    void notePullFailed(int atomId);
+
+    /**
+     * Records that the pull of StatsCompanionService atom has failed
+     */
+    void noteStatsCompanionPullFailed(int atomId);
+
+    /**
+     * Records that the pull of a StatsCompanionService atom has failed due to a failed binder
+     * transaction. This can happen when StatsCompanionService returns too
+     * much data (the max Binder parcel size is 1MB)
+     */
+    void noteStatsCompanionPullBinderTransactionFailed(int atomId);
+
+    /**
+     * A pull with no data occurred
+     */
+    void noteEmptyData(int atomId);
+
+    /**
+     * Hard limit was reached in the cardinality of an atom
+     */
+    void noteHardDimensionLimitReached(int atomId);
+
+    /**
+     * A log event was too late, arrived in the wrong bucket and was skipped
+     */
+    void noteLateLogEventSkipped(int atomId);
+
+    /**
+     * Buckets were skipped as time elapsed without any data for them
+     */
+    void noteSkippedForwardBuckets(int atomId);
+
+    /**
+     * An unsupported value type was received
+     */
+    void noteBadValueType(int atomId);
+
+    /**
+     * A condition change was too late, arrived in the wrong bucket and was skipped
+     */
+    void noteConditionChangeInNextBucket(int atomId);
+
+    /**
      * Reset the historical stats. Including all stats in icebox, and the tracked stats about
      * metrics, matchers, and atoms. The active configs will be kept and StatsdStats will continue
      * to collect stats after reset() has been called.
@@ -349,8 +396,20 @@
         long dataError = 0;
         long pullTimeout = 0;
         long pullExceedMaxDelay = 0;
+        long pullFailed = 0;
+        long statsCompanionPullFailed = 0;
+        long statsCompanionPullBinderTransactionFailed = 0;
+        long emptyData = 0;
     } PulledAtomStats;
 
+    typedef struct {
+        long hardDimensionLimitReached = 0;
+        long lateLogEventSkipped = 0;
+        long skippedForwardBuckets = 0;
+        long badValueType = 0;
+        long conditionChangeInNextBucket = 0;
+    } AtomMetricStats;
+
 private:
     StatsdStats();
 
@@ -378,6 +437,9 @@
     // Maps PullAtomId to its stats. The size is capped by the puller atom counts.
     std::map<int, PulledAtomStats> mPulledAtomStats;
 
+    // Maps metric ID to its stats. The size is capped by the number of metrics.
+    std::map<int, AtomMetricStats> mAtomMetricStats;
+
     struct LogLossStats {
         LogLossStats(int32_t sec, int32_t count, int32_t error)
             : mWallClockSec(sec), mCount(count), mLastError(error) {
@@ -414,6 +476,12 @@
 
     void addToIceBoxLocked(std::shared_ptr<ConfigStats>& stats);
 
+    /**
+     * Get a reference to AtomMetricStats for a metric. If none exists, create it. The reference
+     * will live as long as `this`.
+     */
+    StatsdStats::AtomMetricStats& getAtomMetricStats(int metricId);
+
     FRIEND_TEST(StatsdStatsTest, TestValidConfigAdd);
     FRIEND_TEST(StatsdStatsTest, TestInvalidConfigAdd);
     FRIEND_TEST(StatsdStatsTest, TestConfigRemove);
diff --git a/cmds/statsd/src/metrics/ValueMetricProducer.cpp b/cmds/statsd/src/metrics/ValueMetricProducer.cpp
index 5645461..6aa8e84 100644
--- a/cmds/statsd/src/metrics/ValueMetricProducer.cpp
+++ b/cmds/statsd/src/metrics/ValueMetricProducer.cpp
@@ -322,6 +322,7 @@
     if (eventTimeNs < mCurrentBucketStartTimeNs) {
         VLOG("Skip event due to late arrival: %lld vs %lld", (long long)eventTimeNs,
              (long long)mCurrentBucketStartTimeNs);
+        StatsdStats::getInstance().noteConditionChangeInNextBucket(mMetricId);
         return;
     }
 
@@ -359,6 +360,12 @@
     }
     StatsdStats::getInstance().notePullDelay(mPullTagId, pullDelayNs);
 
+    if (timestampNs < mCurrentBucketStartTimeNs) {
+        // The data will be skipped in onMatchedLogEventInternalLocked, but we don't want to report
+        // for every event, just the pull
+        StatsdStats::getInstance().noteLateLogEventSkipped(mMetricId);
+    }
+
     for (const auto& data : allData) {
         // make a copy before doing and changes
         LogEvent localCopy = data->makeCopy();
@@ -380,6 +387,7 @@
     if (mCondition) {
         if (allData.size() == 0) {
             VLOG("Data pulled is empty");
+            StatsdStats::getInstance().noteEmptyData(mPullTagId);
             return;
         }
         // For scheduled pulled data, the effective event time is snap to the nearest
@@ -394,6 +402,7 @@
         if (bucketEndTime < mCurrentBucketStartTimeNs) {
             VLOG("Skip bucket end pull due to late arrival: %lld vs %lld", (long long)bucketEndTime,
                  (long long)mCurrentBucketStartTimeNs);
+            StatsdStats::getInstance().noteLateLogEventSkipped(mMetricId);
             return;
         }
         for (const auto& data : allData) {
@@ -442,6 +451,7 @@
         if (newTupleCount > mDimensionHardLimit) {
             ALOGE("ValueMetric %lld dropping data for dimension key %s", (long long)mMetricId,
                   newKey.toString().c_str());
+            StatsdStats::getInstance().noteHardDimensionLimitReached(mMetricId);
             return true;
         }
     }
@@ -539,6 +549,7 @@
         Value value;
         if (!getDoubleOrLong(event, matcher, value)) {
             VLOG("Failed to get value %d from event %s", i, event.ToString().c_str());
+            StatsdStats::getInstance().noteBadValueType(mMetricId);
             return;
         }
         interval.seenNewData = true;
@@ -656,6 +667,7 @@
 
     if (numBucketsForward > 1) {
         VLOG("Skipping forward %lld buckets", (long long)numBucketsForward);
+        StatsdStats::getInstance().noteSkippedForwardBuckets(mMetricId);
         // take base again in future good bucket.
         resetBase();
     }
diff --git a/cmds/statsd/src/stats_log.proto b/cmds/statsd/src/stats_log.proto
index e8de875..cca09ac 100644
--- a/cmds/statsd/src/stats_log.proto
+++ b/cmds/statsd/src/stats_log.proto
@@ -403,9 +403,23 @@
         optional int64 data_error = 9;
         optional int64 pull_timeout = 10;
         optional int64 pull_exceed_max_delay = 11;
+        optional int64 pull_failed = 12;
+        optional int64 stats_companion_pull_failed = 13;
+        optional int64 stats_companion_pull_binder_transaction_failed = 14;
+        optional int64 empty_data = 15;
     }
     repeated PulledAtomStats pulled_atom_stats = 10;
 
+    message AtomMetricStats {
+      optional int64 metric_id = 1;
+      optional int64 hard_dimension_limit_reached = 2;
+      optional int64 late_log_event_skipped = 3;
+      optional int64 skipped_forward_buckets = 4;
+      optional int64 bad_value_type = 5;
+      optional int64 condition_change_in_next_bucket = 6;
+    }
+    repeated AtomMetricStats atom_metric_stats = 17;
+
     message LoggerErrorStats {
         optional int32 logger_disconnection_sec = 1;
         optional int32 error_code = 2;
diff --git a/cmds/statsd/src/stats_log_util.cpp b/cmds/statsd/src/stats_log_util.cpp
index 7de0bb3..9c9985e 100644
--- a/cmds/statsd/src/stats_log_util.cpp
+++ b/cmds/statsd/src/stats_log_util.cpp
@@ -66,6 +66,18 @@
 const int FIELD_ID_DATA_ERROR = 9;
 const int FIELD_ID_PULL_TIMEOUT = 10;
 const int FIELD_ID_PULL_EXCEED_MAX_DELAY = 11;
+const int FIELD_ID_PULL_FAILED = 12;
+const int FIELD_ID_STATS_COMPANION_FAILED = 13;
+const int FIELD_ID_STATS_COMPANION_BINDER_TRANSACTION_FAILED = 14;
+const int FIELD_ID_EMPTY_DATA = 15;
+// for AtomMetricStats proto
+const int FIELD_ID_ATOM_METRIC_STATS = 17;
+const int FIELD_ID_METRIC_ID = 1;
+const int FIELD_ID_HARD_DIMENSION_LIMIT_REACHED = 2;
+const int FIELD_ID_LATE_LOG_EVENT_SKIPPED = 3;
+const int FIELD_ID_SKIPPED_FORWARD_BUCKETS = 4;
+const int FIELD_ID_BAD_VALUE_TYPE = 5;
+const int FIELD_ID_CONDITION_CHANGE_IN_NEXT_BUCKET = 6;
 
 namespace {
 
@@ -456,6 +468,32 @@
                        (long long)pair.second.pullTimeout);
     protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_PULL_EXCEED_MAX_DELAY,
                        (long long)pair.second.pullExceedMaxDelay);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_PULL_FAILED,
+                       (long long)pair.second.pullFailed);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_STATS_COMPANION_FAILED,
+                       (long long)pair.second.statsCompanionPullFailed);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_STATS_COMPANION_BINDER_TRANSACTION_FAILED,
+                       (long long)pair.second.statsCompanionPullBinderTransactionFailed);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_EMPTY_DATA,
+                       (long long)pair.second.emptyData);
+    protoOutput->end(token);
+}
+
+void writeAtomMetricStatsToStream(const std::pair<int, StatsdStats::AtomMetricStats> &pair,
+                                  util::ProtoOutputStream *protoOutput) {
+    uint64_t token = protoOutput->start(FIELD_TYPE_MESSAGE | FIELD_ID_ATOM_METRIC_STATS |
+                                        FIELD_COUNT_REPEATED);
+    protoOutput->write(FIELD_TYPE_INT32 | FIELD_ID_METRIC_ID, (int32_t)pair.first);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_HARD_DIMENSION_LIMIT_REACHED,
+                       (long long)pair.second.hardDimensionLimitReached);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_LATE_LOG_EVENT_SKIPPED,
+                       (long long)pair.second.lateLogEventSkipped);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_SKIPPED_FORWARD_BUCKETS,
+                       (long long)pair.second.skippedForwardBuckets);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_BAD_VALUE_TYPE,
+                       (long long)pair.second.badValueType);
+    protoOutput->write(FIELD_TYPE_INT64 | FIELD_ID_CONDITION_CHANGE_IN_NEXT_BUCKET,
+                       (long long)pair.second.conditionChangeInNextBucket);
     protoOutput->end(token);
 }
 
diff --git a/cmds/statsd/src/stats_log_util.h b/cmds/statsd/src/stats_log_util.h
index 61f31eb..dcea0e6 100644
--- a/cmds/statsd/src/stats_log_util.h
+++ b/cmds/statsd/src/stats_log_util.h
@@ -73,6 +73,10 @@
 void writePullerStatsToStream(const std::pair<int, StatsdStats::PulledAtomStats>& pair,
                               util::ProtoOutputStream* protoOutput);
 
+// Helper function to write AtomMetricStats to ProtoOutputStream
+void writeAtomMetricStatsToStream(const std::pair<int, StatsdStats::AtomMetricStats> &pair,
+                                  util::ProtoOutputStream *protoOutput);
+
 template<class T>
 bool parseProtoOutputStream(util::ProtoOutputStream& protoOutput, T* message) {
     std::string pbBytes;