MediaMetrics: Track audioserver restart times

Log construction events through AudioFlinger
and timeouts through AudioFlinger and AudioPolicy.

Test: atest mediametrics_tests
Test: instrumented build with timeouts
Test: adb shell dumpsys media.metrics
Bug: 219958414
Change-Id: If93bde1d3d23ad1fd82e776e222d7ebe6e863b22
diff --git a/services/mediametrics/AudioAnalytics.cpp b/services/mediametrics/AudioAnalytics.cpp
index aacc2be..99e3691 100644
--- a/services/mediametrics/AudioAnalytics.cpp
+++ b/services/mediametrics/AudioAnalytics.cpp
@@ -297,33 +297,35 @@
     ALOGD("%s", __func__);
 
     // Add action to save AnalyticsState if audioserver is restarted.
-    // This triggers on an item of "audio.flinger"
-    // with a property "event" set to "AudioFlinger" (the constructor).
+    // This triggers on AudioFlinger or AudioPolicy ctors and onFirstRef,
+    // as well as TimeCheck events.
     mActions.addAction(
         AMEDIAMETRICS_KEY_AUDIO_FLINGER "." AMEDIAMETRICS_PROP_EVENT,
         std::string(AMEDIAMETRICS_PROP_EVENT_VALUE_CTOR),
         std::make_shared<AnalyticsActions::Function>(
             [this](const std::shared_ptr<const android::mediametrics::Item> &item){
-                ALOGW("(key=%s) Audioflinger constructor event detected", item->getKey().c_str());
-                mPreviousAnalyticsState.set(std::make_shared<AnalyticsState>(
-                        *mAnalyticsState.get()));
-                // Note: get returns shared_ptr temp, whose lifetime is extended
-                // to end of full expression.
-                mAnalyticsState->clear();  // TODO: filter the analytics state.
-                // Perhaps report this.
-
-                // Set up a timer to expire the previous audio state to save space.
-                // Use the transaction log size as a cookie to see if it is the
-                // same as before.  A benign race is possible where a state is cleared early.
-                const size_t size = mPreviousAnalyticsState->transactionLog().size();
-                mTimedAction.postIn(
-                        std::chrono::seconds(PREVIOUS_STATE_EXPIRE_SEC), [this, size](){
-                    if (mPreviousAnalyticsState->transactionLog().size() == size) {
-                        ALOGD("expiring previous audio state after %d seconds.",
-                                PREVIOUS_STATE_EXPIRE_SEC);
-                        mPreviousAnalyticsState->clear();  // removes data from the state.
-                    }
-                });
+                mHealth.onAudioServerStart(Health::Module::AUDIOFLINGER, item);
+            }));
+    mActions.addAction(
+        AMEDIAMETRICS_KEY_AUDIO_POLICY "." AMEDIAMETRICS_PROP_EVENT,
+        std::string(AMEDIAMETRICS_PROP_EVENT_VALUE_CTOR),
+        std::make_shared<AnalyticsActions::Function>(
+            [this](const std::shared_ptr<const android::mediametrics::Item> &item){
+                mHealth.onAudioServerStart(Health::Module::AUDIOPOLICY, item);
+            }));
+    mActions.addAction(
+        AMEDIAMETRICS_KEY_AUDIO_FLINGER "." AMEDIAMETRICS_PROP_EVENT,
+        std::string(AMEDIAMETRICS_PROP_EVENT_VALUE_TIMEOUT),
+        std::make_shared<AnalyticsActions::Function>(
+            [this](const std::shared_ptr<const android::mediametrics::Item> &item){
+                mHealth.onAudioServerTimeout(Health::Module::AUDIOFLINGER, item);
+            }));
+    mActions.addAction(
+        AMEDIAMETRICS_KEY_AUDIO_POLICY "." AMEDIAMETRICS_PROP_EVENT,
+        std::string(AMEDIAMETRICS_PROP_EVENT_VALUE_TIMEOUT),
+        std::make_shared<AnalyticsActions::Function>(
+            [this](const std::shared_ptr<const android::mediametrics::Item> &item){
+                mHealth.onAudioServerTimeout(Health::Module::AUDIOPOLICY, item);
             }));
 
     // Handle legacy aaudio playback stream statistics
@@ -1390,4 +1392,138 @@
     }
 }
 
+// Create new state, typically occurs after an AudioFlinger ctor event.
+void AudioAnalytics::newState()
+{
+    mPreviousAnalyticsState.set(std::make_shared<AnalyticsState>(
+            *mAnalyticsState.get()));
+    // Note: get returns shared_ptr temp, whose lifetime is extended
+    // to end of full expression.
+    mAnalyticsState->clear();  // TODO: filter the analytics state.
+    // Perhaps report this.
+
+    // Set up a timer to expire the previous audio state to save space.
+    // Use the transaction log size as a cookie to see if it is the
+    // same as before.  A benign race is possible where a state is cleared early.
+    const size_t size = mPreviousAnalyticsState->transactionLog().size();
+    mTimedAction.postIn(
+            std::chrono::seconds(PREVIOUS_STATE_EXPIRE_SEC), [this, size](){
+        if (mPreviousAnalyticsState->transactionLog().size() == size) {
+            ALOGD("expiring previous audio state after %d seconds.",
+                    PREVIOUS_STATE_EXPIRE_SEC);
+            mPreviousAnalyticsState->clear();  // removes data from the state.
+        }
+    });
+}
+
+void AudioAnalytics::Health::onAudioServerStart(Module module,
+        const std::shared_ptr<const android::mediametrics::Item> &item)
+{
+    const auto nowTime = std::chrono::system_clock::now();
+    if (module == Module::AUDIOFLINGER) {
+       {
+            std::lock_guard lg(mLock);
+            // reset state on AudioFlinger construction.
+            // AudioPolicy is created after AudioFlinger.
+            mAudioFlingerCtorTime = nowTime;
+            mSimpleLog.log("AudioFlinger ctor");
+        }
+        mAudioAnalytics.newState();
+        return;
+    }
+    if (module == Module::AUDIOPOLICY) {
+        // A start event occurs when audioserver
+        //
+        // (1) Starts the first time
+        // (2) Restarts because of the TimeCheck watchdog
+        // (3) Restarts not because of the TimeCheck watchdog.
+        int64_t executionTimeNs = 0;
+        (void)item->get(AMEDIAMETRICS_PROP_EXECUTIONTIMENS, &executionTimeNs);
+        const float loadTimeMs = executionTimeNs * 1e-6f;
+        std::lock_guard lg(mLock);
+        const int64_t restarts = mStartCount;
+        if (mStopCount == mStartCount) {
+            mAudioPolicyCtorTime = nowTime;
+            ++mStartCount;
+            if (mStopCount == 0) {
+                // (1) First time initialization.
+                ALOGW("%s: (key=%s) AudioPolicy ctor, loadTimeMs:%f",
+                        __func__, item->getKey().c_str(), loadTimeMs);
+                mSimpleLog.log("AudioPolicy ctor, loadTimeMs:%f", loadTimeMs);
+            } else {
+                // (2) Previous failure caught due to TimeCheck.  We know how long restart takes.
+                const float restartMs =
+                        std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(
+                                mAudioFlingerCtorTime - mStopTime).count();
+                ALOGW("%s: (key=%s) AudioPolicy ctor, "
+                        "restarts:%lld restartMs:%f loadTimeMs:%f",
+                        __func__, item->getKey().c_str(),
+                        (long long)restarts, restartMs, loadTimeMs);
+                mSimpleLog.log("AudioPolicy ctor restarts:%lld restartMs:%f loadTimeMs:%f",
+                        (long long)restarts, restartMs, loadTimeMs);
+            }
+        } else {
+            // (3) Previous failure is NOT due to TimeCheck, so we don't know the restart time.
+            // However we can estimate the uptime from the delta time from previous ctor.
+            const float uptimeMs =
+                    std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(
+                            nowTime - mAudioFlingerCtorTime).count();
+            mStopCount = mStartCount;
+            mAudioPolicyCtorTime = nowTime;
+            ++mStartCount;
+
+            ALOGW("%s: (key=%s) AudioPolicy ctor after uncaught failure, "
+                    "mStartCount:%lld mStopCount:%lld uptimeMs:%f loadTimeMs:%f",
+                    __func__, item->getKey().c_str(),
+                    (long long)mStartCount, (long long)mStopCount, uptimeMs, loadTimeMs);
+            mSimpleLog.log("AudioPolicy ctor after uncaught failure, "
+                    "restarts:%lld uptimeMs:%f loadTimeMs:%f",
+                    (long long)restarts, uptimeMs, loadTimeMs);
+        }
+    }
+}
+
+void AudioAnalytics::Health::onAudioServerTimeout(Module module,
+        const std::shared_ptr<const android::mediametrics::Item> &item)
+{
+    std::string moduleName = getModuleName(module);
+    int64_t methodCode{};
+    std::string methodName;
+    (void)item->get(AMEDIAMETRICS_PROP_METHODCODE, &methodCode);
+    (void)item->get(AMEDIAMETRICS_PROP_METHODNAME, &methodName);
+
+    std::lock_guard lg(mLock);
+    if (mStopCount >= mStartCount) {
+        ALOGD("%s: (key=%s) %s timeout %s(%lld) "
+            "unmatched mStopCount(%lld) >= mStartCount(%lld), ignoring",
+            __func__, item->getKey().c_str(), moduleName.c_str(),
+            methodName.c_str(), (long long)methodCode,
+            (long long)mStopCount, (long long)mStartCount);
+        return;
+    }
+
+    const int64_t restarts = mStartCount - 1;
+    ++mStopCount;
+    mStopTime = std::chrono::system_clock::now();
+    const float uptimeMs = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(
+            mStopTime - mAudioFlingerCtorTime).count();
+    ALOGW("%s: (key=%s) %s timeout %s(%lld) restarts:%lld uptimeMs:%f",
+         __func__, item->getKey().c_str(), moduleName.c_str(),
+         methodName.c_str(), (long long)methodCode,
+         (long long)restarts, uptimeMs);
+    mSimpleLog.log("%s timeout %s(%lld) restarts:%lld uptimeMs:%f",
+            moduleName.c_str(), methodName.c_str(), (long long)methodCode,
+            (long long)restarts, uptimeMs);
+}
+
+std::pair<std::string, int32_t> AudioAnalytics::Health::dump(
+        int32_t lines, const char *prefix) const
+{
+    std::lock_guard lg(mLock);
+    std::string s = mSimpleLog.dumpToString(prefix == nullptr ? "" : prefix, lines);
+    size_t n = std::count(s.begin(), s.end(), '\n');
+    return { s, n };
+}
+
+
 } // namespace android::mediametrics
diff --git a/services/mediametrics/MediaMetricsService.cpp b/services/mediametrics/MediaMetricsService.cpp
index 636b343..ff16b9e 100644
--- a/services/mediametrics/MediaMetricsService.cpp
+++ b/services/mediametrics/MediaMetricsService.cpp
@@ -327,6 +327,15 @@
                 result << "-- some lines may be truncated --\n";
             }
 
+            const int32_t healthLinesToDump = all ? INT32_MAX : 15;
+            result << "\nHealth Message Log:";
+            const auto [ healthDumpString, healthLines ] =
+                    mAudioAnalytics.dumpHealth(healthLinesToDump);
+            result << "\n" << healthDumpString;
+            if (healthLines == healthLinesToDump) {
+                result << "-- some lines may be truncated --\n";
+            }
+
             result << "\nLogSessionId:\n"
                    << mediametrics::ValidateId::get()->dump();
 
diff --git a/services/mediametrics/include/mediametricsservice/AudioAnalytics.h b/services/mediametrics/include/mediametricsservice/AudioAnalytics.h
index a44fcc1..5ee8c30 100644
--- a/services/mediametrics/include/mediametricsservice/AudioAnalytics.h
+++ b/services/mediametrics/include/mediametricsservice/AudioAnalytics.h
@@ -83,6 +83,15 @@
         return mHeatMap.dump(lines);
     }
 
+    /**
+     * Returns a pair consisting of the dump string and the number of lines in the string.
+     *
+     * Health dump.
+     */
+    std::pair<std::string, int32_t> dumpHealth(int32_t lines = INT32_MAX) const {
+        return mHealth.dump(lines);
+    }
+
     void clear() {
         // underlying state is locked.
         mPreviousAnalyticsState->clear();
@@ -247,6 +256,67 @@
         AudioAnalytics &mAudioAnalytics;
     } mAAudioStreamInfo{*this};
 
+    // Create new state, typically occurs after an AudioFlinger ctor event.
+    void newState();
+
+    // Health is a nested class that tracks audioserver health properties
+    class Health {
+    public:
+        explicit Health(AudioAnalytics &audioAnalytics)
+            : mAudioAnalytics(audioAnalytics) {}
+
+        enum class Module {
+            AUDIOFLINGER,
+            AUDIOPOLICY,
+        };
+
+        const char *getModuleName(Module module) {
+            switch (module) {
+                case Module::AUDIOFLINGER: return "AudioFlinger";
+                case Module::AUDIOPOLICY: return "AudioPolicy";
+            }
+            return "Unknown";
+        }
+
+        // Called when we believe audioserver starts (AudioFlinger ctor)
+        void onAudioServerStart(Module module,
+                const std::shared_ptr<const android::mediametrics::Item> &item);
+
+        // Called when we believe audioserver crashes (TimeCheck timeouts).
+        void onAudioServerTimeout(Module module,
+                const std::shared_ptr<const android::mediametrics::Item> &item);
+
+        std::pair<std::string, int32_t> dump(
+                int32_t lines = INT32_MAX, const char *prefix = nullptr) const;
+
+    private:
+        AudioAnalytics& mAudioAnalytics;
+
+        mutable std::mutex mLock;
+
+        // Life cycle of AudioServer
+        // mAudioFlingerCtorTime
+        // mAudioPolicyCtorTime
+        // mAudioPolicyCtorDoneTime
+        // ...
+        // possibly mStopTime  (if TimeCheck thread)
+        //
+        // UpTime is measured from mStopTime - mAudioFlingerCtorTime.
+        //
+        // The stop events come from TimeCheck timeout aborts.  There may be other
+        // uncaught signals, e.g. SIGSEGV, that cause missing stop events.
+        std::chrono::system_clock::time_point mAudioFlingerCtorTime GUARDED_BY(mLock);
+        std::chrono::system_clock::time_point mAudioPolicyCtorTime GUARDED_BY(mLock);
+        std::chrono::system_clock::time_point mAudioPolicyCtorDoneTime GUARDED_BY(mLock);
+        std::chrono::system_clock::time_point mStopTime GUARDED_BY(mLock);
+
+        // mStartCount and mStopCount track the audioserver start and stop events.
+        int64_t mStartCount GUARDED_BY(mLock) = 0;
+        int64_t mStopCount GUARDED_BY(mLock) = 0;
+
+        SimpleLog mSimpleLog GUARDED_BY(mLock) {64};
+    } mHealth{*this};
+
     AudioPowerUsage mAudioPowerUsage;
 };