SF: rate-limit luma sampling

Instead of sampling the luma regions every frame, introduce a
rate-limiting system to reduce load. This adds a few tunables to
control the sampling rate, which defaults to 10Hz while there is
content being watched for luma.

Test: manual systrace inspection, using SamplingDemo
Test: libgui_test --gtest_filter="RegionSampling*"
Test: atest CompositionSamplingListenerTest
Fixes: 126747045
Change-Id: I7cae3e90fb405ba72dc2f276a88be48f1533a219
diff --git a/services/surfaceflinger/RegionSamplingThread.cpp b/services/surfaceflinger/RegionSamplingThread.cpp
index bd8548c..4f0b3bb 100644
--- a/services/surfaceflinger/RegionSamplingThread.cpp
+++ b/services/surfaceflinger/RegionSamplingThread.cpp
@@ -21,27 +21,168 @@
 
 #include "RegionSamplingThread.h"
 
+#include <cutils/properties.h>
 #include <gui/IRegionSamplingListener.h>
 #include <utils/Trace.h>
+#include <string>
 
 #include "DisplayDevice.h"
 #include "Layer.h"
 #include "SurfaceFlinger.h"
 
 namespace android {
+using namespace std::chrono_literals;
 
 template <typename T>
 struct SpHash {
     size_t operator()(const sp<T>& p) const { return std::hash<T*>()(p.get()); }
 };
 
-RegionSamplingThread::RegionSamplingThread(SurfaceFlinger& flinger) : mFlinger(flinger) {
-    std::lock_guard threadLock(mThreadMutex);
-    mThread = std::thread([this]() { threadMain(); });
-    pthread_setname_np(mThread.native_handle(), "RegionSamplingThread");
+constexpr auto lumaSamplingStepTag = "LumaSamplingStep"; // name passed to ATRACE_INT
+enum class samplingStep {
+    noWorkNeeded,       // traced after the collected lumas were handed to the listeners
+    idleTimerWaiting,   // traced when doSample() skips: sampling period not yet elapsed
+    waitForZeroPhase,   // traced when checkForStaleLuma() starts the vsync listener
+    waitForSamplePhase, // traced when the vsync callback retargets to the sampling offset
+    sample              // traced when doSample() requests a capture
+};
+
+constexpr auto defaultRegionSamplingOffset = -3ms; // relative to vsync zero phase
+constexpr auto defaultRegionSamplingPeriod = 100ms; // i.e. at most 10Hz
+constexpr auto defaultRegionSamplingTimerTimeout = 100ms; // stale-luma check interval
+// TODO: (b/127403193) duration to string conversion could probably be constexpr
+template <typename Rep, typename Per>
+inline std::string toNsString(std::chrono::duration<Rep, Per> t) {
+    return std::to_string(std::chrono::duration_cast<std::chrono::nanoseconds>(t).count());
 }
 
+RegionSamplingThread::EnvironmentTimingTunables::EnvironmentTimingTunables() {
+    // Loads the timing tunables from the debug.sf.region_sampling_* properties (nanoseconds).
+    // Values are parsed as 64-bit: atoi() is undefined once a value exceeds INT_MAX (~2.1s).
+    auto const readNs = [](const char* key, std::chrono::nanoseconds fallback) {
+        char value[PROPERTY_VALUE_MAX] = {};
+        property_get(key, value, toNsString(fallback).c_str()); // fallback used when unset
+        return std::chrono::nanoseconds(strtoll(value, nullptr, 10));
+    };
+
+    auto const offset = readNs("debug.sf.region_sampling_offset_ns", defaultRegionSamplingOffset);
+    auto const period = readNs("debug.sf.region_sampling_period_ns", defaultRegionSamplingPeriod);
+    auto const timeout = readNs("debug.sf.region_sampling_timer_timeout_ns",
+                                defaultRegionSamplingTimerTimeout);
+
+    // A negative period or timeout is meaningless; the offset may legitimately be negative.
+    if ((period < 0ns) || (timeout < 0ns)) {
+        ALOGW("User-specified sampling tuning options nonsensical. Using defaults");
+        mSamplingOffset = defaultRegionSamplingOffset;
+        mSamplingPeriod = defaultRegionSamplingPeriod;
+        mSamplingTimerTimeout = defaultRegionSamplingTimerTimeout;
+    } else {
+        mSamplingOffset = offset;
+        mSamplingPeriod = period;
+        mSamplingTimerTimeout = timeout;
+    }
+}
+
+struct SamplingOffsetCallback : DispSync::Callback {
+    SamplingOffsetCallback(RegionSamplingThread& samplingThread, Scheduler& scheduler,
+                           std::chrono::nanoseconds targetSamplingOffset)
+          : mRegionSamplingThread(samplingThread),
+            mScheduler(scheduler),
+            mTargetSamplingOffset(targetSamplingOffset) {}
+
+    ~SamplingOffsetCallback() { stopVsyncListener(); } // unregister before going away
+
+    SamplingOffsetCallback(const SamplingOffsetCallback&) = delete;
+    SamplingOffsetCallback& operator=(const SamplingOffsetCallback&) = delete;
+
+    void startVsyncListener() {
+        std::lock_guard lock(mMutex);
+        if (mVsyncListening) return; // already registered with DispSync
+
+        mPhaseIntervalSetting = Phase::ZERO;
+        mScheduler.withPrimaryDispSync([this](android::DispSync& sync) {
+            sync.addEventListener("SamplingThreadDispSyncListener", 0, this);
+        });
+        mVsyncListening = true;
+    }
+
+    void stopVsyncListener() {
+        std::lock_guard lock(mMutex);
+        stopVsyncListenerLocked();
+    }
+
+private:
+    void stopVsyncListenerLocked() /*REQUIRES(mMutex)*/ {
+        if (!mVsyncListening) return; // not registered; nothing to remove
+
+        mScheduler.withPrimaryDispSync(
+                [this](android::DispSync& sync) { sync.removeEventListener(this); });
+        mVsyncListening = false;
+    }
+
+    void onDispSyncEvent(nsecs_t /* when */) final {
+        std::unique_lock<decltype(mMutex)> lock(mMutex);
+
+        if (mPhaseIntervalSetting == Phase::ZERO) { // first event: retarget to sampling offset
+            ATRACE_INT(lumaSamplingStepTag, static_cast<int>(samplingStep::waitForSamplePhase));
+            mPhaseIntervalSetting = Phase::SAMPLING;
+            mScheduler.withPrimaryDispSync([this](android::DispSync& sync) {
+                sync.changePhaseOffset(this, mTargetSamplingOffset.count());
+            });
+            return;
+        }
+
+        if (mPhaseIntervalSetting == Phase::SAMPLING) { // second event: restore, stop, sample
+            mPhaseIntervalSetting = Phase::ZERO;
+            mScheduler.withPrimaryDispSync(
+                    [this](android::DispSync& sync) { sync.changePhaseOffset(this, 0); });
+            stopVsyncListenerLocked();
+            lock.unlock(); // the call below re-enters stopVsyncListener(), which locks mMutex
+            mRegionSamplingThread.notifySamplingOffset();
+            return;
+        }
+    }
+
+    RegionSamplingThread& mRegionSamplingThread;
+    Scheduler& mScheduler;
+    const std::chrono::nanoseconds mTargetSamplingOffset;
+    mutable std::mutex mMutex;
+    enum class Phase {
+        ZERO,    // listening at phase offset 0
+        SAMPLING // listening at mTargetSamplingOffset
+    } mPhaseIntervalSetting /*GUARDED_BY(mMutex) macro doesn't work with unique_lock?*/
+            = Phase::ZERO;
+    bool mVsyncListening /*GUARDED_BY(mMutex)*/ = false;
+};
+
+RegionSamplingThread::RegionSamplingThread(SurfaceFlinger& flinger, Scheduler& scheduler,
+                                           const TimingTunables& tunables)
+      : mFlinger(flinger),
+        mScheduler(scheduler),
+        mTunables(tunables),
+        mIdleTimer(std::chrono::duration_cast<std::chrono::milliseconds>(
+                           mTunables.mSamplingTimerTimeout), // truncated to whole milliseconds
+                   [] {}, [this] { checkForStaleLuma(); }), // presumably (onReset, onTimeout)
+        mPhaseCallback(std::make_unique<SamplingOffsetCallback>(*this, mScheduler,
+                                                                tunables.mSamplingOffset)),
+        lastSampleTime(0ns) {
+    {
+        std::lock_guard threadLock(mThreadMutex);
+        mThread = std::thread([this]() { threadMain(); });
+        pthread_setname_np(mThread.native_handle(), "RegionSamplingThread");
+    }
+    mIdleTimer.start(); // started only once the worker thread has been created
+}
+
+RegionSamplingThread::RegionSamplingThread(SurfaceFlinger& flinger, Scheduler& scheduler)
+      : RegionSamplingThread(flinger, scheduler,
+                             TimingTunables{defaultRegionSamplingOffset,
+                                            defaultRegionSamplingPeriod,
+                                            defaultRegionSamplingTimerTimeout}) {} // defaults
+
 RegionSamplingThread::~RegionSamplingThread() {
+    mIdleTimer.stop();
+
     {
         std::lock_guard lock(mMutex);
         mRunning = false;
@@ -71,8 +212,41 @@
     mDescriptors.erase(wp<IBinder>(IInterface::asBinder(listener)));
 }
 
-void RegionSamplingThread::sampleNow() {
+void RegionSamplingThread::checkForStaleLuma() {
     std::lock_guard lock(mMutex);
+
+    if (mDiscardedFrames) { // doSample() skipped content while rate-limited
+        ATRACE_INT(lumaSamplingStepTag, static_cast<int>(samplingStep::waitForZeroPhase));
+        mDiscardedFrames = false;
+        mPhaseCallback->startVsyncListener(); // ends in notifySamplingOffset()
+    }
+}
+
+void RegionSamplingThread::notifyNewContent() {
+    doSample(); // called by SurfaceFlinger while luma sampling is enabled; may be skipped
+}
+
+void RegionSamplingThread::notifySamplingOffset() {
+    doSample(); // called by SamplingOffsetCallback once the sampling-offset event fires
+}
+
+void RegionSamplingThread::doSample() {
+    std::lock_guard lock(mMutex);
+    auto now = std::chrono::nanoseconds(systemTime(SYSTEM_TIME_MONOTONIC));
+    if (lastSampleTime + mTunables.mSamplingPeriod > now) { // still inside the period: skip
+        ATRACE_INT(lumaSamplingStepTag, static_cast<int>(samplingStep::idleTimerWaiting));
+        mDiscardedFrames = true; // picked up later by checkForStaleLuma()
+        return;
+    }
+
+    ATRACE_INT(lumaSamplingStepTag, static_cast<int>(samplingStep::sample));
+
+    mDiscardedFrames = false;
+    lastSampleTime = now;
+
+    mIdleTimer.reset();
+    mPhaseCallback->stopVsyncListener(); // a pending vsync-aligned sample is now redundant
+
     mSampleRequested = true;
     mCondition.notify_one();
 }
@@ -238,6 +412,7 @@
     for (size_t d = 0; d < activeDescriptors.size(); ++d) {
         activeDescriptors[d].listener->onSampleCollected(lumas[d]);
     }
+    ATRACE_INT(lumaSamplingStepTag, static_cast<int>(samplingStep::noWorkNeeded));
 }
 
 void RegionSamplingThread::threadMain() {
diff --git a/services/surfaceflinger/RegionSamplingThread.h b/services/surfaceflinger/RegionSamplingThread.h
index ab06513..d4e57bf 100644
--- a/services/surfaceflinger/RegionSamplingThread.h
+++ b/services/surfaceflinger/RegionSamplingThread.h
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <chrono>
 #include <condition_variable>
 #include <mutex>
 #include <thread>
@@ -25,17 +26,42 @@
 #include <binder/IBinder.h>
 #include <ui/Rect.h>
 #include <utils/StrongPointer.h>
+#include "Scheduler/IdleTimer.h"
 
 namespace android {
 
 class GraphicBuffer;
 class IRegionSamplingListener;
 class Layer;
+class Scheduler;
 class SurfaceFlinger;
+struct SamplingOffsetCallback;
 
 class RegionSamplingThread : public IBinder::DeathRecipient {
 public:
-    explicit RegionSamplingThread(SurfaceFlinger& flinger);
+    struct TimingTunables {
+        // debug.sf.region_sampling_offset_ns
+        // When asynchronously collecting a sample, the offset from zero phase in the vsync timeline
+        // at which the sampling should start.
+        std::chrono::nanoseconds mSamplingOffset;
+        // debug.sf.region_sampling_period_ns
+        // This is the maximum amount of time the luma-receiving client
+        // should have to wait for a new luma value after a frame is updated. The inverse of this is
+        // roughly the sampling rate. Sampling system rounds up sub-vsync sampling period to vsync
+        // period.
+        std::chrono::nanoseconds mSamplingPeriod;
+        // debug.sf.region_sampling_timer_timeout_ns
+        // This is the interval at which the luma sampling system will check that the luma clients
+        // have up to date information. Its default matches the default mSamplingPeriod.
+        std::chrono::nanoseconds mSamplingTimerTimeout;
+    };
+    struct EnvironmentTimingTunables : TimingTunables {
+        EnvironmentTimingTunables();
+    };
+    explicit RegionSamplingThread(SurfaceFlinger& flinger, Scheduler& scheduler,
+                                  const TimingTunables& tunables);
+    explicit RegionSamplingThread(SurfaceFlinger& flinger, Scheduler& scheduler);
+
     ~RegionSamplingThread();
 
     // Add a listener to receive luma notifications. The luma reported via listener will
@@ -44,8 +70,13 @@
                      const sp<IRegionSamplingListener>& listener);
     // Remove the listener to stop receiving median luma notifications.
     void removeListener(const sp<IRegionSamplingListener>& listener);
-    // Instruct the thread to perform a median luma sampling on the layers.
-    void sampleNow();
+
+    // Notifies sampling engine that new content is available. This will trigger a sampling
+    // pass at some point in the future.
+    void notifyNewContent();
+
+    // Notifies the sampling engine that it has a good timing window in which to sample.
+    void notifySamplingOffset();
 
 private:
     struct Descriptor {
@@ -63,12 +94,19 @@
             const sp<GraphicBuffer>& buffer, const Point& leftTop,
             const std::vector<RegionSamplingThread::Descriptor>& descriptors);
 
+    void doSample();
     void binderDied(const wp<IBinder>& who) override;
+    void checkForStaleLuma();
 
     void captureSample() REQUIRES(mMutex);
     void threadMain();
 
     SurfaceFlinger& mFlinger;
+    Scheduler& mScheduler;
+    const TimingTunables mTunables;
+    scheduler::IdleTimer mIdleTimer;
+
+    std::unique_ptr<SamplingOffsetCallback> const mPhaseCallback;
 
     std::mutex mThreadMutex;
     std::thread mThread GUARDED_BY(mThreadMutex);
@@ -79,6 +117,8 @@
     bool mSampleRequested GUARDED_BY(mMutex) = false;
 
     std::unordered_map<wp<IBinder>, Descriptor, WpHash> mDescriptors GUARDED_BY(mMutex);
+    std::chrono::nanoseconds lastSampleTime GUARDED_BY(mMutex);
+    bool mDiscardedFrames GUARDED_BY(mMutex) = false;
 };
 
 } // namespace android
diff --git a/services/surfaceflinger/Scheduler/Scheduler.cpp b/services/surfaceflinger/Scheduler/Scheduler.cpp
index 0ba6cf9..0063c8a 100644
--- a/services/surfaceflinger/Scheduler/Scheduler.cpp
+++ b/services/surfaceflinger/Scheduler/Scheduler.cpp
@@ -304,6 +304,10 @@
     mApiHistoryCounter = mApiHistoryCounter % scheduler::ARRAY_SIZE;
 }
 
+void Scheduler::withPrimaryDispSync(std::function<void(DispSync&)> const& fn) {
+    fn(*mPrimaryDispSync); // runs fn synchronously; no lock is taken here
+}
+
 void Scheduler::updateFpsBasedOnNativeWindowApi() {
     int mode;
     {
diff --git a/services/surfaceflinger/Scheduler/Scheduler.h b/services/surfaceflinger/Scheduler/Scheduler.h
index 2582c93..73896d5 100644
--- a/services/surfaceflinger/Scheduler/Scheduler.h
+++ b/services/surfaceflinger/Scheduler/Scheduler.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cstdint>
+#include <functional>
 #include <memory>
 
 #include <ui/DisplayStatInfo.h>
@@ -104,6 +105,9 @@
     // Getter methods.
     EventThread* getEventThread(const sp<ConnectionHandle>& handle);
 
+    // Provides access to the DispSync object for the primary display.
+    void withPrimaryDispSync(std::function<void(DispSync&)> const& fn);
+
     sp<EventThreadConnection> getEventConnection(const sp<ConnectionHandle>& handle);
 
     // Should be called when receiving a hotplug event.
diff --git a/services/surfaceflinger/SurfaceFlinger.cpp b/services/surfaceflinger/SurfaceFlinger.cpp
index dba1f8e..08a9eeb 100644
--- a/services/surfaceflinger/SurfaceFlinger.cpp
+++ b/services/surfaceflinger/SurfaceFlinger.cpp
@@ -637,6 +637,10 @@
     mVsyncModulator.setSchedulerAndHandles(mScheduler.get(), mAppConnectionHandle.get(),
                                            mSfConnectionHandle.get());
 
+    mRegionSamplingThread =
+            new RegionSamplingThread(*this, *mScheduler,
+                                     RegionSamplingThread::EnvironmentTimingTunables());
+
     // Get a RenderEngine for the given display / config (can't fail)
     int32_t renderEngineFeature = 0;
     renderEngineFeature |= (useColorManagement ?
@@ -2063,8 +2067,8 @@
     mTransactionCompletedThread.addPresentFence(mPreviousPresentFence);
     mTransactionCompletedThread.sendCallbacks();
 
-    if (mLumaSampling) {
-        mRegionSamplingThread->sampleNow();
+    if (mLumaSampling && mRegionSamplingThread) {
+        mRegionSamplingThread->notifyNewContent();
     }
 }
 
diff --git a/services/surfaceflinger/SurfaceFlinger.h b/services/surfaceflinger/SurfaceFlinger.h
index 8de1e97..0c58de4 100644
--- a/services/surfaceflinger/SurfaceFlinger.h
+++ b/services/surfaceflinger/SurfaceFlinger.h
@@ -1011,9 +1011,6 @@
 
     TransactionCompletedThread mTransactionCompletedThread;
 
-    bool mLumaSampling = true;
-    sp<RegionSamplingThread> mRegionSamplingThread = new RegionSamplingThread(*this);
-
     // Restrict layers to use two buffers in their bufferqueues.
     bool mLayerTripleBufferingDisabled = false;
 
@@ -1139,6 +1136,9 @@
     bool mCheckPendingFence = false;
 
     /* ------------------------------------------------------------------------ */
+    bool mLumaSampling = true;
+    sp<RegionSamplingThread> mRegionSamplingThread;
+
     sp<IInputFlinger> mInputFlinger;
 
     InputWindowCommands mPendingInputWindowCommands GUARDED_BY(mStateLock);