SurfaceFlinger: Report stuck fences to client

It's occasionally observed in traces that the GPU is hung,
resulting in fences failing to fire. This normally propagates
back up as an ANR in dequeueBuffers or onFrameAvailable. The ANR
might then pass from the app team, to the HWUI team, to the WM
team, and then, if we are lucky enough to get a trace, to the
GPU team. In this CL we allow the client process to monitor
this situation itself, and proactively trigger an ANR with
a more useful and informative message than "stuck in
dequeueBuffers".

Bug: 216160569
Test: Existing tests pass
Change-Id: I3ff93bf57d24268ac57734dfed2df185985ffe1f
diff --git a/libs/gui/BLASTBufferQueue.cpp b/libs/gui/BLASTBufferQueue.cpp
index c2793ac..dbccf30 100644
--- a/libs/gui/BLASTBufferQueue.cpp
+++ b/libs/gui/BLASTBufferQueue.cpp
@@ -165,6 +165,17 @@
     mCurrentMaxAcquiredBufferCount = mMaxAcquiredBuffers;
     mNumAcquired = 0;
     mNumFrameAvailable = 0;
+
+    // Keyed by `this` (removed again in ~BLASTBufferQueue); the hang callback runs unlocked.
+    TransactionCompletedListener::getInstance()->addQueueStallListener([this]() {
+        std::function<void(bool)> hangCallback;
+        {
+            std::unique_lock _lock{mMutex};
+            hangCallback = mTransactionHangCallback;
+        }
+        if (hangCallback) hangCallback(true);
+    }, this);
+
     BQA_LOGV("BLASTBufferQueue created");
 }
 
@@ -175,6 +186,7 @@
 }
 
 BLASTBufferQueue::~BLASTBufferQueue() {
+    TransactionCompletedListener::getInstance()->removeQueueStallListener(this);
     if (mPendingTransactions.empty()) {
         return;
     }
@@ -1113,4 +1125,9 @@
     return SurfaceControl::isSameSurface(mSurfaceControl, surfaceControl);
 }
 
+void BLASTBufferQueue::setTransactionHangCallback(std::function<void(bool)> callback) {
+    std::unique_lock _lock{mMutex};  // same mutex the queue-stall lambda holds while copying it
+    mTransactionHangCallback = callback;
+}
+
 } // namespace android
diff --git a/libs/gui/ITransactionCompletedListener.cpp b/libs/gui/ITransactionCompletedListener.cpp
index f7392d4..e4b8bad 100644
--- a/libs/gui/ITransactionCompletedListener.cpp
+++ b/libs/gui/ITransactionCompletedListener.cpp
@@ -29,6 +29,7 @@
 enum class Tag : uint32_t {
     ON_TRANSACTION_COMPLETED = IBinder::FIRST_CALL_TRANSACTION,
     ON_RELEASE_BUFFER,
-    LAST = ON_RELEASE_BUFFER,
+    ON_TRANSACTION_QUEUE_STALLED,
+    LAST = ON_TRANSACTION_QUEUE_STALLED,  // keep equal to the final tag listed above
 };
 
@@ -277,6 +278,11 @@
                                                                   callbackId, releaseFence,
                                                                   currentMaxAcquiredBufferCount);
     }
+
+    void onTransactionQueueStalled() override {
+        callRemoteAsync<decltype(&ITransactionCompletedListener::onTransactionQueueStalled)>(
+            Tag::ON_TRANSACTION_QUEUE_STALLED);  // the tag is the only argument passed
+    }
 };
 
 // Out-of-line virtual method definitions to trigger vtable emission in this translation unit (see
@@ -297,6 +303,9 @@
                                   &ITransactionCompletedListener::onTransactionCompleted);
         case Tag::ON_RELEASE_BUFFER:
             return callLocalAsync(data, reply, &ITransactionCompletedListener::onReleaseBuffer);
+        case Tag::ON_TRANSACTION_QUEUE_STALLED:
+            return callLocalAsync(data, reply,
+                                  &ITransactionCompletedListener::onTransactionQueueStalled);
     }
 }
 
diff --git a/libs/gui/SurfaceComposerClient.cpp b/libs/gui/SurfaceComposerClient.cpp
index 7a63af0..4ce5e4f 100644
--- a/libs/gui/SurfaceComposerClient.cpp
+++ b/libs/gui/SurfaceComposerClient.cpp
@@ -447,6 +447,27 @@
     }
 }
 
+// Notifies every registered queue-stall listener.
+// NOTE(review): a listener removed after the snapshot below is taken can still be invoked.
+void TransactionCompletedListener::onTransactionQueueStalled() {
+    std::unordered_map<void*, std::function<void()>> listeners;
+    {
+        std::scoped_lock<std::mutex> lock(mMutex);
+        listeners = mQueueStallListeners;  // snapshot; callbacks run with mMutex released
+    }
+    for (const auto& entry : listeners) entry.second();
+}
+
+void TransactionCompletedListener::addQueueStallListener(std::function<void()> stallListener,
+                                                         void* id) {
+    std::scoped_lock<std::mutex> lock(mMutex);
+    mQueueStallListeners[id] = stallListener;  // overwrites any listener already keyed by id
+}
+void TransactionCompletedListener::removeQueueStallListener(void* id) {
+    std::scoped_lock<std::mutex> lock(mMutex);
+    mQueueStallListeners.erase(id);  // erasing an id that was never registered is a no-op
+}
+
 void TransactionCompletedListener::onReleaseBuffer(ReleaseCallbackId callbackId,
                                                    sp<Fence> releaseFence,
                                                    uint32_t currentMaxAcquiredBufferCount) {
diff --git a/libs/gui/include/gui/BLASTBufferQueue.h b/libs/gui/include/gui/BLASTBufferQueue.h
index 65fc04d..9328a54 100644
--- a/libs/gui/include/gui/BLASTBufferQueue.h
+++ b/libs/gui/include/gui/BLASTBufferQueue.h
@@ -113,6 +113,14 @@
     uint64_t getLastAcquiredFrameNum();
     void abandon();
 
+    /**
+     * Set a callback to be invoked when we are hung. The boolean parameter
+     * indicates whether the hang is due to an unfired fence.
+     * TODO: The boolean is always true at the moment; an unfired fence is
+     * the only case we detect.
+     */
+    void setTransactionHangCallback(std::function<void(bool)> callback);
+
     virtual ~BLASTBufferQueue();
 
 private:
@@ -269,6 +277,8 @@
     // transaction that will be applied by some sync consumer.
     bool mAppliedLastTransaction = false;
     uint64_t mLastAppliedFrameNumber = 0;
+
+    std::function<void(bool)> mTransactionHangCallback;
 };
 
 } // namespace android
diff --git a/libs/gui/include/gui/ITransactionCompletedListener.h b/libs/gui/include/gui/ITransactionCompletedListener.h
index a791c66..cc136bb 100644
--- a/libs/gui/include/gui/ITransactionCompletedListener.h
+++ b/libs/gui/include/gui/ITransactionCompletedListener.h
@@ -194,6 +194,7 @@
 
     virtual void onReleaseBuffer(ReleaseCallbackId callbackId, sp<Fence> releaseFence,
                                  uint32_t currentMaxAcquiredBufferCount) = 0;
+    virtual void onTransactionQueueStalled() = 0;
 };
 
 class BnTransactionCompletedListener : public SafeBnInterface<ITransactionCompletedListener> {
diff --git a/libs/gui/include/gui/SurfaceComposerClient.h b/libs/gui/include/gui/SurfaceComposerClient.h
index 0cc43d8..efbdb36 100644
--- a/libs/gui/include/gui/SurfaceComposerClient.h
+++ b/libs/gui/include/gui/SurfaceComposerClient.h
@@ -772,6 +772,7 @@
     // This is protected by mSurfaceStatsListenerMutex, but GUARDED_BY isn't supported for
     // std::recursive_mutex
     std::multimap<int32_t, SurfaceStatsCallbackEntry> mSurfaceStatsListeners;
+    std::unordered_map<void*, std::function<void()>> mQueueStallListeners;  // under mMutex
 
 public:
     static sp<TransactionCompletedListener> getInstance();
@@ -789,6 +790,9 @@
             const sp<SurfaceControl>& surfaceControl,
             const std::unordered_set<CallbackId, CallbackIdHash>& callbackIds);
 
+    void addQueueStallListener(std::function<void()> stallListener, void* id);
+    void removeQueueStallListener(void* id);
+
     /*
      * Adds a jank listener to be informed about SurfaceFlinger's jank classification for a specific
      * surface. Jank classifications arrive as part of the transaction callbacks about previous
@@ -817,6 +821,8 @@
     // For Testing Only
     static void setInstance(const sp<TransactionCompletedListener>&);
 
+    void onTransactionQueueStalled() override;
+
 private:
     ReleaseBufferCallback popReleaseBufferCallbackLocked(const ReleaseCallbackId&);
     static sp<TransactionCompletedListener> sInstance;
diff --git a/services/surfaceflinger/SurfaceFlinger.cpp b/services/surfaceflinger/SurfaceFlinger.cpp
index 3bfc2cc..f6f3805 100644
--- a/services/surfaceflinger/SurfaceFlinger.cpp
+++ b/services/surfaceflinger/SurfaceFlinger.cpp
@@ -3697,12 +3697,13 @@
 
             auto& transaction = transactionQueue.front();
             const auto ready =
-                    transactionIsReadyToBeApplied(transaction.frameTimelineInfo,
-                                                  transaction.isAutoTimestamp,
-                                                  transaction.desiredPresentTime,
-                                                  transaction.originUid, transaction.states,
-                                                  bufferLayersReadyToPresent, transactions.size(),
-                                                  tryApplyUnsignaled);
+                transactionIsReadyToBeApplied(transaction,
+                                              transaction.frameTimelineInfo,
+                                              transaction.isAutoTimestamp,
+                                              transaction.desiredPresentTime,
+                                              transaction.originUid, transaction.states,
+                                              bufferLayersReadyToPresent, transactions.size(),
+                                              tryApplyUnsignaled);
             ATRACE_INT("TransactionReadiness", static_cast<int>(ready));
             if (ready == TransactionReadiness::NotReady) {
                 setTransactionFlags(eTransactionFlushNeeded);
@@ -3779,7 +3780,7 @@
                         return TransactionReadiness::NotReady;
                     }
 
-                    return transactionIsReadyToBeApplied(transaction.frameTimelineInfo,
+                    return transactionIsReadyToBeApplied(transaction, transaction.frameTimelineInfo,
                                                          transaction.isAutoTimestamp,
                                                          transaction.desiredPresentTime,
                                                          transaction.originUid, transaction.states,
@@ -3941,7 +3942,7 @@
     return true;
 }
 
-auto SurfaceFlinger::transactionIsReadyToBeApplied(
+auto SurfaceFlinger::transactionIsReadyToBeApplied(TransactionState& transaction,
         const FrameTimelineInfo& info, bool isAutoTimestamp, int64_t desiredPresentTime,
         uid_t originUid, const Vector<ComposerState>& states,
         const std::unordered_map<
@@ -3970,8 +3971,10 @@
     }
 
     bool fenceUnsignaled = false;
+    auto queueProcessTime = systemTime();
     for (const ComposerState& state : states) {
         const layer_state_t& s = state.state;
+
         sp<Layer> layer = nullptr;
         if (s.surface) {
             layer = fromHandle(s.surface).promote();
@@ -4007,6 +4010,15 @@
                  s.bufferData->acquireFence->getStatus() == Fence::Status::Unsignaled);
 
         if (fenceUnsignaled && !allowLatchUnsignaled) {
+            if (!transaction.sentFenceTimeoutWarning &&
+                queueProcessTime - transaction.queueTime > std::chrono::nanoseconds(4s).count()) {
+                transaction.sentFenceTimeoutWarning = true;
+                auto listener = s.bufferData->releaseBufferListener;
+                if (listener) {
+                    listener->onTransactionQueueStalled();
+                }
+            }
+
             ATRACE_NAME("fence unsignaled");
             return TransactionReadiness::NotReady;
         }
@@ -4026,6 +4038,8 @@
 }
 
 void SurfaceFlinger::queueTransaction(TransactionState& state) {
+    state.queueTime = systemTime();
+
     Mutex::Autolock lock(mQueueLock);
 
     // Generate a CountDownLatch pending state if this is a synchronous transaction.
diff --git a/services/surfaceflinger/SurfaceFlinger.h b/services/surfaceflinger/SurfaceFlinger.h
index 011aaef..0b013d4 100644
--- a/services/surfaceflinger/SurfaceFlinger.h
+++ b/services/surfaceflinger/SurfaceFlinger.h
@@ -816,7 +816,7 @@
         Ready,
         ReadyUnsignaled,
     };
-    TransactionReadiness transactionIsReadyToBeApplied(
+    TransactionReadiness transactionIsReadyToBeApplied(TransactionState& state,
             const FrameTimelineInfo& info, bool isAutoTimestamp, int64_t desiredPresentTime,
             uid_t originUid, const Vector<ComposerState>& states,
             const std::unordered_map<
diff --git a/services/surfaceflinger/TransactionState.h b/services/surfaceflinger/TransactionState.h
index bab5326..900d566 100644
--- a/services/surfaceflinger/TransactionState.h
+++ b/services/surfaceflinger/TransactionState.h
@@ -98,6 +98,8 @@
     int originUid;
     uint64_t id;
     std::shared_ptr<CountDownLatch> transactionCommittedSignal;
+    int64_t queueTime = 0;  // systemTime() taken in SurfaceFlinger::queueTransaction
+    bool sentFenceTimeoutWarning = false;  // latched when the 4s unsignaled-fence check trips
 };
 
 class CountDownLatch {