[sf] write transaction trace to disk on error

Flush always on transaction traces to disk when we detect
errors like: - out of order buffer updates or cycles in
the layer hierarchy so it easier to investigate issues
from dogfood (ref bug b/272189296)

Bug: 282110579, 238781169
Fixes: 277265947
Test: presubmit

Change-Id: I22dbd7cd43d94763ac941f0c3871dc7e4ea8db8d
diff --git a/services/surfaceflinger/FrontEnd/LayerSnapshotBuilder.cpp b/services/surfaceflinger/FrontEnd/LayerSnapshotBuilder.cpp
index 96ff70c..7213ffa 100644
--- a/services/surfaceflinger/FrontEnd/LayerSnapshotBuilder.cpp
+++ b/services/surfaceflinger/FrontEnd/LayerSnapshotBuilder.cpp
@@ -31,6 +31,7 @@
 #include "LayerLog.h"
 #include "LayerSnapshotBuilder.h"
 #include "TimeStats/TimeStats.h"
+#include "Tracing/TransactionTracing.h"
 
 namespace android::surfaceflinger::frontend {
 
@@ -435,13 +436,13 @@
         // multiple children.
         LayerHierarchy::ScopedAddToTraversalPath addChildToPath(root, args.root.getLayer()->id,
                                                                 LayerHierarchy::Variant::Attached);
-        updateSnapshotsInHierarchy(args, args.root, root, mRootSnapshot);
+        updateSnapshotsInHierarchy(args, args.root, root, mRootSnapshot, /*depth=*/0);
     } else {
         for (auto& [childHierarchy, variant] : args.root.mChildren) {
             LayerHierarchy::ScopedAddToTraversalPath addChildToPath(root,
                                                                     childHierarchy->getLayer()->id,
                                                                     variant);
-            updateSnapshotsInHierarchy(args, *childHierarchy, root, mRootSnapshot);
+            updateSnapshotsInHierarchy(args, *childHierarchy, root, mRootSnapshot, /*depth=*/0);
         }
     }
 
@@ -494,7 +495,15 @@
 
 const LayerSnapshot& LayerSnapshotBuilder::updateSnapshotsInHierarchy(
         const Args& args, const LayerHierarchy& hierarchy,
-        LayerHierarchy::TraversalPath& traversalPath, const LayerSnapshot& parentSnapshot) {
+        LayerHierarchy::TraversalPath& traversalPath, const LayerSnapshot& parentSnapshot,
+        int depth) {
+    if (depth > 50) {
+        TransactionTraceWriter::getInstance().invoke("layer_builder_stack_overflow_",
+                                                     /*overwrite=*/false);
+        LOG_ALWAYS_FATAL("Cycle detected in LayerSnapshotBuilder. See "
+                         "builder_stack_overflow_transactions.winscope");
+    }
+
     const RequestedLayerState* layer = hierarchy.getLayer();
     LayerSnapshot* snapshot = getSnapshot(traversalPath);
     const bool newSnapshot = snapshot == nullptr;
@@ -517,7 +526,8 @@
                                                                 childHierarchy->getLayer()->id,
                                                                 variant);
         const LayerSnapshot& childSnapshot =
-                updateSnapshotsInHierarchy(args, *childHierarchy, traversalPath, *snapshot);
+                updateSnapshotsInHierarchy(args, *childHierarchy, traversalPath, *snapshot,
+                                           depth + 1);
         updateChildState(*snapshot, childSnapshot, args);
     }
 
diff --git a/services/surfaceflinger/FrontEnd/LayerSnapshotBuilder.h b/services/surfaceflinger/FrontEnd/LayerSnapshotBuilder.h
index 3f33ab8..2e46dc6 100644
--- a/services/surfaceflinger/FrontEnd/LayerSnapshotBuilder.h
+++ b/services/surfaceflinger/FrontEnd/LayerSnapshotBuilder.h
@@ -96,7 +96,7 @@
 
     const LayerSnapshot& updateSnapshotsInHierarchy(const Args&, const LayerHierarchy& hierarchy,
                                                     LayerHierarchy::TraversalPath& traversalPath,
-                                                    const LayerSnapshot& parentSnapshot);
+                                                    const LayerSnapshot& parentSnapshot, int depth);
     void updateSnapshot(LayerSnapshot&, const Args&, const RequestedLayerState&,
                         const LayerSnapshot& parentSnapshot, const LayerHierarchy::TraversalPath&);
     static void updateRelativeState(LayerSnapshot& snapshot, const LayerSnapshot& parentSnapshot,
diff --git a/services/surfaceflinger/FrontEnd/RequestedLayerState.cpp b/services/surfaceflinger/FrontEnd/RequestedLayerState.cpp
index 23bb54c..bde2d05 100644
--- a/services/surfaceflinger/FrontEnd/RequestedLayerState.cpp
+++ b/services/surfaceflinger/FrontEnd/RequestedLayerState.cpp
@@ -131,6 +131,7 @@
     const uint32_t oldFlags = flags;
     const half oldAlpha = color.a;
     const bool hadBuffer = externalTexture != nullptr;
+    uint64_t oldFramenumber = hadBuffer ? bufferData->frameNumber : 0;
     const bool hadSideStream = sidebandStream != nullptr;
     const layer_state_t& clientState = resolvedComposerState.state;
     const bool hadBlur = hasBlur();
@@ -147,12 +148,9 @@
             changes |= RequestedLayerState::Changes::Geometry;
         }
     }
+
     if (clientState.what & layer_state_t::eBufferChanged) {
         externalTexture = resolvedComposerState.externalTexture;
-        barrierProducerId = std::max(bufferData->producerId, barrierProducerId);
-        barrierFrameNumber = std::max(bufferData->frameNumber, barrierFrameNumber);
-        // TODO(b/277265947) log and flush transaction trace when we detect out of order updates
-
         const bool hasBuffer = externalTexture != nullptr;
         if (hasBuffer || hasBuffer != hadBuffer) {
             changes |= RequestedLayerState::Changes::Buffer;
@@ -163,6 +161,28 @@
                     RequestedLayerState::Changes::VisibleRegion |
                     RequestedLayerState::Changes::Visibility | RequestedLayerState::Changes::Input;
         }
+
+        if (hasBuffer) {
+            const bool frameNumberChanged =
+                    bufferData->flags.test(BufferData::BufferDataChange::frameNumberChanged);
+            const uint64_t frameNumber =
+                    frameNumberChanged ? bufferData->frameNumber : oldFramenumber + 1;
+            bufferData->frameNumber = frameNumber;
+
+            if ((barrierProducerId > bufferData->producerId) ||
+                ((barrierProducerId == bufferData->producerId) &&
+                 (barrierFrameNumber > bufferData->frameNumber))) {
+                ALOGE("Out of order buffers detected for %s producedId=%d frameNumber=%" PRIu64
+                      " -> producedId=%d frameNumber=%" PRIu64,
+                      getDebugString().c_str(), bufferData->producerId, bufferData->frameNumber,
+                      bufferData->producerId, frameNumber);
+                TransactionTraceWriter::getInstance().invoke("out_of_order_buffers_",
+                                                             /*overwrite=*/false);
+            }
+
+            barrierProducerId = std::max(bufferData->producerId, barrierProducerId);
+            barrierFrameNumber = std::max(bufferData->frameNumber, barrierFrameNumber);
+        }
     }
 
     if (clientState.what & layer_state_t::eSidebandStreamChanged) {
diff --git a/services/surfaceflinger/Layer.cpp b/services/surfaceflinger/Layer.cpp
index f627501..fabcd61 100644
--- a/services/surfaceflinger/Layer.cpp
+++ b/services/surfaceflinger/Layer.cpp
@@ -3125,6 +3125,16 @@
         return true;
     }
 
+    if ((mDrawingState.producerId > bufferData.producerId) ||
+        ((mDrawingState.producerId == bufferData.producerId) &&
+         (mDrawingState.frameNumber > frameNumber))) {
+        ALOGE("Out of order buffers detected for %s producedId=%d frameNumber=%" PRIu64
+              " -> producedId=%d frameNumber=%" PRIu64,
+              getDebugName(), mDrawingState.producerId, mDrawingState.frameNumber,
+              bufferData.producerId, frameNumber);
+        TransactionTraceWriter::getInstance().invoke("out_of_order_buffers_", /*overwrite=*/false);
+    }
+
     mDrawingState.producerId = bufferData.producerId;
     mDrawingState.barrierProducerId =
             std::max(mDrawingState.producerId, mDrawingState.barrierProducerId);
@@ -3132,7 +3142,6 @@
     mDrawingState.barrierFrameNumber =
             std::max(mDrawingState.frameNumber, mDrawingState.barrierFrameNumber);
 
-    // TODO(b/277265947) log and flush transaction trace when we detect out of order updates
     mDrawingState.releaseBufferListener = bufferData.releaseBufferListener;
     mDrawingState.buffer = std::move(buffer);
     mDrawingState.acquireFence = bufferData.flags.test(BufferData::BufferDataChange::fenceChanged)
diff --git a/services/surfaceflinger/SurfaceFlinger.cpp b/services/surfaceflinger/SurfaceFlinger.cpp
index f0ccc14..e0ffb6b 100644
--- a/services/surfaceflinger/SurfaceFlinger.cpp
+++ b/services/surfaceflinger/SurfaceFlinger.cpp
@@ -909,6 +909,29 @@
         ALOGE("Run StartPropertySetThread failed!");
     }
 
+    if (mTransactionTracing) {
+        TransactionTraceWriter::getInstance().setWriterFunction([&](const std::string& prefix,
+                                                                    bool overwrite) {
+            auto writeFn = [&]() {
+                const std::string filename =
+                        TransactionTracing::DIR_NAME + prefix + TransactionTracing::FILE_NAME;
+                if (overwrite) {
+                    std::ifstream file(filename);
+                    if (file.is_open()) {
+                        return;
+                    }
+                }
+                mTransactionTracing->flush();
+                mTransactionTracing->writeToFile(filename);
+            };
+            if (std::this_thread::get_id() == mMainThreadId) {
+                writeFn();
+            } else {
+                mScheduler->schedule(writeFn).get();
+            }
+        });
+    }
+
     ALOGV("Done initializing");
 }
 
@@ -6728,7 +6751,7 @@
                         mTransactionTracing->setBufferSize(
                                 TransactionTracing::ACTIVE_TRACING_BUFFER_SIZE);
                     } else {
-                        mTransactionTracing->writeToFile();
+                        TransactionTraceWriter::getInstance().invoke("", /* overwrite= */ true);
                         mTransactionTracing->setBufferSize(
                                 TransactionTracing::CONTINUOUS_TRACING_BUFFER_SIZE);
                     }
diff --git a/services/surfaceflinger/Tracing/TransactionTracing.cpp b/services/surfaceflinger/Tracing/TransactionTracing.cpp
index 632de01..7e330b9 100644
--- a/services/surfaceflinger/Tracing/TransactionTracing.cpp
+++ b/services/surfaceflinger/Tracing/TransactionTracing.cpp
@@ -28,6 +28,7 @@
 #include "TransactionTracing.h"
 
 namespace android {
+ANDROID_SINGLETON_STATIC_INSTANCE(android::TransactionTraceWriter)
 
 TransactionTracing::TransactionTracing()
       : mProtoParser(std::make_unique<TransactionProtoParser::FlingerDataMapper>()) {
@@ -56,7 +57,7 @@
     writeToFile();
 }
 
-status_t TransactionTracing::writeToFile(std::string filename) {
+status_t TransactionTracing::writeToFile(const std::string& filename) {
     std::scoped_lock lock(mTraceLock);
     proto::TransactionTraceFile fileProto = createTraceFileProto();
     addStartingStateToProtoLocked(fileProto);
@@ -115,6 +116,7 @@
     }
     mPendingUpdates.emplace_back(update);
     tryPushToTracingThread();
+    mLastUpdatedVsyncId = vsyncId;
 }
 
 void TransactionTracing::loop() {
@@ -218,19 +220,29 @@
     mTransactionsAddedToBufferCv.notify_one();
 }
 
-void TransactionTracing::flush(int64_t vsyncId) {
-    while (!mPendingUpdates.empty() || !mPendingDestroyedLayers.empty()) {
-        tryPushToTracingThread();
+void TransactionTracing::flush() {
+    {
+        std::scoped_lock lock(mMainThreadLock);
+        // Collect any pending transactions and wait for transactions to be added to
+        mUpdates.insert(mUpdates.end(), std::make_move_iterator(mPendingUpdates.begin()),
+                        std::make_move_iterator(mPendingUpdates.end()));
+        mPendingUpdates.clear();
+        mDestroyedLayers.insert(mDestroyedLayers.end(), mPendingDestroyedLayers.begin(),
+                                mPendingDestroyedLayers.end());
+        mPendingDestroyedLayers.clear();
+        mTransactionsAvailableCv.notify_one();
     }
     std::unique_lock<std::mutex> lock(mTraceLock);
     base::ScopedLockAssertion assumeLocked(mTraceLock);
-    mTransactionsAddedToBufferCv.wait(lock, [&]() REQUIRES(mTraceLock) {
-        proto::TransactionTraceEntry entry;
-        if (mBuffer.used() > 0) {
-            entry.ParseFromString(mBuffer.back());
-        }
-        return mBuffer.used() > 0 && entry.vsync_id() >= vsyncId;
-    });
+    mTransactionsAddedToBufferCv.wait_for(lock, std::chrono::milliseconds(100),
+                                          [&]() REQUIRES(mTraceLock) {
+                                              proto::TransactionTraceEntry entry;
+                                              if (mBuffer.used() > 0) {
+                                                  entry.ParseFromString(mBuffer.back());
+                                              }
+                                              return mBuffer.used() > 0 &&
+                                                      entry.vsync_id() >= mLastUpdatedVsyncId;
+                                          });
 }
 
 void TransactionTracing::onLayerRemoved(int32_t layerId) {
diff --git a/services/surfaceflinger/Tracing/TransactionTracing.h b/services/surfaceflinger/Tracing/TransactionTracing.h
index 0e56627..fa006fc 100644
--- a/services/surfaceflinger/Tracing/TransactionTracing.h
+++ b/services/surfaceflinger/Tracing/TransactionTracing.h
@@ -19,6 +19,7 @@
 #include <android-base/thread_annotations.h>
 #include <layerproto/TransactionProto.h>
 #include <utils/Errors.h>
+#include <utils/Singleton.h>
 #include <utils/Timers.h>
 
 #include <memory>
@@ -60,10 +61,12 @@
     void addQueuedTransaction(const TransactionState&);
     void addCommittedTransactions(int64_t vsyncId, nsecs_t commitTime, frontend::Update& update,
                                   const frontend::DisplayInfos&, bool displayInfoChanged);
-    status_t writeToFile(std::string filename = FILE_NAME);
+    status_t writeToFile(const std::string& filename = FILE_PATH);
     void setBufferSize(size_t bufferSizeInBytes);
     void onLayerRemoved(int layerId);
     void dump(std::string&) const;
+    // Wait until all the committed transactions for the specified vsync id are added to the buffer.
+    void flush() EXCLUDES(mMainThreadLock);
     static constexpr auto CONTINUOUS_TRACING_BUFFER_SIZE = 512 * 1024;
     static constexpr auto ACTIVE_TRACING_BUFFER_SIZE = 100 * 1024 * 1024;
     // version 1 - switching to support new frontend
@@ -73,7 +76,9 @@
     friend class TransactionTracingTest;
     friend class SurfaceFlinger;
 
-    static constexpr auto FILE_NAME = "/data/misc/wmtrace/transactions_trace.winscope";
+    static constexpr auto DIR_NAME = "/data/misc/wmtrace/";
+    static constexpr auto FILE_NAME = "/transactions_trace.winscope";
+    static constexpr auto FILE_PATH = "/data/misc/wmtrace/transactions_trace.winscope";
 
     mutable std::mutex mTraceLock;
     RingBuffer<proto::TransactionTraceFile, proto::TransactionTraceEntry> mBuffer
@@ -111,6 +116,7 @@
 
     std::vector<uint32_t /* layerId */> mDestroyedLayers GUARDED_BY(mMainThreadLock);
     std::vector<uint32_t /* layerId */> mPendingDestroyedLayers; // only accessed by main thread
+    int64_t mLastUpdatedVsyncId = -1;
 
     proto::TransactionTraceFile createTraceFileProto() const;
     void loop();
@@ -121,10 +127,21 @@
     void addStartingStateToProtoLocked(proto::TransactionTraceFile& proto) REQUIRES(mTraceLock);
     void updateStartingStateLocked(const proto::TransactionTraceEntry& entry) REQUIRES(mTraceLock);
     // TEST
-    // Wait until all the committed transactions for the specified vsync id are added to the buffer.
-    void flush(int64_t vsyncId) EXCLUDES(mMainThreadLock);
     // Return buffer contents as trace file proto
     proto::TransactionTraceFile writeToProto() EXCLUDES(mMainThreadLock);
 };
 
+class TransactionTraceWriter : public Singleton<TransactionTraceWriter> {
+    friend class Singleton<TransactionTracing>;
+    std::function<void(const std::string& prefix, bool overwrite)> mWriterFunction =
+            [](const std::string&, bool) {};
+
+public:
+    void setWriterFunction(
+            std::function<void(const std::string& prefix, bool overwrite)> function) {
+        mWriterFunction = std::move(function);
+    }
+    void invoke(const std::string& prefix, bool overwrite) { mWriterFunction(prefix, overwrite); }
+};
+
 } // namespace android
diff --git a/services/surfaceflinger/tests/unittests/TransactionTracingTest.cpp b/services/surfaceflinger/tests/unittests/TransactionTracingTest.cpp
index 92411a7..809966f 100644
--- a/services/surfaceflinger/tests/unittests/TransactionTracingTest.cpp
+++ b/services/surfaceflinger/tests/unittests/TransactionTracingTest.cpp
@@ -37,7 +37,7 @@
     static constexpr size_t SMALL_BUFFER_SIZE = 1024;
     TransactionTracing mTracing;
 
-    void flush(int64_t vsyncId) { mTracing.flush(vsyncId); }
+    void flush() { mTracing.flush(); }
     proto::TransactionTraceFile writeToProto() { return mTracing.writeToProto(); }
 
     proto::TransactionTraceEntry bufferFront() {
@@ -57,7 +57,7 @@
         std::vector<TransactionState> transactions;
         update.transactions.emplace_back(transaction);
         mTracing.addCommittedTransactions(vsyncId, 0, update, {}, false);
-        flush(vsyncId);
+        flush();
     }
 
     void verifyEntry(const proto::TransactionTraceEntry& actualProto,
@@ -116,7 +116,7 @@
     secondUpdate.transactions =
             std::vector<TransactionState>(transactions.begin(), transactions.begin() + 50);
     mTracing.addCommittedTransactions(secondTransactionSetVsyncId, 0, secondUpdate, {}, false);
-    flush(secondTransactionSetVsyncId);
+    flush();
 
     proto::TransactionTraceFile proto = writeToProto();
     ASSERT_EQ(proto.entry().size(), 2);
@@ -158,7 +158,7 @@
             VSYNC_ID_FIRST_LAYER_CHANGE = ++mVsyncId;
             mTracing.addCommittedTransactions(VSYNC_ID_FIRST_LAYER_CHANGE, 0, update, {}, false);
 
-            flush(VSYNC_ID_FIRST_LAYER_CHANGE);
+            flush();
         }
 
         // add transactions that modify the layer state further so we can test that layer state
@@ -178,7 +178,7 @@
             update.transactions.emplace_back(transaction);
             VSYNC_ID_SECOND_LAYER_CHANGE = ++mVsyncId;
             mTracing.addCommittedTransactions(VSYNC_ID_SECOND_LAYER_CHANGE, 0, update, {}, false);
-            flush(VSYNC_ID_SECOND_LAYER_CHANGE);
+            flush();
         }
 
         // remove child layer
@@ -290,7 +290,7 @@
 
             update.transactions.emplace_back(transaction);
             mTracing.addCommittedTransactions(mVsyncId, 0, update, {}, false);
-            flush(mVsyncId);
+            flush();
         }
     }