Offload present to a separate thread

Make Output::present() return an ftl::Future. When offloading its
HWC call, present will return a future that can be waited upon.

In CompositionEngine::present, store a vector of futures and wait
upon the results before returning. This allows HWC's present to run in
parallel with other work; in particular, present on other displays.
Waiting here ensures that post-composition work does not start until
present has completed. Future work may defer this even later.

Reuse HwcAsyncWorker to run present on a separate thread. Add a new
variable for determining whether to run validate in parallel, since the
presence of the HwcAsyncWorker could just mean that we are offloading
present.

Read the new DisplayCapability to determine whether a display can be
offloaded to a worker thread. Only run displays in parallel if they all
have the DisplayCapability. Non-HWC-enabled displays, which lack the
capability, do not prevent other displays from being offloaded. They
also run last, since they can run in parallel with HWC-enabled displays.
(The ordering is now set by SurfaceFlinger, which places physical
displays at the start of the list of outputs.)

When telling a display to offload its present call, make it only last
for a single frame. This simplifies the code while ensuring we do not
leave it enabled unnecessarily.

Leave a single present call on the main thread. This saves a thread-hop,
while still allowing it to run in parallel with other HWC work.

Only attempt to offload present if the appropriate trunk stable flag (or
debug sysprop, "debug.sf.multithreaded_present") is set.

Bug: 241285491
Bug: 259132483
Test: manual: perfetto trace
Test: libcompositionengine_test
Change-Id: Ib9d074671e32c95875ef7e0791dd95d6e595e47a
diff --git a/services/surfaceflinger/CompositionEngine/src/CompositionEngine.cpp b/services/surfaceflinger/CompositionEngine/src/CompositionEngine.cpp
index 15fadbc..9041964 100644
--- a/services/surfaceflinger/CompositionEngine/src/CompositionEngine.cpp
+++ b/services/surfaceflinger/CompositionEngine/src/CompositionEngine.cpp
@@ -20,6 +20,7 @@
 #include <compositionengine/OutputLayer.h>
 #include <compositionengine/impl/CompositionEngine.h>
 #include <compositionengine/impl/Display.h>
+#include <ui/DisplayMap.h>
 
 #include <renderengine/RenderEngine.h>
 #include <utils/Trace.h>
@@ -88,6 +89,33 @@
     return mRefreshStartTime;
 }
 
+namespace {
+// Counts the HWC-enabled outputs that support offloading present. Yields 0 if
+// the multithreaded_present flag is off, fewer than two outputs are being
+// refreshed, or any HWC-enabled output lacks support.
+int numDisplaysWithOffloadPresentSupport(const CompositionRefreshArgs& args) {
+    if (!FlagManager::getInstance().multithreaded_present() || args.outputs.size() < 2) {
+        return 0;
+    }
+
+    int offloadableCount = 0;
+    for (const auto& output : args.outputs) {
+        const auto halId = ftl::Optional(output->getDisplayId()).and_then(HalDisplayId::tryCast);
+        if (!halId) {
+            // Not HWC-enabled, so it is always client-composited and never
+            // vetoes offloading for the other outputs.
+            continue;
+        }
+        if (!output->supportsOffloadPresent()) {
+            // All-or-nothing: one unsupported HWC display disables offloading.
+            return 0;
+        }
+        ++offloadableCount;
+    }
+    return offloadableCount;
+}
+} // namespace
+
 void CompositionEngine::present(CompositionRefreshArgs& args) {
     ATRACE_CALL();
     ALOGV(__FUNCTION__);
@@ -105,8 +133,36 @@
         }
     }
 
+    // Offloading the HWC call for `present` allows us to simultaneously call it
+    // on multiple displays. This is desirable because these calls block and can
+    // be slow.
+    if (const int numEligibleDisplays = numDisplaysWithOffloadPresentSupport(args);
+        numEligibleDisplays > 1) {
+        // Leave the last eligible display on the main thread, which will
+        // allow it to run concurrently without an extra thread hop.
+        int numToOffload = numEligibleDisplays - 1;
+        for (auto& output : args.outputs) {
+            if (output->supportsOffloadPresent()) {
+                output->offloadPresentNextFrame();
+                if (--numToOffload == 0) {
+                    break;
+                }
+            }
+        }
+    }
+
+    ui::DisplayVector<ftl::Future<std::monostate>> presentFutures;
     for (const auto& output : args.outputs) {
-        output->present(args);
+        presentFutures.push_back(output->present(args));
+    }
+
+    {
+        ATRACE_NAME("Waiting on HWC");
+        for (auto& future : presentFutures) {
+            // TODO(b/185536303): Call ftl::Future::wait() once it exists, since
+            // we do not need the return value of get().
+            future.get();
+        }
     }
 }
 
diff --git a/services/surfaceflinger/CompositionEngine/src/Display.cpp b/services/surfaceflinger/CompositionEngine/src/Display.cpp
index 469fb38..0475881 100644
--- a/services/surfaceflinger/CompositionEngine/src/Display.cpp
+++ b/services/surfaceflinger/CompositionEngine/src/Display.cpp
@@ -430,4 +430,13 @@
     impl::Output::finishFrame(std::move(result));
 }
 
+bool Display::supportsOffloadPresent() const {
+    // Only a display whose ID casts to a HalDisplayId has an HWC capability to query.
+    const auto halDisplayId = HalDisplayId::tryCast(mId);
+    if (!halDisplayId) return false;
+
+    const auto& composer = getCompositionEngine().getHwComposer();
+    return composer.hasDisplayCapability(*halDisplayId, DisplayCapability::MULTI_THREADED_PRESENT);
+}
+
 } // namespace android::compositionengine::impl
diff --git a/services/surfaceflinger/CompositionEngine/src/Output.cpp b/services/surfaceflinger/CompositionEngine/src/Output.cpp
index 2ae80de..e4d7578 100644
--- a/services/surfaceflinger/CompositionEngine/src/Output.cpp
+++ b/services/surfaceflinger/CompositionEngine/src/Output.cpp
@@ -427,7 +427,8 @@
     uncacheBuffers(refreshArgs.bufferIdsToUncache);
 }
 
-void Output::present(const compositionengine::CompositionRefreshArgs& refreshArgs) {
+ftl::Future<std::monostate> Output::present(
+        const compositionengine::CompositionRefreshArgs& refreshArgs) {
     ATRACE_FORMAT("%s for %s", __func__, mNamePlusId.c_str());
     ALOGV(__FUNCTION__);
 
@@ -448,8 +449,26 @@
 
     devOptRepaintFlash(refreshArgs);
     finishFrame(std::move(result));
-    presentFrameAndReleaseLayers();
+    ftl::Future<std::monostate> future;
+    if (mOffloadPresent) {
+        future = presentFrameAndReleaseLayersAsync();
+
+        // Only offload for this frame. The next frame will determine whether it
+        // needs to be offloaded. Leave the HwcAsyncWorker in place. For one thing,
+        // it is currently presenting. Further, it may be needed next frame, and
+        // we don't want to churn.
+        mOffloadPresent = false;
+    } else {
+        presentFrameAndReleaseLayers();
+        future = ftl::yield<std::monostate>({});
+    }
     renderCachedSets(refreshArgs);
+    return future;
+}
+
+void Output::offloadPresentNextFrame() {
+    mOffloadPresent = true;  // One-shot: present() clears it after offloading a frame.
+    updateHwcAsyncWorker();  // Creates the HwcAsyncWorker if it does not exist yet.
+}
 
 void Output::uncacheBuffers(std::vector<uint64_t> const& bufferIdsToUncache) {
@@ -1084,6 +1103,14 @@
     finishPrepareFrame();
 }
 
+ftl::Future<std::monostate> Output::presentFrameAndReleaseLayersAsync() {
+    // The task returns a placeholder `true` only to fit the bool future;
+    // then() discards it in favor of std::monostate.
+    auto presentTask = [this] { presentFrameAndReleaseLayers(); return true; };
+    ftl::Future<bool> pending(mHwComposerAsyncWorker->send(std::move(presentTask)));
+    return std::move(pending).then([](bool) { return std::monostate{}; });
+}
+
 std::future<bool> Output::chooseCompositionStrategyAsync(
         std::optional<android::HWComposer::DeviceRequestedChanges>* changes) {
     return mHwComposerAsyncWorker->send(
@@ -1600,8 +1627,15 @@
 }
 
 void Output::setPredictCompositionStrategy(bool predict) {
-    if (predict) {
-        mHwComposerAsyncWorker = std::make_unique<HwcAsyncWorker>();
+    mPredictCompositionStrategy = predict;  // Tracked separately from the worker's existence.
+    updateHwcAsyncWorker();  // The worker may also be kept alive for an offloaded present.
+}
+
+void Output::updateHwcAsyncWorker() {
+    if (mPredictCompositionStrategy || mOffloadPresent) {
+        if (!mHwComposerAsyncWorker) {
+            mHwComposerAsyncWorker = std::make_unique<HwcAsyncWorker>();
+        }
     } else {
         mHwComposerAsyncWorker.reset(nullptr);
     }
@@ -1616,7 +1650,7 @@
     uint64_t outputLayerHash = getState().outputLayerHash;
     editState().lastOutputLayerHash = outputLayerHash;
 
-    if (!getState().isEnabled || !mHwComposerAsyncWorker) {
+    if (!getState().isEnabled || !mPredictCompositionStrategy) {
         ALOGV("canPredictCompositionStrategy disabled");
         return false;
     }