Offload present to a separate thread

Make Output::present() return an ftl::Future. When offloading its
HWC call, present will return a future that can be waited upon.

In CompositionEngine::present, store a vector of futures and wait
upon the results before returning. This allow's HWC's present to run in
parallel with other work; in particular, present on other displays.
Waiting here ensures that post-composition work does not start until
present has completed. Future work may defer this even later.

Reuse HwcAsyncWorker to run present on a separate thread. Add a new
variable for determining whether to run validate in parallel, since the
presence of the HwcAsyncWorker could just mean that we are offloading
present.

Read the new DisplayCapability to determine whether a display can be
offloaded to a worker thread. Only run displays in parallel if they all
have the DisplayCapability. Non HWC-enabled displays without the
capability do not prevent other displays from being offloaded. They also
run last, since they can run in parallel with HWC-enabled displays.
(The ordering is now set by SurfaceFlinger, which places physical
displays at the start of the list of outputs.)

When telling a display to offload its present call, make it only last
for a single frame. This simplifies the code while ensuring we do not
leave it enabled unnecessarily.

Leave a single present call on the main thread. This saves a thread-hop,
while still allowing it to run in parallel with other HWC work.

Only attempt to offload present if the appropriate trunk stable flag (or
debug sysprop, "debug.sf.multithreaded_present") is set.

Bug: 241285491
Bug: 259132483
Test: manual: perfetto trace
Test: libcompositionengine_test
Change-Id: Ib9d074671e32c95875ef7e0791dd95d6e595e47a
diff --git a/services/surfaceflinger/CompositionEngine/Android.bp b/services/surfaceflinger/CompositionEngine/Android.bp
index 2740a97..455e623 100644
--- a/services/surfaceflinger/CompositionEngine/Android.bp
+++ b/services/surfaceflinger/CompositionEngine/Android.bp
@@ -91,6 +91,9 @@
     ],
     local_include_dirs: ["include"],
     export_include_dirs: ["include"],
+    shared_libs: [
+        "server_configurable_flags",
+    ],
 }
 
 cc_library {
@@ -114,6 +117,9 @@
         "libsurfaceflinger_common_test",
         "libsurfaceflingerflags_test",
     ],
+    shared_libs: [
+        "server_configurable_flags",
+    ],
     local_include_dirs: ["include"],
     export_include_dirs: ["include"],
 }
@@ -150,10 +156,11 @@
         "libsurfaceflinger_common_test",
         "libsurfaceflingerflags_test",
     ],
-    // For some reason, libvulkan isn't picked up from librenderengine
-    // Probably ASAN related?
     shared_libs: [
+        // For some reason, libvulkan isn't picked up from librenderengine
+        // Probably ASAN related?
         "libvulkan",
+        "server_configurable_flags",
     ],
     sanitize: {
         hwaddress: true,
diff --git a/services/surfaceflinger/CompositionEngine/include/compositionengine/Output.h b/services/surfaceflinger/CompositionEngine/include/compositionengine/Output.h
index 40ebf44..83abd44 100644
--- a/services/surfaceflinger/CompositionEngine/include/compositionengine/Output.h
+++ b/services/surfaceflinger/CompositionEngine/include/compositionengine/Output.h
@@ -26,6 +26,7 @@
 #include <vector>
 
 #include <compositionengine/LayerFE.h>
+#include <ftl/future.h>
 #include <renderengine/LayerSettings.h>
 #include <ui/Fence.h>
 #include <ui/FenceTime.h>
@@ -263,8 +264,15 @@
     // Prepare the output, updating the OutputLayers used in the output
     virtual void prepare(const CompositionRefreshArgs&, LayerFESet&) = 0;
 
-    // Presents the output, finalizing all composition details
-    virtual void present(const CompositionRefreshArgs&) = 0;
+    // Presents the output, finalizing all composition details. This may happen
+    // asynchronously, in which case the returned future must be waited upon.
+    virtual ftl::Future<std::monostate> present(const CompositionRefreshArgs&) = 0;
+
+    // Whether this output can be presented from another thread.
+    virtual bool supportsOffloadPresent() const = 0;
+
+    // Make the next call to `present` run asynchronously.
+    virtual void offloadPresentNextFrame() = 0;
 
     // Enables predicting composition strategy to run client composition earlier
     virtual void setPredictCompositionStrategy(bool) = 0;
diff --git a/services/surfaceflinger/CompositionEngine/include/compositionengine/impl/Display.h b/services/surfaceflinger/CompositionEngine/include/compositionengine/impl/Display.h
index de82931..eac5d97 100644
--- a/services/surfaceflinger/CompositionEngine/include/compositionengine/impl/Display.h
+++ b/services/surfaceflinger/CompositionEngine/include/compositionengine/impl/Display.h
@@ -62,6 +62,7 @@
     compositionengine::Output::FrameFences presentFrame() override;
     void setExpensiveRenderingExpected(bool) override;
     void finishFrame(GpuCompositionResult&&) override;
+    bool supportsOffloadPresent() const override;
 
     // compositionengine::Display overrides
     DisplayId getId() const override;
diff --git a/services/surfaceflinger/CompositionEngine/include/compositionengine/impl/Output.h b/services/surfaceflinger/CompositionEngine/include/compositionengine/impl/Output.h
index d95fbea..ec6a4e9 100644
--- a/services/surfaceflinger/CompositionEngine/include/compositionengine/impl/Output.h
+++ b/services/surfaceflinger/CompositionEngine/include/compositionengine/impl/Output.h
@@ -80,7 +80,9 @@
     void setReleasedLayers(ReleasedLayers&&) override;
 
     void prepare(const CompositionRefreshArgs&, LayerFESet&) override;
-    void present(const CompositionRefreshArgs&) override;
+    ftl::Future<std::monostate> present(const CompositionRefreshArgs&) override;
+    bool supportsOffloadPresent() const override { return false; }
+    void offloadPresentNextFrame() override;
 
     void uncacheBuffers(const std::vector<uint64_t>& bufferIdsToUncache) override;
     void rebuildLayerStacks(const CompositionRefreshArgs&, LayerFESet&) override;
@@ -121,6 +123,7 @@
     virtual std::future<bool> chooseCompositionStrategyAsync(
             std::optional<android::HWComposer::DeviceRequestedChanges>*);
     virtual void resetCompositionStrategy();
+    virtual ftl::Future<std::monostate> presentFrameAndReleaseLayersAsync();
 
 protected:
     std::unique_ptr<compositionengine::OutputLayer> createOutputLayer(const sp<LayerFE>&) const;
@@ -164,6 +167,7 @@
     ui::Dataspace getBestDataspace(ui::Dataspace*, bool*) const;
     compositionengine::Output::ColorProfile pickColorProfile(
             const compositionengine::CompositionRefreshArgs&) const;
+    void updateHwcAsyncWorker();
 
     std::string mName;
     std::string mNamePlusId;
@@ -177,6 +181,9 @@
     std::unique_ptr<planner::Planner> mPlanner;
     std::unique_ptr<HwcAsyncWorker> mHwComposerAsyncWorker;
 
+    bool mPredictCompositionStrategy = false;
+    bool mOffloadPresent = false;
+
     // Whether the content must be recomposed this frame.
     bool mMustRecompose = false;
 };
diff --git a/services/surfaceflinger/CompositionEngine/include/compositionengine/mock/Output.h b/services/surfaceflinger/CompositionEngine/include/compositionengine/mock/Output.h
index c88fbd6..95ea3a4 100644
--- a/services/surfaceflinger/CompositionEngine/include/compositionengine/mock/Output.h
+++ b/services/surfaceflinger/CompositionEngine/include/compositionengine/mock/Output.h
@@ -80,7 +80,10 @@
     MOCK_METHOD1(setReleasedLayers, void(ReleasedLayers&&));
 
     MOCK_METHOD2(prepare, void(const compositionengine::CompositionRefreshArgs&, LayerFESet&));
-    MOCK_METHOD1(present, void(const compositionengine::CompositionRefreshArgs&));
+    MOCK_METHOD1(present,
+                 ftl::Future<std::monostate>(const compositionengine::CompositionRefreshArgs&));
+    MOCK_CONST_METHOD0(supportsOffloadPresent, bool());
+    MOCK_METHOD(void, offloadPresentNextFrame, ());
 
     MOCK_METHOD1(uncacheBuffers, void(const std::vector<uint64_t>&));
     MOCK_METHOD2(rebuildLayerStacks,
diff --git a/services/surfaceflinger/CompositionEngine/src/CompositionEngine.cpp b/services/surfaceflinger/CompositionEngine/src/CompositionEngine.cpp
index 15fadbc..9041964 100644
--- a/services/surfaceflinger/CompositionEngine/src/CompositionEngine.cpp
+++ b/services/surfaceflinger/CompositionEngine/src/CompositionEngine.cpp
@@ -20,6 +20,7 @@
 #include <compositionengine/OutputLayer.h>
 #include <compositionengine/impl/CompositionEngine.h>
 #include <compositionengine/impl/Display.h>
+#include <ui/DisplayMap.h>
 
 #include <renderengine/RenderEngine.h>
 #include <utils/Trace.h>
@@ -88,6 +89,33 @@
     return mRefreshStartTime;
 }
 
+namespace {
+int numDisplaysWithOffloadPresentSupport(const CompositionRefreshArgs& args) {
+    if (!FlagManager::getInstance().multithreaded_present() || args.outputs.size() < 2) {
+        return 0;
+    }
+
+    int numEligibleDisplays = 0;
+    // Only run present in multiple threads if all HWC-enabled displays
+    // being refreshed support it.
+    if (!std::all_of(args.outputs.begin(), args.outputs.end(),
+                     [&numEligibleDisplays](const auto& output) {
+                         if (!ftl::Optional(output->getDisplayId())
+                                      .and_then(HalDisplayId::tryCast)) {
+                             // Not HWC-enabled, so it is always
+                             // client-composited.
+                             return true;
+                         }
+                         const bool support = output->supportsOffloadPresent();
+                         numEligibleDisplays += static_cast<int>(support);
+                         return support;
+                     })) {
+        return 0;
+    }
+    return numEligibleDisplays;
+}
+} // namespace
+
 void CompositionEngine::present(CompositionRefreshArgs& args) {
     ATRACE_CALL();
     ALOGV(__FUNCTION__);
@@ -105,8 +133,36 @@
         }
     }
 
+    // Offloading the HWC call for `present` allows us to simultaneously call it
+    // on multiple displays. This is desirable because these calls block and can
+    // be slow.
+    if (const int numEligibleDisplays = numDisplaysWithOffloadPresentSupport(args);
+        numEligibleDisplays > 1) {
+        // Leave the last eligible display on the main thread, which will
+        // allow it to run concurrently without an extra thread hop.
+        int numToOffload = numEligibleDisplays - 1;
+        for (auto& output : args.outputs) {
+            if (output->supportsOffloadPresent()) {
+                output->offloadPresentNextFrame();
+                if (--numToOffload == 0) {
+                    break;
+                }
+            }
+        }
+    }
+
+    ui::DisplayVector<ftl::Future<std::monostate>> presentFutures;
     for (const auto& output : args.outputs) {
-        output->present(args);
+        presentFutures.push_back(output->present(args));
+    }
+
+    {
+        ATRACE_NAME("Waiting on HWC");
+        for (auto& future : presentFutures) {
+            // TODO(b/185536303): Call ftl::Future::wait() once it exists, since
+            // we do not need the return value of get().
+            future.get();
+        }
     }
 }
 
diff --git a/services/surfaceflinger/CompositionEngine/src/Display.cpp b/services/surfaceflinger/CompositionEngine/src/Display.cpp
index 469fb38..0475881 100644
--- a/services/surfaceflinger/CompositionEngine/src/Display.cpp
+++ b/services/surfaceflinger/CompositionEngine/src/Display.cpp
@@ -430,4 +430,13 @@
     impl::Output::finishFrame(std::move(result));
 }
 
+bool Display::supportsOffloadPresent() const {
+    if (const auto halDisplayId = HalDisplayId::tryCast(mId)) {
+        const auto& hwc = getCompositionEngine().getHwComposer();
+        return hwc.hasDisplayCapability(*halDisplayId, DisplayCapability::MULTI_THREADED_PRESENT);
+    }
+
+    return false;
+}
+
 } // namespace android::compositionengine::impl
diff --git a/services/surfaceflinger/CompositionEngine/src/Output.cpp b/services/surfaceflinger/CompositionEngine/src/Output.cpp
index 2ae80de..e4d7578 100644
--- a/services/surfaceflinger/CompositionEngine/src/Output.cpp
+++ b/services/surfaceflinger/CompositionEngine/src/Output.cpp
@@ -427,7 +427,8 @@
     uncacheBuffers(refreshArgs.bufferIdsToUncache);
 }
 
-void Output::present(const compositionengine::CompositionRefreshArgs& refreshArgs) {
+ftl::Future<std::monostate> Output::present(
+        const compositionengine::CompositionRefreshArgs& refreshArgs) {
     ATRACE_FORMAT("%s for %s", __func__, mNamePlusId.c_str());
     ALOGV(__FUNCTION__);
 
@@ -448,8 +449,26 @@
 
     devOptRepaintFlash(refreshArgs);
     finishFrame(std::move(result));
-    presentFrameAndReleaseLayers();
+    ftl::Future<std::monostate> future;
+    if (mOffloadPresent) {
+        future = presentFrameAndReleaseLayersAsync();
+
+        // Only offload for this frame. The next frame will determine whether it
+        // needs to be offloaded. Leave the HwcAsyncWorker in place. For one thing,
+        // it is currently presenting. Further, it may be needed next frame, and
+        // we don't want to churn.
+        mOffloadPresent = false;
+    } else {
+        presentFrameAndReleaseLayers();
+        future = ftl::yield<std::monostate>({});
+    }
     renderCachedSets(refreshArgs);
+    return future;
+}
+
+void Output::offloadPresentNextFrame() {
+    mOffloadPresent = true;
+    updateHwcAsyncWorker();
 }
 
 void Output::uncacheBuffers(std::vector<uint64_t> const& bufferIdsToUncache) {
@@ -1084,6 +1103,14 @@
     finishPrepareFrame();
 }
 
+ftl::Future<std::monostate> Output::presentFrameAndReleaseLayersAsync() {
+    return ftl::Future<bool>(std::move(mHwComposerAsyncWorker->send([&]() {
+               presentFrameAndReleaseLayers();
+               return true;
+           })))
+            .then([](bool) { return std::monostate{}; });
+}
+
 std::future<bool> Output::chooseCompositionStrategyAsync(
         std::optional<android::HWComposer::DeviceRequestedChanges>* changes) {
     return mHwComposerAsyncWorker->send(
@@ -1600,8 +1627,15 @@
 }
 
 void Output::setPredictCompositionStrategy(bool predict) {
-    if (predict) {
-        mHwComposerAsyncWorker = std::make_unique<HwcAsyncWorker>();
+    mPredictCompositionStrategy = predict;
+    updateHwcAsyncWorker();
+}
+
+void Output::updateHwcAsyncWorker() {
+    if (mPredictCompositionStrategy || mOffloadPresent) {
+        if (!mHwComposerAsyncWorker) {
+            mHwComposerAsyncWorker = std::make_unique<HwcAsyncWorker>();
+        }
     } else {
         mHwComposerAsyncWorker.reset(nullptr);
     }
@@ -1616,7 +1650,7 @@
     uint64_t outputLayerHash = getState().outputLayerHash;
     editState().lastOutputLayerHash = outputLayerHash;
 
-    if (!getState().isEnabled || !mHwComposerAsyncWorker) {
+    if (!getState().isEnabled || !mPredictCompositionStrategy) {
         ALOGV("canPredictCompositionStrategy disabled");
         return false;
     }
diff --git a/services/surfaceflinger/CompositionEngine/tests/CompositionEngineTest.cpp b/services/surfaceflinger/CompositionEngine/tests/CompositionEngineTest.cpp
index 60ed660..a451ab2 100644
--- a/services/surfaceflinger/CompositionEngine/tests/CompositionEngineTest.cpp
+++ b/services/surfaceflinger/CompositionEngine/tests/CompositionEngineTest.cpp
@@ -20,12 +20,15 @@
 #include <compositionengine/mock/LayerFE.h>
 #include <compositionengine/mock/Output.h>
 #include <compositionengine/mock/OutputLayer.h>
+#include <ftl/future.h>
 #include <gtest/gtest.h>
 #include <renderengine/mock/RenderEngine.h>
 
 #include "MockHWComposer.h"
 #include "TimeStats/TimeStats.h"
 
+#include <variant>
+
 namespace android::compositionengine {
 namespace {
 
@@ -107,10 +110,19 @@
     EXPECT_CALL(*mOutput2, prepare(Ref(mRefreshArgs), _));
     EXPECT_CALL(*mOutput3, prepare(Ref(mRefreshArgs), _));
 
+    if (FlagManager::getInstance().multithreaded_present()) {
+        EXPECT_CALL(*mOutput1, getDisplayId()).WillOnce(Return(std::nullopt));
+        EXPECT_CALL(*mOutput2, getDisplayId()).WillOnce(Return(std::nullopt));
+        EXPECT_CALL(*mOutput3, getDisplayId()).WillOnce(Return(std::nullopt));
+    }
+
     // The last step is to actually present each output.
-    EXPECT_CALL(*mOutput1, present(Ref(mRefreshArgs)));
-    EXPECT_CALL(*mOutput2, present(Ref(mRefreshArgs)));
-    EXPECT_CALL(*mOutput3, present(Ref(mRefreshArgs)));
+    EXPECT_CALL(*mOutput1, present(Ref(mRefreshArgs)))
+            .WillOnce(Return(ftl::yield<std::monostate>({})));
+    EXPECT_CALL(*mOutput2, present(Ref(mRefreshArgs)))
+            .WillOnce(Return(ftl::yield<std::monostate>({})));
+    EXPECT_CALL(*mOutput3, present(Ref(mRefreshArgs)))
+            .WillOnce(Return(ftl::yield<std::monostate>({})));
 
     mRefreshArgs.outputs = {mOutput1, mOutput2, mOutput3};
     mEngine.present(mRefreshArgs);
diff --git a/services/surfaceflinger/CompositionEngine/tests/OutputTest.cpp b/services/surfaceflinger/CompositionEngine/tests/OutputTest.cpp
index 5537fcd..5006e7d 100644
--- a/services/surfaceflinger/CompositionEngine/tests/OutputTest.cpp
+++ b/services/surfaceflinger/CompositionEngine/tests/OutputTest.cpp
@@ -35,6 +35,7 @@
 
 #include <cmath>
 #include <cstdint>
+#include <variant>
 
 #include "CallOrderStateMachineHelper.h"
 #include "MockHWC2.h"
@@ -54,6 +55,7 @@
 using testing::Invoke;
 using testing::IsEmpty;
 using testing::Mock;
+using testing::NiceMock;
 using testing::Pointee;
 using testing::Property;
 using testing::Ref;
@@ -4900,5 +4902,54 @@
     EXPECT_EQ(mLayers[2].mLayerSettings, requests[0]);
 }
 
+struct OutputPresentFrameAndReleaseLayersAsyncTest : public ::testing::Test {
+    // Piggy-back on OutputPrepareFrameAsyncTest's version to avoid some boilerplate.
+    struct OutputPartialMock : public OutputPrepareFrameAsyncTest::OutputPartialMock {
+        // Set up the helper functions called by the function under test to use
+        // mock implementations.
+        MOCK_METHOD0(presentFrameAndReleaseLayers, void());
+        MOCK_METHOD0(presentFrameAndReleaseLayersAsync, ftl::Future<std::monostate>());
+    };
+    OutputPresentFrameAndReleaseLayersAsyncTest() {
+        mOutput->setDisplayColorProfileForTest(
+                std::unique_ptr<DisplayColorProfile>(mDisplayColorProfile));
+        mOutput->setRenderSurfaceForTest(std::unique_ptr<RenderSurface>(mRenderSurface));
+        mOutput->setCompositionEnabled(true);
+        mRefreshArgs.outputs = {mOutput};
+    }
+
+    mock::DisplayColorProfile* mDisplayColorProfile = new NiceMock<mock::DisplayColorProfile>();
+    mock::RenderSurface* mRenderSurface = new NiceMock<mock::RenderSurface>();
+    std::shared_ptr<OutputPartialMock> mOutput{std::make_shared<NiceMock<OutputPartialMock>>()};
+    CompositionRefreshArgs mRefreshArgs;
+};
+
+TEST_F(OutputPresentFrameAndReleaseLayersAsyncTest, notCalledWhenNotRequested) {
+    EXPECT_CALL(*mOutput, presentFrameAndReleaseLayersAsync()).Times(0);
+    EXPECT_CALL(*mOutput, presentFrameAndReleaseLayers()).Times(1);
+
+    mOutput->present(mRefreshArgs);
+}
+
+TEST_F(OutputPresentFrameAndReleaseLayersAsyncTest, calledWhenRequested) {
+    EXPECT_CALL(*mOutput, presentFrameAndReleaseLayersAsync())
+            .WillOnce(Return(ftl::yield<std::monostate>({})));
+    EXPECT_CALL(*mOutput, presentFrameAndReleaseLayers()).Times(0);
+
+    mOutput->offloadPresentNextFrame();
+    mOutput->present(mRefreshArgs);
+}
+
+TEST_F(OutputPresentFrameAndReleaseLayersAsyncTest, calledForOneFrame) {
+    ::testing::InSequence inseq;
+    EXPECT_CALL(*mOutput, presentFrameAndReleaseLayersAsync())
+            .WillOnce(Return(ftl::yield<std::monostate>({})));
+    EXPECT_CALL(*mOutput, presentFrameAndReleaseLayers()).Times(1);
+
+    mOutput->offloadPresentNextFrame();
+    mOutput->present(mRefreshArgs);
+    mOutput->present(mRefreshArgs);
+}
+
 } // namespace
 } // namespace android::compositionengine