DL lifecycle: handle slow DL binding.

Bug: 182214420
Test: atest PackageManagerShellCommandTest PackageManagerShellCommandIncrementalTest IncrementalServiceTest PackageManagerServiceTest ChecksumsTest
Change-Id: I5959e01177ab702de1f754f4ba433004925ce98b
diff --git a/services/incremental/IncrementalService.cpp b/services/incremental/IncrementalService.cpp
index 1fcc284..c38d0b3 100644
--- a/services/incremental/IncrementalService.cpp
+++ b/services/incremental/IncrementalService.cpp
@@ -74,6 +74,13 @@
 
     // If DL was up and not crashing for 10mins, we consider it healthy and reset all delays.
     static constexpr auto healthyDataLoaderUptime = 10min;
+
+    // For healthy DLs, we'll retry every ~5secs for ~10min
+    static constexpr auto bindRetryInterval = 5s;
+    static constexpr auto bindGracePeriod = 10min;
+
+    static constexpr auto bindingTimeout = 1min;
+
     // 10s, 100s (~2min), 1000s (~15min), 10000s (~3hrs)
     static constexpr auto minBindDelay = 10s;
     static constexpr auto maxBindDelay = 10000s;
@@ -293,6 +300,7 @@
         mTimedQueue(sm.getTimedQueue()),
         mProgressUpdateJobQueue(sm.getProgressUpdateJobQueue()),
         mFs(sm.getFs()),
+        mClock(sm.getClock()),
         mIncrementalDir(rootDir) {
     CHECK(mVold) << "Vold service is unavailable";
     CHECK(mDataLoaderManager) << "DataLoaderManagerService is unavailable";
@@ -302,6 +310,7 @@
     CHECK(mTimedQueue) << "TimedQueue is unavailable";
     CHECK(mProgressUpdateJobQueue) << "mProgressUpdateJobQueue is unavailable";
     CHECK(mFs) << "Fs is unavailable";
+    CHECK(mClock) << "Clock is unavailable";
 
     mJobQueue.reserve(16);
     mJobProcessor = std::thread([this]() {
@@ -2241,17 +2250,44 @@
                << status << " (current " << mCurrentStatus << ")";
 }
 
-Milliseconds IncrementalService::DataLoaderStub::updateBindDelay() {
+std::optional<Milliseconds> IncrementalService::DataLoaderStub::needToBind() {
     std::unique_lock lock(mMutex);
+
+    const auto now = mService.mClock->now();
+    const bool healthy = (mPreviousBindDelay == 0ms);
+
+    if (mCurrentStatus == IDataLoaderStatusListener::DATA_LOADER_BINDING &&
+        now - mCurrentStatusTs <= Constants::bindingTimeout) {
+        LOG(INFO) << "Binding still in progress. "
+                  << (healthy ? "The DL is healthy/freshly bound, ok to retry for a few times."
+                              : "Already unhealthy, don't do anything.");
+        // Binding still in progress.
+        if (!healthy) {
+            // Already unhealthy, don't do anything.
+            return {};
+        }
+        // The DL is healthy/freshly bound, ok to retry for a few times.
+        if (now - mPreviousBindTs <= Constants::bindGracePeriod) {
+            // Still within grace period.
+            if (now - mCurrentStatusTs >= Constants::bindRetryInterval) {
+                // Retry interval passed, retrying.
+                mCurrentStatusTs = now;
+                mPreviousBindDelay = 0ms;
+                return 0ms;
+            }
+            return {};
+        }
+        // fallthrough, mark as unhealthy, and retry with delay
+    }
+
     const auto previousBindTs = mPreviousBindTs;
-    const auto now = Clock::now();
     mPreviousBindTs = now;
 
     const auto nonCrashingInterval = std::max(castToMs(now - previousBindTs), 100ms);
     if (previousBindTs.time_since_epoch() == Clock::duration::zero() ||
         nonCrashingInterval > Constants::healthyDataLoaderUptime) {
         mPreviousBindDelay = 0ms;
-        return mPreviousBindDelay;
+        return 0ms;
     }
 
     constexpr auto minBindDelayMs = castToMs(Constants::minBindDelay);
@@ -2264,12 +2300,16 @@
     const auto bindDelayJitterRangeMs = bindDelayMs / Constants::bindDelayJitterDivider;
     const auto bindDelayJitterMs = rand() % (bindDelayJitterRangeMs * 2) - bindDelayJitterRangeMs;
     mPreviousBindDelay = std::chrono::milliseconds(bindDelayMs + bindDelayJitterMs);
-
     return mPreviousBindDelay;
 }
 
 bool IncrementalService::DataLoaderStub::bind() {
-    const auto bindDelay = updateBindDelay();
+    const auto maybeBindDelay = needToBind();
+    if (!maybeBindDelay) {
+        LOG(DEBUG) << "Skipping bind to " << mParams.packageName << " because of pending bind.";
+        return true;
+    }
+    const auto bindDelay = *maybeBindDelay;
     if (bindDelay > 1s) {
         LOG(INFO) << "Delaying bind to " << mParams.packageName << " by "
                   << bindDelay.count() / 1000 << "s";
@@ -2279,7 +2319,21 @@
     auto status = mService.mDataLoaderManager->bindToDataLoader(id(), mParams, bindDelay.count(),
                                                                 this, &result);
     if (!status.isOk() || !result) {
-        LOG(ERROR) << "Failed to bind a data loader for mount " << id();
+        const bool healthy = (bindDelay == 0ms);
+        LOG(ERROR) << "Failed to bind a data loader for mount " << id()
+                   << (healthy ? ", retrying." : "");
+
+        // Internal error, retry for healthy/new DLs.
+        // Let needToBind migrate it to unhealthy after too many retries.
+        if (healthy) {
+            if (mService.addTimedJob(*mService.mTimedQueue, id(), Constants::bindRetryInterval,
+                                     [this]() { fsmStep(); })) {
+                // Mark as binding so that we know it's not the DL's fault.
+                setCurrentStatus(IDataLoaderStatusListener::DATA_LOADER_BINDING);
+                return true;
+            }
+        }
+
         return false;
     }
     return true;
@@ -2339,7 +2393,14 @@
             // Do nothing, this is a reset state.
             break;
         case IDataLoaderStatusListener::DATA_LOADER_DESTROYED: {
-            return destroy();
+            switch (currentStatus) {
+                case IDataLoaderStatusListener::DATA_LOADER_BINDING:
+                    setCurrentStatus(IDataLoaderStatusListener::DATA_LOADER_DESTROYED);
+                    return true;
+                default:
+                    return destroy();
+            }
+            break;
         }
         case IDataLoaderStatusListener::DATA_LOADER_STARTED: {
             switch (currentStatus) {
@@ -2353,6 +2414,7 @@
             switch (currentStatus) {
                 case IDataLoaderStatusListener::DATA_LOADER_DESTROYED:
                 case IDataLoaderStatusListener::DATA_LOADER_UNAVAILABLE:
+                case IDataLoaderStatusListener::DATA_LOADER_BINDING:
                     return bind();
                 case IDataLoaderStatusListener::DATA_LOADER_BOUND:
                     return create();
@@ -2372,7 +2434,8 @@
                 fromServiceSpecificError(-EINVAL, "onStatusChange came to invalid DataLoaderStub");
     }
     if (id() != mountId) {
-        LOG(ERROR) << "Mount ID mismatch: expected " << id() << ", but got: " << mountId;
+        LOG(ERROR) << "onStatusChanged: mount ID mismatch: expected " << id()
+                   << ", but got: " << mountId;
         return binder::Status::fromServiceSpecificError(-EPERM, "Mount ID mismatch.");
     }
     if (newStatus == IDataLoaderStatusListener::DATA_LOADER_UNRECOVERABLE) {
@@ -2396,11 +2459,13 @@
         }
 
         oldStatus = mCurrentStatus;
-        mCurrentStatus = newStatus;
         targetStatus = mTargetStatus;
-
         listener = mStatusListener;
 
+        // Change the status.
+        mCurrentStatus = newStatus;
+        mCurrentStatusTs = mService.mClock->now();
+
         if (mCurrentStatus == IDataLoaderStatusListener::DATA_LOADER_UNAVAILABLE ||
             mCurrentStatus == IDataLoaderStatusListener::DATA_LOADER_UNRECOVERABLE) {
             // For unavailable, unbind from DataLoader to ensure proper re-commit.
@@ -2428,7 +2493,8 @@
                                          "reportStreamHealth came to invalid DataLoaderStub");
     }
     if (id() != mountId) {
-        LOG(ERROR) << "Mount ID mismatch: expected " << id() << ", but got: " << mountId;
+        LOG(ERROR) << "reportStreamHealth: mount ID mismatch: expected " << id()
+                   << ", but got: " << mountId;
         return binder::Status::fromServiceSpecificError(-EPERM, "Mount ID mismatch.");
     }
     {
@@ -2694,6 +2760,8 @@
 void IncrementalService::DataLoaderStub::onDump(int fd) {
     dprintf(fd, "    dataLoader: {\n");
     dprintf(fd, "      currentStatus: %d\n", mCurrentStatus);
+    dprintf(fd, "      currentStatusTs: %lldmcs\n",
+            (long long)(elapsedMcs(mCurrentStatusTs, Clock::now())));
     dprintf(fd, "      targetStatus: %d\n", mTargetStatus);
     dprintf(fd, "      targetStatusTs: %lldmcs\n",
             (long long)(elapsedMcs(mTargetStatusTs, Clock::now())));