Reland "servicemanager: fix lazy service issues"

This reverts commit 66417657c54cebaeda3be19e539bb17d89172ed4.

This CL is relanding removing a check, that was
actually hit when services die.

Most of the issues here occured in a single thread, and
they were due to inconsistent state in the state machine
around clients, and it appears there were a lot of other
things added which papered over these issues.

I've marked explanations in more detail in the code
itself.

Bug: 264814573
Test: aidl_lazy_test in a loop at 500x speed by
    adjusting timings, running for an hour

Change-Id: Icbd72470dbe91761800761e81893f3ebe343e430
diff --git a/cmds/servicemanager/ServiceManager.cpp b/cmds/servicemanager/ServiceManager.cpp
index 695faf8..eadf678 100644
--- a/cmds/servicemanager/ServiceManager.cpp
+++ b/cmds/servicemanager/ServiceManager.cpp
@@ -222,6 +222,13 @@
 }
 #endif  // !VENDORSERVICEMANAGER
 
+ServiceManager::Service::~Service() {
+    if (!hasClients) {
+        // only expected to happen on process death
+        LOG(WARNING) << "a service was removed when there are clients";
+    }
+}
+
 ServiceManager::ServiceManager(std::unique_ptr<Access>&& access) : mAccess(std::move(access)) {
 // TODO(b/151696835): reenable performance hack when we solve bug, since with
 //     this hack and other fixes, it is unlikely we will see even an ephemeral
@@ -293,8 +300,13 @@
     }
 
     if (out) {
-        // Setting this guarantee each time we hand out a binder ensures that the client-checking
-        // loop knows about the event even if the client immediately drops the service
+        // Force onClients to get sent, and then make sure the timerfd won't clear it
+        // by setting guaranteeClient again. This logic could be simplified by using
+        // a time-based guarantee. However, forcing onClients(true) to get sent
+        // right here is always going to be important for processes serving multiple
+        // lazy interfaces.
+        service->guaranteeClient = true;
+        CHECK(handleServiceClientCallback(2 /* sm + transaction */, name, false));
         service->guaranteeClient = true;
     }
 
@@ -384,8 +396,13 @@
     };
 
     if (auto it = mNameToRegistrationCallback.find(name); it != mNameToRegistrationCallback.end()) {
+        // See also getService - handles case where client never gets the service,
+        // we want the service to quit.
+        mNameToService[name].guaranteeClient = true;
+        CHECK(handleServiceClientCallback(2 /* sm + transaction */, name, false));
+        mNameToService[name].guaranteeClient = true;
+
         for (const sp<IServiceCallback>& cb : it->second) {
-            mNameToService[name].guaranteeClient = true;
             // permission checked in registerForNotifications
             cb->onRegistration(name, binder);
         }
@@ -696,28 +713,28 @@
 
 void ServiceManager::handleClientCallbacks() {
     for (const auto& [name, service] : mNameToService) {
-        handleServiceClientCallback(name, true);
+        handleServiceClientCallback(1 /* sm has one refcount */, name, true);
     }
 }
 
-ssize_t ServiceManager::handleServiceClientCallback(const std::string& serviceName,
-                                                    bool isCalledOnInterval) {
+bool ServiceManager::handleServiceClientCallback(size_t knownClients,
+                                                 const std::string& serviceName,
+                                                 bool isCalledOnInterval) {
     auto serviceIt = mNameToService.find(serviceName);
     if (serviceIt == mNameToService.end() || mNameToClientCallback.count(serviceName) < 1) {
-        return -1;
+        return true; // return we do have clients a.k.a. DON'T DO ANYTHING
     }
 
     Service& service = serviceIt->second;
     ssize_t count = service.getNodeStrongRefCount();
 
-    // binder driver doesn't support this feature
-    if (count == -1) return count;
+    // binder driver doesn't support this feature, consider we have clients
+    if (count == -1) return true;
 
-    bool hasClients = count > 1; // this process holds a strong count
+    bool hasKernelReportedClients = static_cast<size_t>(count) > knownClients;
 
     if (service.guaranteeClient) {
-        // we have no record of this client
-        if (!service.hasClients && !hasClients) {
+        if (!service.hasClients && !hasKernelReportedClients) {
             sendClientCallbackNotifications(serviceName, true,
                                             "service is guaranteed to be in use");
         }
@@ -726,21 +743,23 @@
         service.guaranteeClient = false;
     }
 
-    // only send notifications if this was called via the interval checking workflow
-    if (isCalledOnInterval) {
-        if (hasClients && !service.hasClients) {
-            // client was retrieved in some other way
-            sendClientCallbackNotifications(serviceName, true, "we now have a record of a client");
-        }
+    // Regardless of this situation, we want to give this notification as soon as possible.
+    // This way, we have a chance of preventing further thrashing.
+    if (hasKernelReportedClients && !service.hasClients) {
+        sendClientCallbackNotifications(serviceName, true, "we now have a record of a client");
+    }
 
-        // there are no more clients, but the callback has not been called yet
-        if (!hasClients && service.hasClients) {
+    // But limit rate of shutting down service.
+    if (isCalledOnInterval) {
+        if (!hasKernelReportedClients && service.hasClients) {
             sendClientCallbackNotifications(serviceName, false,
                                             "we now have no record of a client");
         }
     }
 
-    return count;
+    // May be different than 'hasKernelReportedClients'. We intentionally delay
+    // information about clients going away to reduce thrashing.
+    return service.hasClients;
 }
 
 void ServiceManager::sendClientCallbackNotifications(const std::string& serviceName,
@@ -753,13 +772,10 @@
     }
     Service& service = serviceIt->second;
 
-    CHECK(hasClients != service.hasClients)
-            << "Record shows: " << service.hasClients
-            << " so we can't tell clients again that we have client: " << hasClients
-            << " when: " << context;
+    CHECK_NE(hasClients, service.hasClients) << context;
 
-    ALOGI("Notifying %s they %s have clients when %s", serviceName.c_str(),
-          hasClients ? "do" : "don't", context);
+    ALOGI("Notifying %s they %s (previously: %s) have clients when %s", serviceName.c_str(),
+          hasClients ? "do" : "don't", service.hasClients ? "do" : "don't", context);
 
     auto ccIt = mNameToClientCallback.find(serviceName);
     CHECK(ccIt != mNameToClientCallback.end())
@@ -803,26 +819,29 @@
         return Status::fromExceptionCode(Status::EX_ILLEGAL_STATE);
     }
 
+    // important because we don't have timer-based guarantees, we don't want to clear
+    // this
     if (serviceIt->second.guaranteeClient) {
         ALOGI("Tried to unregister %s, but there is about to be a client.", name.c_str());
         return Status::fromExceptionCode(Status::EX_ILLEGAL_STATE);
     }
 
-    int clients = handleServiceClientCallback(name, false);
-
-    // clients < 0: feature not implemented or other error. Assume clients.
-    // Otherwise:
     // - kernel driver will hold onto one refcount (during this transaction)
     // - servicemanager has a refcount (guaranteed by this transaction)
-    // So, if clients > 2, then at least one other service on the system must hold a refcount.
-    if (clients < 0 || clients > 2) {
-        // client callbacks are either disabled or there are other clients
-        ALOGI("Tried to unregister %s, but there are clients: %d", name.c_str(), clients);
-        // Set this flag to ensure the clients are acknowledged in the next callback
+    constexpr size_t kKnownClients = 2;
+
+    if (handleServiceClientCallback(kKnownClients, name, false)) {
+        ALOGI("Tried to unregister %s, but there are clients.", name.c_str());
+
+        // Since we had a failed registration attempt, and the HIDL implementation of
+        // delaying service shutdown for multiple periods wasn't ported here... this may
+        // help reduce thrashing, but we should be able to remove it.
         serviceIt->second.guaranteeClient = true;
+
         return Status::fromExceptionCode(Status::EX_ILLEGAL_STATE);
     }
 
+    ALOGI("Unregistering %s", name.c_str());
     mNameToService.erase(name);
 
     return Status::ok();