pthread_cond: only call futex_wake when there are waiters

pthread_cond_pulse would unconditionally call futex, which meant that
a number of processes were spending a significant amount of time in
pthread_cond_broadcast and pthread_cond_signal when there were no
threads waiting on that pthread_cond_t.

This change adds a counter to the 64-bit pthread_cond_t struct and
only calls futex() in cases where there is a nonzero waiter count. The
32-bit pthread_cond_t is unchanged due to compatibility reasons.

Test: no pthread_cond_broadcast/signal stacks without try_to_wake_up
in SurfaceFlinger
bug: 168831708

Change-Id: I105e1345cd2a3a75f98cd0acf316e790ba1716f0
diff --git a/libc/bionic/pthread_cond.cpp b/libc/bionic/pthread_cond.cpp
index e857069..793dcd9 100644
--- a/libc/bionic/pthread_cond.cpp
+++ b/libc/bionic/pthread_cond.cpp
@@ -116,7 +116,8 @@
   }
 
 #if defined(__LP64__)
-  char __reserved[44];
+  atomic_uint waiters;
+  char __reserved[40];
 #endif
 };
 
@@ -141,6 +142,10 @@
   }
   atomic_init(&cond->state, init_state);
 
+#if defined(__LP64__)
+  atomic_init(&cond->waiters, 0);
+#endif
+
   return 0;
 }
 
@@ -163,6 +168,12 @@
   // not be called. That's why pthread_wait/signal pair can't be used as a method for memory
   // synchronization. And it doesn't help even if we use any fence here.
 
+#if defined(__LP64__)
+  if (atomic_load_explicit(&cond->waiters, memory_order_relaxed) == 0) {
+    return 0;
+  }
+#endif
+
   // The increase of value should leave flags alone, even if the value can overflows.
   atomic_fetch_add_explicit(&cond->state, COND_COUNTER_STEP, memory_order_relaxed);
 
@@ -178,9 +189,19 @@
   }
 
   unsigned int old_state = atomic_load_explicit(&cond->state, memory_order_relaxed);
+
+#if defined(__LP64__)
+  atomic_fetch_add_explicit(&cond->waiters, 1, memory_order_relaxed);
+#endif
+
   pthread_mutex_unlock(mutex);
   int status = __futex_wait_ex(&cond->state, cond->process_shared(), old_state,
                                use_realtime_clock, abs_timeout_or_null);
+
+#if defined(__LP64__)
+  atomic_fetch_sub_explicit(&cond->waiters, 1, memory_order_relaxed);
+#endif
+
   pthread_mutex_lock(mutex);
 
   if (status == -ETIMEDOUT) {