Support priority inheritance mutexes in 32-bit programs.
Add a fast path calling PIMutexTryLock() in pthread_mutex_lock().
Add a trace for PI mutex contention.
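
With this change, PTHREAD_PRIO_INHERIT mutexes no longer fail with EINVAL
in 32-bit programs. A minimal usage sketch (hypothetical example function,
error handling omitted):

  #include <pthread.h>

  void example(void) {
    pthread_mutexattr_t attr;
    pthread_mutexattr_init(&attr);
    pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
    pthread_mutex_t m;
    pthread_mutex_init(&m, &attr);  // Allocates a 16-bit PIMutex id in 32-bit programs.
    pthread_mutexattr_destroy(&attr);
    pthread_mutex_lock(&m);         // Fast path: PIMutexTryLock().
    pthread_mutex_unlock(&m);
    pthread_mutex_destroy(&m);      // Returns the id to the free list in 32-bit programs.
  }

Process-shared PI mutexes still return EINVAL in 32-bit programs.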
Bug: http://b/29177606
Test: run bionic-unit-tests.
Test: run bionic-benchmarks.
Change-Id: I30b6436692d5ea6b63ca9905df745edb843b5528
diff --git a/libc/bionic/pthread_mutex.cpp b/libc/bionic/pthread_mutex.cpp
index ed90639..e5f2a31 100644
--- a/libc/bionic/pthread_mutex.cpp
+++ b/libc/bionic/pthread_mutex.cpp
@@ -31,6 +31,7 @@
#include <errno.h>
#include <limits.h>
#include <stdatomic.h>
+#include <stdlib.h>
#include <string.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
@@ -130,8 +131,6 @@
return 0;
}
-#if defined(__LP64__)
-
// Priority Inheritance mutex implementation
struct PIMutex {
// mutex type, can be 0 (normal), 1 (recursive), 2 (errorcheck), constant during lifetime
@@ -172,12 +171,16 @@
return EBUSY;
}
-static int PIMutexTimedLock(PIMutex& mutex, const timespec* abs_timeout) {
+// Inlining this function in pthread_mutex_lock() adds the cost of stack frame instructions on
+// ARM/ARM64, which increases overhead by up to 20 percent. So make it noinline.
+static int __attribute__((noinline)) PIMutexTimedLock(PIMutex& mutex,
+ const timespec* abs_timeout) {
int ret = PIMutexTryLock(mutex);
if (__predict_true(ret == 0)) {
return 0;
}
if (ret == EBUSY) {
+ ScopedTrace trace("Contending for pthread mutex");
ret = -__futex_pi_lock_ex(&mutex.owner_tid, mutex.shared, true, abs_timeout);
}
return ret;
@@ -228,7 +231,97 @@
}
return EBUSY;
}
-#endif // defined(__LP64__)
+
+#if !defined(__LP64__)
+
+namespace PIMutexAllocator {
+// pthread_mutex_t has only 4 bytes in 32-bit programs, which is not enough to hold a PIMutex.
+// So we use malloc to allocate PIMutexes and use 16 bits of pthread_mutex_t as an index to
+// find the allocated PIMutex. This allows at most 65536 PI mutexes.
+// When calling operations like pthread_mutex_lock/unlock, the 16-bit index is mapped to the
+// corresponding PIMutex. To make the mapping fast, we use a lockless method:
+// once a PIMutex is allocated, none of the data used to map the index to the PIMutex changes
+// until the PIMutex is destroyed.
+// Below are the data structures:
+// // struct Node contains a PIMutex.
+// typedef Node NodeArray[256];
+// typedef NodeArray* NodeArrayP;
+// NodeArrayP nodes[256];
+//
+// A 16-bit index is mapped to a Node as follows:
+// (*nodes[index >> 8])[index & 0xff]
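+// For example, an id of 0x1234 (illustrative value) maps to (*nodes[0x12])[0x34].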
+//
+// A free list is also used so that recycled PIMutexes can be found in O(1).
+
+union Node {
+ PIMutex mutex;
+ int next_free_id; // If not -1, refers to the next node in the free PIMutex list.
+};
+typedef Node NodeArray[256];
+typedef NodeArray* NodeArrayP;
+
+// lock protects the fields below.
+static Lock lock;
+static NodeArrayP* nodes;
+static int next_to_alloc_id;
+static int first_free_id = -1; // If not -1, refers to the first node in the free PIMutex list.
+
+static inline __always_inline Node& IdToNode(int id) {
+ return (*nodes[id >> 8])[id & 0xff];
+}
+
+static inline __always_inline PIMutex& IdToPIMutex(int id) {
+ return IdToNode(id).mutex;
+}
+
+static int AllocIdLocked() {
+ if (first_free_id != -1) {
+ int result = first_free_id;
+ first_free_id = IdToNode(result).next_free_id;
+ return result;
+ }
+ if (next_to_alloc_id >= 0x10000) {
+ return -1;
+ }
+ int array_pos = next_to_alloc_id >> 8;
+ int node_pos = next_to_alloc_id & 0xff;
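+ // node_pos == 0 means a new NodeArray is needed. On the very first
+ // allocation, also allocate the top-level nodes table.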
+ if (node_pos == 0) {
+ if (array_pos == 0) {
+ nodes = static_cast<NodeArray**>(calloc(256, sizeof(NodeArray*)));
+ if (nodes == nullptr) {
+ return -1;
+ }
+ }
+ nodes[array_pos] = static_cast<NodeArray*>(malloc(sizeof(NodeArray)));
+ if (nodes[array_pos] == nullptr) {
+ return -1;
+ }
+ }
+ return next_to_alloc_id++;
+}
+
+// On success, returns an id referring to a PIMutex; otherwise returns -1.
+// A valid id is in the range [0, 0xffff].
+static int AllocId() {
+ lock.lock();
+ int result = AllocIdLocked();
+ lock.unlock();
+ if (result != -1) {
+ memset(&IdToPIMutex(result), 0, sizeof(PIMutex));
+ }
+ return result;
+}
+
+static void FreeId(int id) {
+ lock.lock();
+ IdToNode(id).next_free_id = first_free_id;
+ first_free_id = id;
+ lock.unlock();
+}
+
+} // namespace PIMutexAllocator
+
+#endif // !defined(__LP64__)
/* Convenience macro, creates a mask of 'bits' bits that starts from
@@ -324,7 +417,7 @@
// For a PI mutex, it includes below fields:
// Atomic(uint16_t) state;
-// PIMutex pi_mutex;
+// PIMutex pi_mutex; // uint16_t pi_mutex_id in 32-bit programs
//
// state holds the following fields:
//
@@ -332,6 +425,7 @@
// 15-14 type mutex type, should be 3
//
// pi_mutex holds the state of a PI mutex.
+// pi_mutex_id holds a 16-bit id used to find the state of a PI mutex.
//
// For a Non-PI mutex, it includes below fields:
// Atomic(uint16_t) state;
@@ -351,20 +445,42 @@
// thread id.
//
// PI mutexes and Non-PI mutexes are distinguished by checking type field in state.
-struct pthread_mutex_internal_t {
- _Atomic(uint16_t) state;
#if defined(__LP64__)
- uint16_t __pad;
- union {
- atomic_int owner_tid;
- PIMutex pi_mutex;
- };
- char __reserved[28];
-#else
- _Atomic(uint16_t) owner_tid;
-#endif
+struct pthread_mutex_internal_t {
+ _Atomic(uint16_t) state;
+ uint16_t __pad;
+ union {
+ atomic_int owner_tid;
+ PIMutex pi_mutex;
+ };
+ char __reserved[28];
+
+ PIMutex& ToPIMutex() {
+ return pi_mutex;
+ }
+
+ void FreePIMutex() {
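+ // Nothing to do: the PIMutex is stored inline in pthread_mutex_internal_t.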
+ }
} __attribute__((aligned(4)));
+#else
+struct pthread_mutex_internal_t {
+ _Atomic(uint16_t) state;
+ union {
+ _Atomic(uint16_t) owner_tid;
+ uint16_t pi_mutex_id;
+ };
+
+ PIMutex& ToPIMutex() {
+ return PIMutexAllocator::IdToPIMutex(pi_mutex_id);
+ }
+
+ void FreePIMutex() {
+ PIMutexAllocator::FreeId(pi_mutex_id);
+ }
+} __attribute__((aligned(4)));
+#endif
+
static_assert(sizeof(pthread_mutex_t) == sizeof(pthread_mutex_internal_t),
"pthread_mutex_t should actually be pthread_mutex_internal_t in implementation.");
@@ -407,13 +523,20 @@
}
if (((*attr & MUTEXATTR_PROTOCOL_MASK) >> MUTEXATTR_PROTOCOL_SHIFT) == PTHREAD_PRIO_INHERIT) {
-#if defined(__LP64__)
- atomic_init(&mutex->state, MUTEX_TYPE_BITS_WITH_PI);
- mutex->pi_mutex.type = *attr & MUTEXATTR_TYPE_MASK;
- mutex->pi_mutex.shared = (*attr & MUTEXATTR_SHARED_MASK) != 0;
-#else
- return EINVAL;
+#if !defined(__LP64__)
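+ // The 16-bit id refers to process-local memory, so process-shared PI
+ // mutexes can't be supported in 32-bit programs.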
+ if (state & MUTEX_SHARED_MASK) {
+ return EINVAL;
+ }
+ int id = PIMutexAllocator::AllocId();
+ if (id == -1) {
+ return ENOMEM;
+ }
+ mutex->pi_mutex_id = id;
#endif
+ atomic_init(&mutex->state, MUTEX_TYPE_BITS_WITH_PI);
+ PIMutex& pi_mutex = mutex->ToPIMutex();
+ pi_mutex.type = *attr & MUTEXATTR_TYPE_MASK;
+ pi_mutex.shared = (*attr & MUTEXATTR_SHARED_MASK) != 0;
} else {
atomic_init(&mutex->state, state);
atomic_init(&mutex->owner_tid, 0);
@@ -668,18 +791,20 @@
uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
uint16_t mtype = (old_state & MUTEX_TYPE_MASK);
- uint16_t shared = (old_state & MUTEX_SHARED_MASK);
// Avoid slowing down fast path of normal mutex lock operation.
if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
- if (__predict_true(NonPI::NormalMutexTryLock(mutex, shared) == 0)) {
- return 0;
- }
+ uint16_t shared = (old_state & MUTEX_SHARED_MASK);
+ if (__predict_true(NonPI::NormalMutexTryLock(mutex, shared) == 0)) {
+ return 0;
+ }
+ } else if (mtype == MUTEX_TYPE_BITS_WITH_PI) {
+ PIMutex& m = mutex->ToPIMutex();
+ // Handle common case first.
+ if (__predict_true(PIMutexTryLock(m) == 0)) {
+ return 0;
+ }
+ return PIMutexTimedLock(m, nullptr);
}
-#if defined(__LP64__)
- if (mtype == MUTEX_TYPE_BITS_WITH_PI) {
- return PIMutexTimedLock(mutex->pi_mutex, nullptr);
- }
-#endif
return NonPI::MutexLockWithTimeout(mutex, false, nullptr);
}
@@ -704,11 +829,9 @@
NonPI::NormalMutexUnlock(mutex, shared);
return 0;
}
-#if defined(__LP64__)
if (mtype == MUTEX_TYPE_BITS_WITH_PI) {
- return PIMutexUnlock(mutex->pi_mutex);
+ return PIMutexUnlock(mutex->ToPIMutex());
}
-#endif
// Do we already own this recursive or error-check mutex?
pid_t tid = __get_thread()->tid;
@@ -752,11 +875,9 @@
uint16_t shared = (old_state & MUTEX_SHARED_MASK);
return NonPI::NormalMutexTryLock(mutex, shared);
}
-#if defined(__LP64__)
if (mtype == MUTEX_TYPE_BITS_WITH_PI) {
- return PIMutexTryLock(mutex->pi_mutex);
+ return PIMutexTryLock(mutex->ToPIMutex());
}
-#endif
// Do we already own this recursive or error-check mutex?
pid_t tid = __get_thread()->tid;
@@ -813,23 +934,23 @@
return 0;
}
}
-#if defined(__LP64__)
if (mtype == MUTEX_TYPE_BITS_WITH_PI) {
- return PIMutexTimedLock(mutex->pi_mutex, abs_timeout);
+ return PIMutexTimedLock(mutex->ToPIMutex(), abs_timeout);
}
-#endif
return NonPI::MutexLockWithTimeout(mutex, true, abs_timeout);
}
int pthread_mutex_destroy(pthread_mutex_t* mutex_interface) {
pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
-#if defined(__LP64__)
uint16_t mtype = (old_state & MUTEX_TYPE_MASK);
if (mtype == MUTEX_TYPE_BITS_WITH_PI) {
- return PIMutexDestroy(mutex->pi_mutex);
+ int result = PIMutexDestroy(mutex->ToPIMutex());
+ if (result == 0) {
+ mutex->FreePIMutex();
+ }
+ return result;
}
-#endif
// Store 0xffff to make the mutex unusable. Although POSIX standard says it is undefined
// behavior to destroy a locked mutex, we prefer not to change mutex->state in that situation.
if (MUTEX_STATE_BITS_IS_UNLOCKED(old_state) &&