Avoid using malloc debug code after exit.

I wrote a new unit test that would fail on the old version of the
code.

On a walleye big CPU core, this adds about 40ns-50ns of overhead (going from
~430ns to ~480ns). I think this is an acceptable performance degradation.

Bug: 131867816

Test: New unit tests pass.
Change-Id: I4c0f4373fb0694bf29c3824dbb1224a8a17e211e
diff --git a/libc/malloc_debug/malloc_debug.cpp b/libc/malloc_debug/malloc_debug.cpp
index 91e1d26..53fcead 100644
--- a/libc/malloc_debug/malloc_debug.cpp
+++ b/libc/malloc_debug/malloc_debug.cpp
@@ -29,6 +29,7 @@
 #include <errno.h>
 #include <inttypes.h>
 #include <malloc.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -103,6 +104,32 @@
 __END_DECLS
 // ------------------------------------------------------------------------
 
+// RAII read lock on one process-wide rwlock. Teardown calls BlockAllOperations().
+class ScopedConcurrentLock {
+ public:
+  ScopedConcurrentLock() { pthread_rwlock_rdlock(&lock_); }
+  ~ScopedConcurrentLock() { pthread_rwlock_unlock(&lock_); }
+
+  // Sets up the shared lock with writer preference.
+  static void Init() {
+    pthread_rwlockattr_t attr;
+    // POSIX requires initializing the attribute object before modifying it.
+    pthread_rwlockattr_init(&attr);
+    // Once a write lock is pending, stop granting new read locks.
+    pthread_rwlockattr_setkind_np(&attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+    pthread_rwlock_init(&lock_, &attr);
+    // The lock keeps its own copy of the settings; release the attribute object.
+    pthread_rwlockattr_destroy(&attr);
+  }
+
+  // Takes the write lock; this class offers no call that releases it again.
+  static void BlockAllOperations() { pthread_rwlock_wrlock(&lock_); }
+
+ private:
+  static pthread_rwlock_t lock_;
+};
+pthread_rwlock_t ScopedConcurrentLock::lock_;
+
 static void InitAtfork() {
   static pthread_once_t atfork_init = PTHREAD_ONCE_INIT;
   pthread_once(&atfork_init, []() {
@@ -257,6 +284,8 @@
     info_log("%s: malloc debug enabled", getprogname());
   }
 
+  ScopedConcurrentLock::Init();
+
   return true;
 }
 
@@ -265,6 +294,10 @@
     return;
   }
 
+  // Make sure that there are no other threads doing debug allocations
+  // before we kill everything.
+  ScopedConcurrentLock::BlockAllOperations();
+
   // Turn off capturing allocations calls.
   DebugDisableSet(true);
 
@@ -292,6 +325,8 @@
 
 void debug_get_malloc_leak_info(uint8_t** info, size_t* overall_size, size_t* info_size,
                                 size_t* total_memory, size_t* backtrace_size) {
+  ScopedConcurrentLock lock;
+
   ScopedDisableDebugCalls disable;
 
   // Verify the arguments.
@@ -325,6 +360,7 @@
   if (DebugCallsDisabled() || pointer == nullptr) {
     return g_dispatch->malloc_usable_size(pointer);
   }
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   if (!VerifyPointer(pointer, "malloc_usable_size")) {
@@ -388,6 +424,7 @@
   if (DebugCallsDisabled()) {
     return g_dispatch->malloc(size);
   }
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   void* pointer = InternalMalloc(size);
@@ -463,6 +500,7 @@
   if (DebugCallsDisabled() || pointer == nullptr) {
     return g_dispatch->free(pointer);
   }
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   if (g_debug->config().options() & RECORD_ALLOCS) {
@@ -480,6 +518,7 @@
   if (DebugCallsDisabled()) {
     return g_dispatch->memalign(alignment, bytes);
   }
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   if (bytes == 0) {
@@ -558,6 +597,7 @@
   if (DebugCallsDisabled()) {
     return g_dispatch->realloc(pointer, bytes);
   }
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   if (pointer == nullptr) {
@@ -676,6 +716,7 @@
   if (DebugCallsDisabled()) {
     return g_dispatch->calloc(nmemb, bytes);
   }
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   size_t size;
@@ -737,6 +778,8 @@
   if (DebugCallsDisabled() || !g_debug->TrackPointers()) {
     return g_dispatch->malloc_info(options, fp);
   }
+  ScopedConcurrentLock lock;
+  ScopedDisableDebugCalls disable;
 
   MallocXmlElem root(fp, "malloc", "version=\"debug-malloc-1\"");
   std::vector<ListInfoType> list;
@@ -786,6 +829,7 @@
 
 int debug_iterate(uintptr_t base, size_t size, void (*callback)(uintptr_t, size_t, void*),
                   void* arg) {
+  ScopedConcurrentLock lock;
   if (g_debug->TrackPointers()) {
     // Since malloc is disabled, don't bother acquiring any locks.
     for (auto it = PointerData::begin(); it != PointerData::end(); ++it) {
@@ -800,6 +844,7 @@
 }
 
 void debug_malloc_disable() {
+  ScopedConcurrentLock lock;
   g_dispatch->malloc_disable();
   if (g_debug->pointer) {
     g_debug->pointer->PrepareFork();
@@ -807,6 +852,7 @@
 }
 
 void debug_malloc_enable() {
+  ScopedConcurrentLock lock;
   if (g_debug->pointer) {
     g_debug->pointer->PostForkParent();
   }
@@ -817,6 +863,7 @@
   if (DebugCallsDisabled() || pointer == nullptr) {
     return 0;
   }
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   if (!(g_debug->config().options() & BACKTRACE)) {
@@ -870,6 +917,7 @@
 }
 
 bool debug_write_malloc_leak_info(FILE* fp) {
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   std::lock_guard<std::mutex> guard(g_dump_lock);
@@ -883,6 +931,7 @@
 }
 
 void debug_dump_heap(const char* file_name) {
+  ScopedConcurrentLock lock;
   ScopedDisableDebugCalls disable;
 
   std::lock_guard<std::mutex> guard(g_dump_lock);