diff --git a/libc/Android.bp b/libc/Android.bp
index 650cfbd..c19cb78 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1225,7 +1225,7 @@
         // so we can't include them in libc_ndk.a.
         "bionic/__cxa_thread_atexit_impl.cpp",
         "bionic/android_unsafe_frame_pointer_chase.cpp",
-        "stdlib/atexit.c",
+        "bionic/atexit.cpp",
         "bionic/fork.cpp",
     ],
 
diff --git a/libc/bionic/atexit.cpp b/libc/bionic/atexit.cpp
new file mode 100644
index 0000000..df306af
--- /dev/null
+++ b/libc/bionic/atexit.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "atexit.h"
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/prctl.h>
+
+#include <async_safe/CHECK.h>
+#include <async_safe/log.h>
+
+#include "platform/bionic/page.h"
+
+extern "C" void __libc_stdio_cleanup();
+extern "C" void __unregister_atfork(void* dso);
+
+namespace {
+
+struct AtexitEntry {
+  void (*fn)(void*);  // the __cxa_atexit callback; nullptr once the slot has been extracted
+  void* arg;          // argument for `fn` callback
+  void* dso;          // shared module handle
+};
+
+class AtexitArray {
+ public:
+  size_t size() const { return size_; }  // slots in use, including extracted (zeroed) ones
+  uint64_t total_appends() const { return total_appends_; }  // successful appends so far
+  const AtexitEntry& operator[](size_t idx) const { return array_[idx]; }  // no bounds check
+
+  bool append_entry(const AtexitEntry& entry);  // false if the array could not grow
+  AtexitEntry extract_entry(size_t idx);        // zeroes slot `idx`, returns its old contents
+  void recompact();                             // drops zeroed slots if that frees a page
+
+ private:  // no constructor: members rely on the static g_array instance starting out zeroed
+  AtexitEntry* array_;      // mmap'ed storage; nullptr until the first append
+  size_t size_;             // number of slots used at the front of array_
+  size_t extracted_count_;  // slots zeroed by extract_entry since the last recompact
+  size_t capacity_;         // number of entries the current mapping can hold
+
+  // An entry can be appended by a __cxa_finalize callback. Track the number of appends so we
+  // restart concurrent __cxa_finalize passes.
+  uint64_t total_appends_;
+
+  static size_t round_up_to_page_bytes(size_t capacity) {
+    return PAGE_END(capacity * sizeof(AtexitEntry));  // entry bytes, rounded up to a page
+  }
+
+  static size_t next_capacity(size_t capacity) {
+    // Double the capacity each time (to at least one entry), then fill out the last page.
+    size_t result = round_up_to_page_bytes(MAX(1, capacity * 2)) / sizeof(AtexitEntry);
+    CHECK(result > capacity);  // growth must make progress
+    return result;
+  }
+
+  // Recompact the array if it will save at least one page of memory at the end.
+  bool needs_recompaction() {
+    return round_up_to_page_bytes(size_ - extracted_count_) < round_up_to_page_bytes(size_);
+  }
+
+  void set_writable(bool writable);  // mprotect wrapper; calls async_safe_fatal on failure
+  bool expand_capacity();            // false if mmap/mremap fails
+};
+
+}  // anonymous namespace
+
+bool AtexitArray::append_entry(const AtexitEntry& entry) {  // false if the array cannot grow
+  bool result = false;
+
+  set_writable(true);  // no-op while array_ is still nullptr
+  if (size_ < capacity_ || expand_capacity()) {  // grow only when the array is full
+    array_[size_++] = entry;
+    ++total_appends_;  // lets in-flight __cxa_finalize passes detect the new entry
+    result = true;
+  }
+  set_writable(false);  // re-protect; this also covers a freshly mapped array
+
+  return result;
+}
+
+// Extract an entry: zero slot `idx` (size_ is unchanged) and return its previous contents.
+AtexitEntry AtexitArray::extract_entry(size_t idx) {
+  AtexitEntry result = array_[idx];
+
+  set_writable(true);
+  array_[idx] = {};  // fn == nullptr now marks the slot as empty
+  ++extracted_count_;  // consulted by needs_recompaction()
+  set_writable(false);
+
+  return result;
+}
+
+void AtexitArray::recompact() {  // squeezes out zeroed slots, but only if that frees a page
+  if (!needs_recompaction()) return;
+
+  set_writable(true);
+
+  // Optimization: quickly skip over the initial non-null entries.
+  size_t src = 0, dst = 0;
+  while (src < size_ && array_[src].fn != nullptr) {
+    ++src;
+    ++dst;
+  }
+
+  // Shift the non-null entries forward, and zero out the removed entries at the end of the array.
+  for (; src < size_; ++src) {
+    const AtexitEntry entry = array_[src];
+    array_[src] = {};  // vacate the slot; a live entry is re-stored at dst below
+    if (entry.fn != nullptr) {
+      array_[dst++] = entry;
+    }
+  }
+
+  // If the table uses fewer pages, clean the pages at the end (madvise result is ignored).
+  size_t old_bytes = round_up_to_page_bytes(size_);
+  size_t new_bytes = round_up_to_page_bytes(dst);
+  if (new_bytes < old_bytes) {
+    madvise(reinterpret_cast<char*>(array_) + new_bytes, old_bytes - new_bytes, MADV_DONTNEED);
+  }
+
+  size_ = dst;  // surviving entries keep their relative order
+  extracted_count_ = 0;  // no zeroed slots remain below size_
+
+  set_writable(false);
+}
+
+// Use mprotect to make the array writable or read-only. Calls async_safe_fatal on failure.
+// Making the array read-only could protect against unintentional or malicious corruption.
+void AtexitArray::set_writable(bool writable) {
+  if (array_ == nullptr) return;  // nothing is mapped yet
+  const int prot = PROT_READ | (writable ? PROT_WRITE : 0);
+  if (mprotect(array_, round_up_to_page_bytes(capacity_), prot) != 0) {
+    async_safe_fatal("mprotect failed on atexit array: %s", strerror(errno));
+  }
+}
+
+bool AtexitArray::expand_capacity() {  // returns false, leaving the array untouched, on failure
+  const size_t new_capacity = next_capacity(capacity_);
+  const size_t new_capacity_bytes = round_up_to_page_bytes(new_capacity);
+
+  void* new_pages;
+  if (array_ == nullptr) {  // first allocation
+    new_pages = mmap(nullptr, new_capacity_bytes, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  } else {  // grow the existing mapping; the kernel may relocate it
+    new_pages =
+        mremap(array_, round_up_to_page_bytes(capacity_), new_capacity_bytes, MREMAP_MAYMOVE);
+  }
+  if (new_pages == MAP_FAILED) {
+    async_safe_format_log(ANDROID_LOG_WARN, "libc",
+                          "__cxa_atexit: mmap/mremap failed to allocate %zu bytes: %s",
+                          new_capacity_bytes, strerror(errno));
+    return false;
+  }
+
+  prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, new_pages, new_capacity_bytes, "atexit handlers");
+  array_ = static_cast<AtexitEntry*>(new_pages);
+  capacity_ = new_capacity;
+  return true;
+}
+
+static AtexitArray g_array;  // static storage: starts zeroed (empty, nothing mapped)
+static pthread_mutex_t g_atexit_lock = PTHREAD_MUTEX_INITIALIZER;  // guards g_array
+
+static inline void atexit_lock() {
+  pthread_mutex_lock(&g_atexit_lock);
+}
+
+static inline void atexit_unlock() {
+  pthread_mutex_unlock(&g_atexit_lock);
+}
+
+// Register a function to be called either when a library is unloaded (dso != nullptr), or when the
+// program exits (dso == nullptr). The `dso` argument is typically the address of a hidden
+// __dso_handle variable. This function is also used as the backend for the atexit function.
+//
+// See https://itanium-cxx-abi.github.io/cxx-abi/abi.html#dso-dtor.
+// Returns 0 on success, or -1 if `func` is null or the entry could not be appended.
+int __cxa_atexit(void (*func)(void*), void* arg, void* dso) {
+  int result = -1;
+
+  if (func != nullptr) {
+    atexit_lock();  // g_array is only touched with g_atexit_lock held
+    if (g_array.append_entry({.fn = func, .arg = arg, .dso = dso})) {
+      result = 0;
+    }
+    atexit_unlock();
+  }
+
+  return result;
+}
+
+void __cxa_finalize(void* dso) {  // runs handlers for `dso` (every handler if nullptr)
+  atexit_lock();
+
+  static uint32_t call_depth = 0;  // active __cxa_finalize calls; protected by g_atexit_lock
+  ++call_depth;
+
+restart:
+  const uint64_t total_appends = g_array.total_appends();  // snapshot to detect new handlers
+
+  for (ssize_t i = g_array.size() - 1; i >= 0; --i) {  // newest registrations run first
+    if (g_array[i].fn == nullptr || (dso != nullptr && g_array[i].dso != dso)) continue;
+
+    // Clear the entry in the array because its DSO handle will become invalid, and to avoid calling
+    // an entry again if __cxa_finalize is called recursively.
+    const AtexitEntry entry = g_array.extract_entry(i);
+
+    atexit_unlock();  // the handler runs unlocked, so it may call __cxa_atexit/__cxa_finalize
+    entry.fn(entry.arg);
+    atexit_lock();
+
+    if (g_array.total_appends() != total_appends) goto restart;  // rescan from the end
+  }
+
+  // Avoid recompaction on recursive calls because it's unnecessary and would require earlier,
+  // concurrent __cxa_finalize calls to restart. Skip recompaction on program exit too
+  // (dso == nullptr), because the memory will be reclaimed soon anyway.
+  --call_depth;
+  if (call_depth == 0 && dso != nullptr) {
+    g_array.recompact();
+  }
+
+  atexit_unlock();
+
+  if (dso != nullptr) {
+    __unregister_atfork(dso);
+  } else {
+    // If called via exit(), flush output of all open files.
+    __libc_stdio_cleanup();
+  }
+}
diff --git a/libc/stdlib/atexit.h b/libc/bionic/atexit.h
similarity index 90%
rename from libc/stdlib/atexit.h
rename to libc/bionic/atexit.h
index e01bb34..8abcdc5 100644
--- a/libc/stdlib/atexit.h
+++ b/libc/bionic/atexit.h
@@ -30,5 +30,13 @@
  *
  */
 
-int	__cxa_atexit(void (*)(void *), void *, void *);
-void	__cxa_finalize(void *);
+#pragma once
+
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+int __cxa_atexit(void (*)(void*), void*, void*);
+void __cxa_finalize(void*);
+
+__END_DECLS
diff --git a/libc/bionic/malloc_heapprofd.cpp b/libc/bionic/malloc_heapprofd.cpp
index a1b7c07..51becf0 100644
--- a/libc/bionic/malloc_heapprofd.cpp
+++ b/libc/bionic/malloc_heapprofd.cpp
@@ -48,48 +48,104 @@
 #include "malloc_heapprofd.h"
 #include "malloc_limit.h"
 
+// Installing heapprofd hooks is a multi step process, as outlined below.
+//
+// The incremental hooking and a dedicated task thread are used since we cannot
+// do heavy work within a signal handler, or when blocking a malloc invocation.
+//
+// +--->+-------------+------------------+
+// | +->+kInitialState+----------------+ |  malloc functions are not intercepted in any way.
+// | |  +-------+-----+                | |
+// | |          |                      | |
+// | |          v                      | |
+// | |  +-------+----------------+     | |  currently installing the ephemeral hooks.
+// | |  |kInstallingEphemeralHook|<--+ | |
+// | |  +-------+----------------+   | | |
+// | |          |                    | | |
+// | |          v                    | | |
+// | |  +-------+---------------+    | | |  ephemeral hooks are installed. on the first call to
+// | |  |kEphemeralHookInstalled|    | | |  malloc these hooks spawn a thread that installs the
+// | |  +-------+---------------+    | | |  heapprofd hooks.
+// | |          |                    | | |
+// | |          v                    | | |
+// | |  +-------+--------------+     | | |  first call to malloc happened. the hooks are reset to
+// | +--|kRemovingEphemeralHook|     | | |  kInitialState.
+// |    +----------------------+     | | |
+// |                                 | | |
+// |                                 | | |
+// |    +---------------+            | | |  currently installing the heapprofd hook
+// |    |kInstallingHook|<-----------|-+ |
+// |    +-------+-------+            |   |
+// |            |                    |   |
+// |            v                    |   |
+// |    +-------+------+             |   |  heapprofd hooks are installed. these forward calls to
+// |    |kHookInstalled|-------------+   |  malloc / free / etc. to heapprofd_client.so.
+// |    +-------+------+                 |
+// |            |                        |
+// |            v                        |
+// |    +-------+---------+              |  currently resetting the hooks to default.
+// |----+kUninstallingHook|              |
+//      +-----------------+              |
+//                                       |
+//                                       |
+//      +------------------+             |  malloc debug / malloc hooks are active. these take
+//      |kIncompatibleHooks+<------------+  precedence over heapprofd, so heapprofd will not get
+//      +------------------+                enabled. this is a terminal state.
+//
+enum MallocHeapprofdState : uint8_t {
+  kInitialState,             // malloc functions are not intercepted
+  kInstallingEphemeralHook,  // ephemeral hooks are being installed
+  kEphemeralHookInstalled,   // next malloc spawns the thread that installs the real hooks
+  kRemovingEphemeralHook,    // first malloc happened; hooks are being reset
+  kInstallingHook,           // heapprofd hooks are being installed
+  kHookInstalled,            // calls are forwarded to heapprofd_client.so
+  kUninstallingHook,         // hooks are being reset to default
+  kIncompatibleHooks         // malloc debug / malloc hooks active; terminal state
+};
+
+enum ModifyGlobalsMode {
+  kWithLock,   // takes gGlobalsMutateLock, so calls with kWithLock serialise. they can still
+               // fail due to a concurrent call with kWithoutLock.
+  kWithoutLock // never blocks; calls with kWithoutLock do not serialise. they can fail
+               // due to concurrent calls with kWithoutLock or kWithLock.
+};
+
+// Run `f` with exclusive access to the globals; returns false (f not run) on contention.
+template <typename Fn>
+bool MaybeModifyGlobals(ModifyGlobalsMode mode, Fn f) {
+  bool success = false;
+  if (mode == kWithLock) {
+    pthread_mutex_lock(&gGlobalsMutateLock);
+  }
+  // With kWithLock we hold the mutex, so the following condition should always hold, except
+  // while a kWithoutLock caller such as HandleHeapprofdSignal is running.
+  if (!atomic_exchange(&gGlobalsMutating, true)) {
+    f();  // runs while gGlobalsMutating is set
+    success = true;
+    atomic_store(&gGlobalsMutating, false);
+  } else {
+    error_log("%s: heapprofd client: concurrent modification.", getprogname());
+  }
+  if (mode == kWithLock) {
+    pthread_mutex_unlock(&gGlobalsMutateLock);
+  }
+  return success;
+}
+
+extern "C" void* MallocInitHeapprofdHook(size_t);
+
 static constexpr char kHeapprofdSharedLib[] = "heapprofd_client.so";
 static constexpr char kHeapprofdPrefix[] = "heapprofd";
 static constexpr char kHeapprofdPropertyEnable[] = "heapprofd.enable";
 
-// The logic for triggering heapprofd (at runtime) is as follows:
-// 1. A reserved profiling signal is received by the process, its si_value
-//    discriminating between different handlers. For the case of heapprofd,
-//    HandleHeapprofdSignal is called.
-// 2. If the initialization is not already in flight
-//    (gHeapprofdInitInProgress is false), the malloc hook is set to
-//    point at InitHeapprofdHook, and gHeapprofdInitInProgress is set to
-//    true.
-// 3. The next malloc call enters InitHeapprofdHook, which removes the malloc
-//    hook, and spawns a detached pthread to run the InitHeapprofd task.
-//    (gHeapprofdInitHookInstalled atomic is used to perform this once.)
-// 4. InitHeapprofd, on a dedicated pthread, loads the heapprofd client library,
-//    installs the full set of heapprofd hooks, and invokes the client's
-//    initializer. The dedicated pthread then terminates.
-// 5. gHeapprofdInitInProgress and gHeapprofdInitHookInstalled are
-//    reset to false such that heapprofd can be reinitialized. Reinitialization
-//    means that a new profiling session is started, and any still active is
-//    torn down.
-//
-// The incremental hooking and a dedicated task thread are used since we cannot
-// do heavy work within a signal handler, or when blocking a malloc invocation.
+constexpr char kHeapprofdProgramPropertyPrefix[] = "heapprofd.enable.";
+constexpr size_t kHeapprofdProgramPropertyPrefixSize = sizeof(kHeapprofdProgramPropertyPrefix) - 1;
+constexpr size_t kMaxCmdlineSize = 512;
 
 // The handle returned by dlopen when previously loading the heapprofd
 // hooks. nullptr if shared library has not been already been loaded.
 static _Atomic (void*) gHeapprofdHandle = nullptr;
-
-static _Atomic bool gHeapprofdInitInProgress = false;
-static _Atomic bool gHeapprofdInitHookInstalled = false;
-
-// Set to true if the process has enabled malloc_debug or malloc_hooks, which
-// are incompatible (and take precedence over) heapprofd.
-static _Atomic bool gHeapprofdIncompatibleHooks = false;
-
-extern "C" void* MallocInitHeapprofdHook(size_t);
-
-constexpr char kHeapprofdProgramPropertyPrefix[] = "heapprofd.enable.";
-constexpr size_t kHeapprofdProgramPropertyPrefixSize = sizeof(kHeapprofdProgramPropertyPrefix) - 1;
-constexpr size_t kMaxCmdlineSize = 512;
+static _Atomic MallocHeapprofdState gHeapprofdState = kInitialState;
 
 static bool GetHeapprofdProgramProperty(char* data, size_t size) {
   if (size < kHeapprofdProgramPropertyPrefixSize) {
@@ -157,22 +213,29 @@
 // Previously installed default dispatch table, if it exists. This is used to
 // load heapprofd properly when GWP-ASan was already installed. If GWP-ASan was
 // already installed, heapprofd will take over the dispatch table, but will use
-// GWP-ASan as the backing dispatch. This variable is atomically protected by
-// gHeapprofdInitInProgress.
-static const MallocDispatch* gPreviousDefaultDispatchTable = nullptr;
+// GWP-ASan as the backing dispatch. Writes to this variable are serialised by
+// MaybeModifyGlobals.
+// Reads are not protected, which is why this is atomic: we cannot fail the call
+// in MallocInitHeapprofdHook.
+static _Atomic (const MallocDispatch*) gPreviousDefaultDispatchTable = nullptr;
 static MallocDispatch gEphemeralDispatch;
 
 void HandleHeapprofdSignal() {
-  if (atomic_load_explicit(&gHeapprofdIncompatibleHooks, memory_order_acquire)) {
+  if (atomic_load(&gHeapprofdState) == kIncompatibleHooks) {
     error_log("%s: not enabling heapprofd, malloc_debug/malloc_hooks are enabled.", getprogname());
     return;
   }
 
-  // Checking this variable is only necessary when this could conflict with
-  // the change to enable the allocation limit. All other places will
-  // not ever have a conflict modifying the globals.
-  if (!atomic_exchange(&gGlobalsMutating, true)) {
-    if (!atomic_exchange(&gHeapprofdInitInProgress, true)) {
+  // We cannot grab the mutex here, as this is used in a signal handler.
+  MaybeModifyGlobals(kWithoutLock, [] {
+    MallocHeapprofdState expected = kInitialState;
+    // If hooks are already installed, we still want to install ephemeral hooks to retrigger
+    // heapprofd client initialization.
+    MallocHeapprofdState expected2 = kHookInstalled;
+    if (atomic_compare_exchange_strong(&gHeapprofdState, &expected,
+          kInstallingEphemeralHook) ||
+        atomic_compare_exchange_strong(&gHeapprofdState, &expected2,
+          kInstallingEphemeralHook)) {
       const MallocDispatch* default_dispatch = GetDefaultDispatchTable();
 
       // Below, we initialize heapprofd lazily by redirecting libc's malloc() to
@@ -185,7 +248,7 @@
         //  1. No malloc hooking has been done (heapprofd, GWP-ASan, etc.). In
         //  this case, everything but malloc() should come from the system
         //  allocator.
-        gPreviousDefaultDispatchTable = nullptr;
+        atomic_store(&gPreviousDefaultDispatchTable, nullptr);
         gEphemeralDispatch = *NativeAllocatorDispatch();
       } else if (DispatchIsGwpAsan(default_dispatch)) {
         //  2. GWP-ASan was installed. We should use GWP-ASan for everything but
@@ -193,7 +256,7 @@
         //  installed. After heapprofd is finished installing, we will use
         //  GWP-ASan as heapprofd's backing allocator to allow heapprofd and
         //  GWP-ASan to coexist.
-        gPreviousDefaultDispatchTable = default_dispatch;
+        atomic_store(&gPreviousDefaultDispatchTable, default_dispatch);
         gEphemeralDispatch = *default_dispatch;
       } else {
         // 3. It may be possible at this point in time that heapprofd is
@@ -203,7 +266,7 @@
         // We've checked that no other malloc interceptors are being used by
         // validating `gHeapprofdIncompatibleHooks` above, so we don't need to
         // worry about that case here.
-        gPreviousDefaultDispatchTable = nullptr;
+        atomic_store(&gPreviousDefaultDispatchTable, nullptr);
         gEphemeralDispatch = *NativeAllocatorDispatch();
       }
 
@@ -218,9 +281,12 @@
           atomic_store(&globals->current_dispatch_table, &gEphemeralDispatch);
         }
       });
+      atomic_store(&gHeapprofdState, kEphemeralHookInstalled);
+    } else {
+      error_log("%s: heapprofd: failed to transition kInitialState -> kInstallingEphemeralHook. "
+          "current state (possible race): %d", getprogname(), expected2);
     }
-    atomic_store(&gGlobalsMutating, false);
-  }
+  });
   // Otherwise, we're racing against malloc_limit's enable logic (at most once
   // per process, and a niche feature). This is highly unlikely, so simply give
   // up if it does happen.
@@ -250,7 +316,7 @@
 }
 
 void HeapprofdRememberHookConflict() {
-  atomic_store_explicit(&gHeapprofdIncompatibleHooks, true, memory_order_release);
+  atomic_store(&gHeapprofdState, kIncompatibleHooks);  // terminal state: heapprofd stays off
 }
 
 static void CommonInstallHooks(libc_globals* globals) {
@@ -268,68 +334,81 @@
   // Before we set the new default_dispatch_table in FinishInstallHooks, save
   // the previous dispatch table. If DispatchReset() gets called later, we want
   // to be able to restore the dispatch. We're still under
-  // gHeapprofdInitInProgress locks at this point.
-  gPreviousDefaultDispatchTable = GetDefaultDispatchTable();
+  // MaybeModifyGlobals locks at this point.
+  atomic_store(&gPreviousDefaultDispatchTable, GetDefaultDispatchTable());
 
   if (FinishInstallHooks(globals, nullptr, kHeapprofdPrefix)) {
     atomic_store(&gHeapprofdHandle, impl_handle);
   } else if (!reusing_handle) {
     dlclose(impl_handle);
   }
-
-  atomic_store(&gHeapprofdInitInProgress, false);
 }
 
 void HeapprofdInstallHooksAtInit(libc_globals* globals) {
-  if (atomic_exchange(&gHeapprofdInitInProgress, true)) {
-    return;
-  }
-  CommonInstallHooks(globals);
+  MaybeModifyGlobals(kWithoutLock, [globals] {  // does not take gGlobalsMutateLock
+    MallocHeapprofdState expected = kInitialState;
+    if (atomic_compare_exchange_strong(&gHeapprofdState, &expected, kInstallingHook)) {
+      CommonInstallHooks(globals);
+      atomic_store(&gHeapprofdState, kHookInstalled);
+    } else {  // the failed CAS left the actual state in `expected`
+      error_log("%s: heapprofd: failed to transition kInitialState -> kInstallingHook. "
+          "current state (possible race): %d", getprogname(), expected);
+    }
+  });
 }
 
 static void* InitHeapprofd(void*) {
-  pthread_mutex_lock(&gGlobalsMutateLock);
-  __libc_globals.mutate([](libc_globals* globals) {
-    CommonInstallHooks(globals);
+  MaybeModifyGlobals(kWithLock, [] {  // thread entry point started by MallocInitHeapprofdHook
+    MallocHeapprofdState expected = kInitialState;
+    if (atomic_compare_exchange_strong(&gHeapprofdState, &expected, kInstallingHook)) {
+      __libc_globals.mutate([](libc_globals* globals) {
+        CommonInstallHooks(globals);
+      });
+      atomic_store(&gHeapprofdState, kHookInstalled);
+    } else {  // the failed CAS left the actual state in `expected`
+      error_log("%s: heapprofd: failed to transition kInitialState -> kInstallingHook. "
+          "current state (possible race): %d", getprogname(), expected);
+    }
   });
-  pthread_mutex_unlock(&gGlobalsMutateLock);
-
-  // Allow to install hook again to re-initialize heap profiling after the
-  // current session finished.
-  atomic_store(&gHeapprofdInitHookInstalled, false);
   return nullptr;
 }
 
 extern "C" void* MallocInitHeapprofdHook(size_t bytes) {
-  if (!atomic_exchange(&gHeapprofdInitHookInstalled, true)) {
-    pthread_mutex_lock(&gGlobalsMutateLock);
-    __libc_globals.mutate([](libc_globals* globals) {
-      atomic_store(&globals->default_dispatch_table, gPreviousDefaultDispatchTable);
-      if (!MallocLimitInstalled()) {
-        atomic_store(&globals->current_dispatch_table, gPreviousDefaultDispatchTable);
-      }
-    });
-    pthread_mutex_unlock(&gGlobalsMutateLock);
+  MaybeModifyGlobals(kWithLock, [] {  // only the thread that wins the CAS below does the work
+    MallocHeapprofdState expected = kEphemeralHookInstalled;
+    if (atomic_compare_exchange_strong(&gHeapprofdState, &expected, kRemovingEphemeralHook)) {
+      __libc_globals.mutate([](libc_globals* globals) {
+        const MallocDispatch* previous_dispatch = atomic_load(&gPreviousDefaultDispatchTable);
+        atomic_store(&globals->default_dispatch_table, previous_dispatch);
+        if (!MallocLimitInstalled()) {
+          atomic_store(&globals->current_dispatch_table, previous_dispatch);
+        }
+      });
+      atomic_store(&gHeapprofdState, kInitialState);  // so InitHeapprofd's CAS can succeed

-    pthread_t thread_id;
-    if (pthread_create(&thread_id, nullptr, InitHeapprofd, nullptr) != 0) {
-      error_log("%s: heapprofd: failed to pthread_create.", getprogname());
-    } else if (pthread_detach(thread_id) != 0) {
-      error_log("%s: heapprofd: failed to pthread_detach", getprogname());
+      pthread_t thread_id;
+      if (pthread_create(&thread_id, nullptr, InitHeapprofd, nullptr) != 0) {
+        error_log("%s: heapprofd: failed to pthread_create.", getprogname());
+      } else if (pthread_setname_np(thread_id, "heapprofdinit") != 0) {
+        error_log("%s: heapprod: failed to pthread_setname_np", getprogname());
+      } else if (pthread_detach(thread_id) != 0) {  // NOTE(review): skipped when setname fails
+        error_log("%s: heapprofd: failed to pthread_detach", getprogname());
+      }
+    } else {
+      warning_log("%s: heapprofd: could not transition kEphemeralHookInstalled -> "
+          "kRemovingEphemeralHook. current state (possible race): %d. this can be benign "
+          "if two threads try this transition at the same time", getprogname(),
+          expected);
     }
-    if (pthread_setname_np(thread_id, "heapprofdinit") != 0) {
-      error_log("%s: heapprod: failed to pthread_setname_np", getprogname());
-    }
-  }
+  });
   // If we had a previous dispatch table, use that to service the allocation,
   // otherwise fall back to the native allocator.
-  // `gPreviousDefaultDispatchTable` won't change underneath us, as it's
-  // protected by the `gHeapProfdInitInProgress` lock (which we currently hold).
-  // The lock was originally taken by our caller in `HandleHeapprofdSignal()`,
-  // and will be released by `CommonInstallHooks()` via. our `InitHeapprofd()`
-  // thread that we just created.
-  if (gPreviousDefaultDispatchTable) {
-    return gPreviousDefaultDispatchTable->malloc(bytes);
+  // This could be modified by a concurrent HandleHeapprofdSignal, but that is
+  // benign as we will dispatch to the ephemeral handler, which will then dispatch
+  // to the underlying one.
+  const MallocDispatch* previous_dispatch = atomic_load(&gPreviousDefaultDispatchTable);
+  if (previous_dispatch) {
+    return previous_dispatch->malloc(bytes);
   }
   return NativeAllocatorDispatch()->malloc(bytes);
 }
@@ -345,20 +424,34 @@
 }
 
 static bool DispatchReset() {
-  if (!atomic_exchange(&gHeapprofdInitInProgress, true)) {
-    pthread_mutex_lock(&gGlobalsMutateLock);
-    __libc_globals.mutate([](libc_globals* globals) {
-      atomic_store(&globals->default_dispatch_table, gPreviousDefaultDispatchTable);
-      if (!MallocLimitInstalled()) {
-        atomic_store(&globals->current_dispatch_table, gPreviousDefaultDispatchTable);
-      }
-    });
-    pthread_mutex_unlock(&gGlobalsMutateLock);
-    atomic_store(&gHeapprofdInitInProgress, false);
+  if (atomic_load(&gHeapprofdState) == kInitialState) {  // no hooks installed: nothing to reset
     return true;
   }
-  errno = EAGAIN;
-  return false;
+
+  bool success = false;
+  MaybeModifyGlobals(kWithLock, [&success] {
+    MallocHeapprofdState expected = kHookInstalled;
+
+    if(atomic_compare_exchange_strong(&gHeapprofdState, &expected, kUninstallingHook)){
+      __libc_globals.mutate([](libc_globals* globals) {
+        const MallocDispatch* previous_dispatch = atomic_load(&gPreviousDefaultDispatchTable);
+        atomic_store(&globals->default_dispatch_table, previous_dispatch);
+        if (!MallocLimitInstalled()) {
+          atomic_store(&globals->current_dispatch_table, previous_dispatch);
+        }
+      });
+      atomic_store(&gHeapprofdState, kInitialState);
+      success = true;
+    } else {  // the failed CAS left the actual state in `expected`
+      error_log("%s: heapprofd: failed to transition kHookInstalled -> kUninstallingHook. "
+          "current state (possible race): %d", getprogname(),
+          expected);
+    }
+  });
+  if (!success) {  // covers both a lost state transition and MaybeModifyGlobals contention
+    errno = EAGAIN;
+  }
+  return success;
 }
 
 bool HeapprofdMallopt(int opcode, void* arg, size_t arg_size) {
diff --git a/libc/stdlib/atexit.c b/libc/stdlib/atexit.c
deleted file mode 100644
index 0efb118..0000000
--- a/libc/stdlib/atexit.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*	$OpenBSD: atexit.c,v 1.20 2014/07/11 09:51:37 kettenis Exp $ */
-/*
- * Copyright (c) 2002 Daniel Hartmeier
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *    - Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    - Redistributions in binary form must reproduce the above
- *      copyright notice, this list of conditions and the following
- *      disclaimer in the documentation and/or other materials provided
- *      with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "atexit.h"
-
-#include <pthread.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/prctl.h>
-#include <sys/types.h>
-
-static pthread_mutex_t g_atexit_lock = PTHREAD_MUTEX_INITIALIZER;
-#define _ATEXIT_LOCK() pthread_mutex_lock(&g_atexit_lock)
-#define _ATEXIT_UNLOCK() pthread_mutex_unlock(&g_atexit_lock)
-
-struct atexit {
-	struct atexit *next;		/* next in list */
-	int ind;			/* next index in this table */
-	int max;			/* max entries >= ATEXIT_SIZE */
-	struct atexit_fn {
-		void (*fn_ptr)(void *);
-		void *fn_arg;		/* argument for CXA callback */
-		void *fn_dso;		/* shared module handle */
-	} fns[1];			/* the table itself */
-};
-
-static struct atexit *__atexit;
-static int restartloop;
-
-/* BEGIN android-changed: __unregister_atfork is used by __cxa_finalize */
-extern void __unregister_atfork(void* dso);
-/* END android-changed */
-
-/*
- * Function pointers are stored in a linked list of pages. The list
- * is initially empty, and pages are allocated on demand. The first
- * function pointer in the first allocated page (the last one in
- * the linked list) is reserved for the cleanup function.
- *
- * Outside the following functions, all pages are mprotect()'ed
- * to prevent unintentional/malicious corruption.
- */
-
-/*
- * Register a function to be performed at exit or when a shared object
- * with the given dso handle is unloaded dynamically.  Also used as
- * the backend for atexit().  For more info on this API, see:
- *
- *	http://www.codesourcery.com/cxx-abi/abi.html#dso-dtor
- */
-int
-__cxa_atexit(void (*func)(void *), void *arg, void *dso)
-{
-	struct atexit_fn *fnp;
-	size_t pgsize = getpagesize();
-	int ret = -1;
-
-	if (pgsize < sizeof(struct atexit))
-		return (-1);
-	_ATEXIT_LOCK();
-	struct atexit *p = __atexit;
-	if (p != NULL) {
-		if (p->ind + 1 >= p->max)
-			p = NULL;
-		else if (mprotect(p, pgsize, PROT_READ | PROT_WRITE))
-			goto unlock;
-	}
-	if (p == NULL) {
-		p = mmap(NULL, pgsize, PROT_READ | PROT_WRITE,
-		    MAP_ANON | MAP_PRIVATE, -1, 0);
-		if (p == MAP_FAILED)
-			goto unlock;
-/* BEGIN android-changed */
-		prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p, pgsize,
-		    "atexit handlers");
-/* END android-changed */
-		if (__atexit == NULL) {
-			memset(&p->fns[0], 0, sizeof(p->fns[0]));
-			p->ind = 1;
-		} else
-			p->ind = 0;
-		p->max = (pgsize - ((char *)&p->fns[0] - (char *)p)) /
-		    sizeof(p->fns[0]);
-		p->next = __atexit;
-		__atexit = p;
-	}
-	fnp = &p->fns[p->ind++];
-	fnp->fn_ptr = func;
-	fnp->fn_arg = arg;
-	fnp->fn_dso = dso;
-	if (mprotect(p, pgsize, PROT_READ))
-		goto unlock;
-	restartloop = 1;
-	ret = 0;
-unlock:
-	_ATEXIT_UNLOCK();
-	return (ret);
-}
-
-/*
- * Call all handlers registered with __cxa_atexit() for the shared
- * object owning 'dso'.
- * Note: if 'dso' is NULL, then all remaining handlers are called.
- */
-void
-__cxa_finalize(void *dso)
-{
-	struct atexit *p, *q;
-	struct atexit_fn fn;
-	int n, pgsize = getpagesize();
-	static int call_depth;
-
-	_ATEXIT_LOCK();
-	call_depth++;
-
-restart:
-	restartloop = 0;
-	for (p = __atexit; p != NULL; p = p->next) {
-		for (n = p->ind; --n >= 0;) {
-			if (p->fns[n].fn_ptr == NULL)
-				continue;	/* already called */
-			if (dso != NULL && dso != p->fns[n].fn_dso)
-				continue;	/* wrong DSO */
-
-			/*
-			 * Mark handler as having been already called to avoid
-			 * dupes and loops, then call the appropriate function.
-			 */
-			fn = p->fns[n];
-			if (mprotect(p, pgsize, PROT_READ | PROT_WRITE) == 0) {
-				p->fns[n].fn_ptr = NULL;
-				mprotect(p, pgsize, PROT_READ);
-			}
-			_ATEXIT_UNLOCK();
-			(*fn.fn_ptr)(fn.fn_arg);
-			_ATEXIT_LOCK();
-			if (restartloop)
-				goto restart;
-		}
-	}
-
-	call_depth--;
-
-	/*
-	 * If called via exit(), unmap the pages since we have now run
-	 * all the handlers.  We defer this until calldepth == 0 so that
-	 * we don't unmap things prematurely if called recursively.
-	 */
-	if (dso == NULL && call_depth == 0) {
-		for (p = __atexit; p != NULL; ) {
-			q = p;
-			p = p->next;
-			munmap(q, pgsize);
-		}
-		__atexit = NULL;
-	}
-	_ATEXIT_UNLOCK();
-
-	/* If called via exit(), flush output of all open files. */
-	if (dso == NULL) {
-		extern void __libc_stdio_cleanup(void);
-		__libc_stdio_cleanup();
-	}
-
-  /* BEGIN android-changed: call __unregister_atfork if dso is not null */
-  if (dso != NULL) {
-    __unregister_atfork(dso);
-  }
-  /* END android-changed */
-}
diff --git a/libdl/Android.bp b/libdl/Android.bp
index 8e3a3fc..f431e84 100644
--- a/libdl/Android.bp
+++ b/libdl/Android.bp
@@ -29,6 +29,11 @@
     sanitize: {
         never: true,
     },
+
+    apex_available: [
+        "//apex_available:platform",
+        "com.android.runtime",
+    ],
 }
 
 cc_library {
diff --git a/libdl/libdl.cpp b/libdl/libdl.cpp
index 1ee4012..a56a5ab 100644
--- a/libdl/libdl.cpp
+++ b/libdl/libdl.cpp
@@ -138,16 +138,4 @@
   return __loader_android_get_application_target_sdk_version();
 }
 
-#if defined(__arm__)
-// An arm32 unwinding table has an R_ARM_NONE relocation to
-// __aeabi_unwind_cpp_pr0. This shared library will never invoke the unwinder,
-// so it doesn't actually need the routine. Define a dummy version here,
-// because the real version calls libc functions (e.g. memcpy, abort), which
-// would create a dependency cycle with libc.so.
-__attribute__((visibility("hidden")))
-void __aeabi_unwind_cpp_pr0() {
-  __builtin_trap();
-}
-#endif
-
 } // extern "C"
diff --git a/libdl/libdl_android.cpp b/libdl/libdl_android.cpp
index 77b3bf8..47a164a 100644
--- a/libdl/libdl_android.cpp
+++ b/libdl/libdl_android.cpp
@@ -115,16 +115,4 @@
   return __loader_android_get_exported_namespace(name);
 }
 
-#if defined(__arm__)
-// An arm32 unwinding table has an R_ARM_NONE relocation to
-// __aeabi_unwind_cpp_pr0. This shared library will never invoke the unwinder,
-// so it doesn't actually need the routine. Define a dummy version here,
-// because the real version calls libc functions (e.g. memcpy, abort), which
-// would create a dependency cycle with libc.so.
-__attribute__((visibility("hidden")))
-void __aeabi_unwind_cpp_pr0() {
-  __builtin_trap();
-}
-#endif
-
 } // extern "C"
diff --git a/linker/Android.bp b/linker/Android.bp
index 4be080b..3190870 100644
--- a/linker/Android.bp
+++ b/linker/Android.bp
@@ -98,6 +98,7 @@
     static_libs: [
         "libziparchive",
         "libbase",
+        "libdl", // libbase uses dlsym
         "libz",
 
         "libasync_safe",
diff --git a/linker/ld_android.cpp b/linker/ld_android.cpp
index 152c2e2..0239c30 100644
--- a/linker/ld_android.cpp
+++ b/linker/ld_android.cpp
@@ -58,15 +58,3 @@
 __strong_alias(__loader_dl_unwind_find_exidx, __internal_linker_error);
 #endif
 __strong_alias(rtld_db_dlactivity, __internal_linker_error);
-
-#if defined(__arm__)
-// An arm32 unwinding table has an R_ARM_NONE relocation to
-// __aeabi_unwind_cpp_pr0. This shared library will never invoke the unwinder,
-// so it doesn't actually need the routine. Define a dummy version here,
-// because the real version calls libc functions (e.g. memcpy, abort), which
-// would create a dependency cycle with libc.so.
-__attribute__((visibility("hidden")))
-extern "C" void __aeabi_unwind_cpp_pr0() {
-  __builtin_trap();
-}
-#endif
diff --git a/tests/Android.bp b/tests/Android.bp
index 4bd96ad..8b1eebc 100644
--- a/tests/Android.bp
+++ b/tests/Android.bp
@@ -79,6 +79,7 @@
     defaults: ["bionic_tests_defaults"],
     srcs: [
         "__aeabi_read_tp_test.cpp",
+        "__cxa_atexit_test.cpp",
         "alloca_test.cpp",
         "android_get_device_api_level.cpp",
         "arpa_inet_test.cpp",
diff --git a/tests/__cxa_atexit_test.cpp b/tests/__cxa_atexit_test.cpp
new file mode 100644
index 0000000..6a122d1
--- /dev/null
+++ b/tests/__cxa_atexit_test.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <cxxabi.h>
+#include <gtest/gtest.h>
+
+TEST(__cxa_atexit, simple) {
+  int counter = 0;  // Also serves as the fake DSO handle given to __cxa_finalize.
+  auto increment = [](void* arg) { ++*static_cast<int*>(arg); };
+  // Fail right here if registration itself fails, rather than on a confusing count below.
+  ASSERT_EQ(0, __cxxabiv1::__cxa_atexit(increment, &counter, &counter));
+
+  __cxxabiv1::__cxa_finalize(&counter);
+  ASSERT_EQ(counter, 1);
+
+  // A second finalize of the same handle must not run the handler again.
+  __cxxabiv1::__cxa_finalize(&counter);
+  ASSERT_EQ(counter, 1);
+}
+
+TEST(__cxa_atexit, order) {
+  // Static so the captureless handler can reach it; cleared so a repeated run starts empty.
+  static std::vector<int> actual;
+  actual.clear();
+  char handles[2];  // Only the element addresses are used, as two distinct fake DSO handles.
+
+  auto append_to_actual = [](void* arg) {
+    int* idx = static_cast<int*>(arg);
+    actual.push_back(*idx);
+    delete idx;
+  };
+
+  for (int i = 0; i < 500; ++i) {
+    ASSERT_EQ(0, __cxxabiv1::__cxa_atexit(append_to_actual, new int{i}, &handles[i % 2]));
+  }
+  __cxxabiv1::__cxa_finalize(&handles[0]);
+
+  // Register more handlers for handles[1] after handles[0] has already been finalized.
+  for (int i = 500; i < 750; ++i) {
+    ASSERT_EQ(0, __cxxabiv1::__cxa_atexit(append_to_actual, new int{i}, &handles[1]));
+  }
+  __cxxabiv1::__cxa_finalize(&handles[1]);
+
+  std::vector<int> expected;
+  for (int i = 498; i >= 0; i -= 2) expected.push_back(i);  // handles[0]: evens, newest first.
+  for (int i = 749; i >= 500; --i) expected.push_back(i);   // handles[1]: late batch first...
+  for (int i = 499; i >= 1; i -= 2) expected.push_back(i);  // ...then its original odd entries.
+
+  ASSERT_EQ(expected.size(), actual.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(expected[i], actual[i]) << "index=" << i;
+  }
+}
