Merge changes from topics "crashapi2", "crashapi3" into main

* changes:
  header and android_crash_detail prefix for crash_detail API
  Allow to re-use the same crash_detail.
diff --git a/android-changes-for-ndk-developers.md b/android-changes-for-ndk-developers.md
index ef5d34d..26e57b6 100644
--- a/android-changes-for-ndk-developers.md
+++ b/android-changes-for-ndk-developers.md
@@ -227,7 +227,7 @@
 information using widely-available tools.)
 
 ```
-$ readelf --header libBroken.so | grep 'section headers'
+$ readelf --headers libBroken.so | grep 'section headers'
   Start of section headers:          0 (bytes into file)
   Size of section headers:           0 (bytes)
   Number of section headers:         0
diff --git a/docs/32-bit-abi.md b/docs/32-bit-abi.md
index 3be6b1a..7a96e2f 100644
--- a/docs/32-bit-abi.md
+++ b/docs/32-bit-abi.md
@@ -109,3 +109,15 @@
 mutexes for tids that don't fit in 16 bits. This typically manifests as
 a hang in `pthread_mutex_lock` if the libc startup code doesn't detect
 this condition and abort.
+
+
+## `getuid()` and friends wrongly set errno for very large results
+
+This doesn't generally affect Android devices, because we don't have any
+uids/gids/pids large enough, but 32-bit Android doesn't take into account
+that functions like getuid() potentially have return values that cover the
+entire 32-bit range, and can't fail. This means that the usual "if the result is
+between -1 and -4096, set errno and return -1" code is inappropriate for
+these functions. Since LP32 is likely to be unsupported long before
+those limits could ever matter, although -- unlike the others in this
+document -- this defect is actually fixable, it doesn't seem worth fixing.
diff --git a/libc/Android.bp b/libc/Android.bp
index 15c8f5e..7e5e972 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -20,31 +20,6 @@
     ],
 }
 
-libc_common_src_files = [
-    "bionic/ether_aton.c",
-    "bionic/ether_ntoa.c",
-    "bionic/exit.cpp",
-    "bionic/initgroups.c",
-    "bionic/isatty.c",
-    "bionic/sched_cpualloc.c",
-    "bionic/sched_cpucount.c",
-    "bionic/sysprop_helpers.cpp",
-    "stdio/fmemopen.cpp",
-    "stdio/parsefloat.c",
-    "stdio/refill.c",
-    "stdio/stdio.cpp",
-    "stdio/stdio_ext.cpp",
-    "stdio/vfscanf.cpp",
-    "stdio/vfwscanf.cpp",
-]
-
-// off64_t/time64_t support on LP32.
-// ========================================================
-libc_common_src_files_32 = [
-    "bionic/legacy_32_bit_support.cpp",
-    "bionic/time64.c",
-]
-
 libc_common_flags = [
     "-D_LIBC=1",
     "-D__BIONIC_LP32_USE_STAT64",
@@ -845,9 +820,12 @@
         "bionic/dup.cpp",
         "bionic/environ.cpp",
         "bionic/error.cpp",
+        "bionic/ether_aton.c",
+        "bionic/ether_ntoa.c",
         "bionic/eventfd.cpp",
         "bionic/exec.cpp",
         "bionic/execinfo.cpp",
+        "bionic/exit.cpp",
         "bionic/faccessat.cpp",
         "bionic/fchmod.cpp",
         "bionic/fchmodat.cpp",
@@ -882,6 +860,7 @@
         "bionic/ifaddrs.cpp",
         "bionic/inotify_init.cpp",
         "bionic/ioctl.cpp",
+        "bionic/isatty.cpp",
         "bionic/killpg.cpp",
         "bionic/langinfo.cpp",
         "bionic/lchown.cpp",
@@ -952,6 +931,8 @@
         "bionic/rename.cpp",
         "bionic/rmdir.cpp",
         "bionic/scandir.cpp",
+        "bionic/sched_cpualloc.cpp",
+        "bionic/sched_cpucount.cpp",
         "bionic/sched_getaffinity.cpp",
         "bionic/sched_getcpu.cpp",
         "bionic/semaphore.cpp",
@@ -989,6 +970,7 @@
         "bionic/sys_time.cpp",
         "bionic/sysinfo.cpp",
         "bionic/syslog.cpp",
+        "bionic/sysprop_helpers.cpp",
         "bionic/system.cpp",
         "bionic/system_property_api.cpp",
         "bionic/system_property_set.cpp",
@@ -1012,6 +994,16 @@
         "bionic/wcwidth.cpp",
         "bionic/wmempcpy.cpp",
 
+        // Forked but not yet cleaned up/rewritten stdio code.
+        // TODO: finish cleanup.
+        "stdio/fmemopen.cpp",
+        "stdio/parsefloat.c",
+        "stdio/refill.c",
+        "stdio/stdio.cpp",
+        "stdio/stdio_ext.cpp",
+        "stdio/vfscanf.cpp",
+        "stdio/vfwscanf.cpp",
+
         // TODO: why isn't this in a static-libc-only module?
         // This contains a weak stub implementation of __find_icu_symbol for wctype.cpp,
         // which will be overridden by the actual one in libc.so.
@@ -1224,10 +1216,16 @@
         },
     },
 
-    // TODO: move to libc/bionic/legacy_32_bit_support.cpp or #if __LP64__ instead.
     multilib: {
         lib32: {
-            srcs: ["bionic/mmap.cpp"],
+            srcs: [
+                // off64_t/time64_t support on LP32.
+                "bionic/legacy_32_bit_support.cpp",
+                "bionic/time64.c",
+
+                // TODO: move to libc/bionic/legacy_32_bit_support.cpp or #if __LP64__ instead.
+                "bionic/mmap.cpp",
+            ],
         },
     },
 
@@ -1337,20 +1335,13 @@
 }
 
 // ========================================================
-// libc_common.a
+// libc_common.a --- everything shared by libc.a and libc.so
 // ========================================================
 
 cc_library_static {
     defaults: ["libc_defaults"],
     name: "libc_common",
 
-    srcs: libc_common_src_files,
-    multilib: {
-        lib32: {
-            srcs: libc_common_src_files_32,
-        },
-    },
-
     whole_static_libs: [
         "libarm-optimized-routines-string",
         "libasync_safe",
@@ -1378,7 +1369,7 @@
 }
 
 // ========================================================
-// libc_static_dispatch.a
+// libc_static_dispatch.a --- libc.a ifuncs
 // ========================================================
 cc_library_static {
     defaults: ["libc_defaults"],
@@ -1404,7 +1395,7 @@
 }
 
 // ========================================================
-// libc_dynamic_dispatch.a
+// libc_dynamic_dispatch.a --- libc.so ifuncs
 // ========================================================
 cc_library_static {
     defaults: ["libc_defaults"],
@@ -1898,6 +1889,7 @@
         "//external/gwp_asan",
         "//external/jemalloc_new",
         "//external/libunwind_llvm",
+        "//external/llvm-libc",
         "//external/scudo",
         "//system/core/property_service/libpropertyinfoparser",
         "//system/extras/toolchain-extras",
diff --git a/libc/bionic/grp_pwd.cpp b/libc/bionic/grp_pwd.cpp
index 600693c..82ee7ba 100644
--- a/libc/bionic/grp_pwd.cpp
+++ b/libc/bionic/grp_pwd.cpp
@@ -609,6 +609,8 @@
 }
 
 // All users are in just one group, the one passed in.
+// In practice, id(1) will show you in a lot more groups, because adbd
+// adds you to a lot of supplementary groups when dropping privileges.
 int getgrouplist(const char* /*user*/, gid_t group, gid_t* groups, int* ngroups) {
   if (*ngroups < 1) {
     *ngroups = 1;
@@ -618,6 +620,12 @@
   return (*ngroups = 1);
 }
 
+// See getgrouplist() to understand why we don't call it.
+int initgroups(const char* /*user*/, gid_t group) {
+  gid_t groups[] = {group};
+  return setgroups(1, groups);
+}
+
 char* getlogin() { // NOLINT: implementing bad function.
   passwd *pw = getpwuid(getuid()); // NOLINT: implementing bad function in terms of bad function.
   return pw ? pw->pw_name : nullptr;
diff --git a/libc/bionic/initgroups.c b/libc/bionic/initgroups.c
deleted file mode 100644
index dea6d96..0000000
--- a/libc/bionic/initgroups.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#include <grp.h>
-#include <unistd.h>
-#include <stdlib.h>
-
-#define  INIT_GROUPS  2
-
-int
-initgroups (const char *user, gid_t group)
-{
-    gid_t   groups0[ INIT_GROUPS ];
-    gid_t*  groups    = groups0;
-    int     ret       = -1;
-    int     numgroups = INIT_GROUPS;
-
-    if (getgrouplist(user, group, groups, &numgroups) < 0) {
-        groups = malloc(numgroups*sizeof(groups[0]));
-        if (groups == NULL)
-            return -1;
-        if (getgrouplist(user,group,groups,&numgroups) < 0) {
-            goto EXIT;
-        }
-    }
-
-    ret = setgroups(numgroups, groups);
-
-EXIT:
-    if (groups != groups0)
-        free(groups);
-
-    return ret;
-}
diff --git a/libc/bionic/isatty.c b/libc/bionic/isatty.cpp
similarity index 92%
rename from libc/bionic/isatty.c
rename to libc/bionic/isatty.cpp
index 93af6c5..8a4aaf1 100644
--- a/libc/bionic/isatty.c
+++ b/libc/bionic/isatty.cpp
@@ -26,14 +26,10 @@
  * SUCH DAMAGE.
  */
 
-#include <unistd.h>
 #include <termios.h>
-#include <errno.h>
+#include <unistd.h>
 
-int
-isatty (int  fd)
-{
-  struct termios term;
-
-  return tcgetattr (fd, &term) == 0;
+int isatty(int fd) {
+  termios term;
+  return tcgetattr(fd, &term) == 0;
 }
diff --git a/libc/bionic/libc_init_dynamic.cpp b/libc/bionic/libc_init_dynamic.cpp
index 295484b..1180a51 100644
--- a/libc/bionic/libc_init_dynamic.cpp
+++ b/libc/bionic/libc_init_dynamic.cpp
@@ -61,6 +61,7 @@
 };
 
 void memtag_stack_dlopen_callback() {
+  async_safe_format_log(ANDROID_LOG_INFO, "libc", "remapping stacks as PROT_MTE");
   __pthread_internal_remap_stack_with_mte();
 }
 
diff --git a/libc/bionic/libc_init_static.cpp b/libc/bionic/libc_init_static.cpp
index 00faa5b..f091ff8 100644
--- a/libc/bionic/libc_init_static.cpp
+++ b/libc/bionic/libc_init_static.cpp
@@ -305,6 +305,14 @@
   bool memtag_stack = false;
   HeapTaggingLevel level =
       __get_tagging_level(memtag_dynamic_entries, phdr_start, phdr_ct, load_bias, &memtag_stack);
+  // This is used by the linker (in linker.cpp) to communicate that any library linked by this
+  // executable enables memtag-stack.
+  if (__libc_shared_globals()->initial_memtag_stack) {
+    if (!memtag_stack) {
+      async_safe_format_log(ANDROID_LOG_INFO, "libc", "enabling PROT_MTE as requested by linker");
+    }
+    memtag_stack = true;
+  }
   char* env = getenv("BIONIC_MEMTAG_UPGRADE_SECS");
   static const char kAppProcessName[] = "app_process64";
   const char* progname = __libc_shared_globals()->init_progname;
@@ -373,6 +381,8 @@
   }
   // We did not enable MTE, so we do not need to arm the upgrade timer.
   __libc_shared_globals()->heap_tagging_upgrade_timer_sec = 0;
+  // We also didn't enable memtag_stack.
+  __libc_shared_globals()->initial_memtag_stack = false;
 }
 #else   // __aarch64__
 void __libc_init_mte(const memtag_dynamic_entries_t*, const void*, size_t, uintptr_t, void*) {}
diff --git a/libc/bionic/sched_cpualloc.c b/libc/bionic/sched_cpualloc.c
deleted file mode 100644
index 345de91..0000000
--- a/libc/bionic/sched_cpualloc.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2010 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#define _GNU_SOURCE 1
-#include <sched.h>
-#include <stdlib.h>
-
-cpu_set_t* __sched_cpualloc(size_t count)
-{
-    // The static analyzer complains that CPU_ALLOC_SIZE eventually expands to
-    // N * sizeof(unsigned long), which is incompatible with cpu_set_t. This is
-    // on purpose.
-    return (cpu_set_t*) malloc(CPU_ALLOC_SIZE(count)); // NOLINT
-}
-
-void __sched_cpufree(cpu_set_t* set)
-{
-    free(set);
-}
diff --git a/libc/bionic/sched_cpucount.c b/libc/bionic/sched_cpualloc.cpp
similarity index 83%
copy from libc/bionic/sched_cpucount.c
copy to libc/bionic/sched_cpualloc.cpp
index 6f66589..4c8b3c3 100644
--- a/libc/bionic/sched_cpucount.c
+++ b/libc/bionic/sched_cpualloc.cpp
@@ -25,17 +25,14 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
-#define _GNU_SOURCE 1
+
 #include <sched.h>
+#include <stdlib.h>
 
-int __sched_cpucount(size_t setsize, const cpu_set_t* set) {
-  int nn = 0;
-  int nn_max = setsize / sizeof(__CPU_BITTYPE);
-  int count = 0;
+cpu_set_t* __sched_cpualloc(size_t count) {
+  return static_cast<cpu_set_t*>(malloc(CPU_ALLOC_SIZE(count)));
+}
 
-  for ( ; nn < nn_max; nn++ ) {
-    count += __builtin_popcountl(set->__bits[nn]);
-  }
-
-  return count;
+void __sched_cpufree(cpu_set_t* set) {
+  free(set);
 }
diff --git a/libc/bionic/sched_cpucount.c b/libc/bionic/sched_cpucount.cpp
similarity index 89%
rename from libc/bionic/sched_cpucount.c
rename to libc/bionic/sched_cpucount.cpp
index 6f66589..3ec27bb 100644
--- a/libc/bionic/sched_cpucount.c
+++ b/libc/bionic/sched_cpucount.cpp
@@ -25,17 +25,13 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
-#define _GNU_SOURCE 1
+
 #include <sched.h>
 
 int __sched_cpucount(size_t setsize, const cpu_set_t* set) {
-  int nn = 0;
-  int nn_max = setsize / sizeof(__CPU_BITTYPE);
   int count = 0;
-
-  for ( ; nn < nn_max; nn++ ) {
-    count += __builtin_popcountl(set->__bits[nn]);
+  for (size_t i = 0; i < setsize / sizeof(__CPU_BITTYPE); i++) {
+    count += __builtin_popcountl(set->__bits[i]);
   }
-
   return count;
 }
diff --git a/libc/bionic/sysconf.cpp b/libc/bionic/sysconf.cpp
index edbdef1..ff72b93 100644
--- a/libc/bionic/sysconf.cpp
+++ b/libc/bionic/sysconf.cpp
@@ -41,6 +41,16 @@
 #include "platform/bionic/page.h"
 #include "private/bionic_tls.h"
 
+static long __sysconf_fread_long(const char* path) {
+  long result = 0;
+  FILE* fp = fopen(path, "re");
+  if (fp != nullptr) {
+    fscanf(fp, "%ld", &result);
+    fclose(fp);
+  }
+  return result;
+}
+
 struct sysconf_cache {
   long size, assoc, linesize;
 
@@ -90,16 +100,6 @@
 
 #else
 
-long __sysconf_fread_long(const char* path) {
-  long result = 0;
-  FILE* fp = fopen(path, "re");
-  if (fp != nullptr) {
-    fscanf(fp, "%ld", &result);
-    fclose(fp);
-  }
-  return result;
-}
-
 static sysconf_caches* __sysconf_caches() {
   static sysconf_caches cached = []{
     sysconf_caches info = {};
@@ -183,6 +183,7 @@
     case _SC_AVPHYS_PAGES:      return get_avphys_pages();
     case _SC_CHILD_MAX:         return __sysconf_rlimit(RLIMIT_NPROC);
     case _SC_CLK_TCK:           return static_cast<long>(getauxval(AT_CLKTCK));
+    case _SC_NGROUPS_MAX:       return __sysconf_fread_long("/proc/sys/kernel/ngroups_max");
     case _SC_NPROCESSORS_CONF:  return get_nprocs_conf();
     case _SC_NPROCESSORS_ONLN:  return get_nprocs();
     case _SC_OPEN_MAX:          return __sysconf_rlimit(RLIMIT_NOFILE);
@@ -204,7 +205,6 @@
     case _SC_COLL_WEIGHTS_MAX:  return _POSIX2_COLL_WEIGHTS_MAX;  // Minimum requirement.
     case _SC_EXPR_NEST_MAX:     return _POSIX2_EXPR_NEST_MAX;     // Minimum requirement.
     case _SC_LINE_MAX:          return _POSIX2_LINE_MAX;          // Minimum requirement.
-    case _SC_NGROUPS_MAX:       return NGROUPS_MAX;
     case _SC_PASS_MAX:          return PASS_MAX;
     case _SC_2_C_BIND:          return _POSIX2_C_BIND;
     case _SC_2_C_DEV:           return _POSIX2_C_DEV;
diff --git a/libc/private/bsd_sys_param.h b/libc/private/bsd_sys_param.h
index be5f692..ab54aa0 100644
--- a/libc/private/bsd_sys_param.h
+++ b/libc/private/bsd_sys_param.h
@@ -20,4 +20,4 @@
 
 /* OpenBSD has these in <sys/param.h>, but "ALIGN" isn't something we want to reserve. */
 #define ALIGNBYTES (sizeof(uintptr_t) - 1)
-#define ALIGN(p) (((uintptr_t)(p) + ALIGNBYTES) & ~ALIGNBYTES)
+#define ALIGN(p) ((__BIONIC_CAST(reinterpret_cast, uintptr_t, p) + ALIGNBYTES) & ~ALIGNBYTES)
diff --git a/libc/stdio/local.h b/libc/stdio/local.h
index a5eb636..62efea1 100644
--- a/libc/stdio/local.h
+++ b/libc/stdio/local.h
@@ -236,7 +236,7 @@
 /* OpenBSD exposes these in <stdio.h>, but we only want them exposed to the implementation. */
 #define __sferror(p) (((p)->_flags & __SERR) != 0)
 #define __sclearerr(p) ((void)((p)->_flags &= ~(__SERR | __SEOF)))
-#define __sgetc(p) (--(p)->_r < 0 ? __srget(p) : (int)(*(p)->_p++))
+#define __sgetc(p) (--(p)->_r < 0 ? __srget(p) : __BIONIC_CAST(static_cast, int, *(p)->_p++))
 
 /* OpenBSD declares these in fvwrite.h, but we share them with C++ parts of the implementation. */
 struct __siov {
@@ -288,7 +288,7 @@
 char* __hldtoa(long double, const char*, int, int*, int*, char**);
 char* __ldtoa(long double*, int, int, int*, int*, char**);
 
-#define WCIO_GET(fp) (_EXT(fp) ? &(_EXT(fp)->_wcio) : (struct wchar_io_data*)0)
+#define WCIO_GET(fp) (_EXT(fp) ? &(_EXT(fp)->_wcio) : NULL)
 
 #define ORIENT_BYTES (-1)
 #define ORIENT_UNKNOWN 0
diff --git a/libc/stdio/scanf_common.h b/libc/stdio/scanf_common.h
index 8132e90..1b6b87f 100644
--- a/libc/stdio/scanf_common.h
+++ b/libc/stdio/scanf_common.h
@@ -82,7 +82,7 @@
 #define CT_FLOAT 4   // Float: strtod
 
 #define to_digit(c) static_cast<int>((c) - '0')
-#define is_digit(c) ((unsigned)to_digit(c) <= 9)
+#define is_digit(c) (static_cast<unsigned>(to_digit(c)) <= 9)
 
 // Append a digit to a value and check for overflow.
 #define APPEND_DIGIT(val, dig)               \
@@ -112,4 +112,4 @@
   __fortify_fatal("%%w%s%d is unsupported", fast ? "f" : "", size);
 }
 
-#pragma clang diagnostic pop
\ No newline at end of file
+#pragma clang diagnostic pop
diff --git a/libc/stdio/vfscanf.cpp b/libc/stdio/vfscanf.cpp
index 3607995..92ff541 100644
--- a/libc/stdio/vfscanf.cpp
+++ b/libc/stdio/vfscanf.cpp
@@ -629,10 +629,10 @@
          * as `[-+]0`.
          */
         if (flags & NDIGITS) {
-          if (p > buf) (void)ungetc(*(u_char*)--p, fp);
+          if (p > buf) ungetc(*reinterpret_cast<u_char*>(--p), fp);
           goto match_failure;
         }
-        c = ((u_char*)p)[-1];
+        c = reinterpret_cast<u_char*>(p)[-1];
         if ((base == 2 && (c == 'b' || c == 'B')) || c == 'x' || c == 'X') {
           --p;
           (void)ungetc(c, fp);
@@ -647,7 +647,7 @@
             res = strtoimax(buf, nullptr, base);
           }
           if (flags & POINTER) {
-            *va_arg(ap, void**) = (void*)(uintptr_t)res;
+            *va_arg(ap, void**) = reinterpret_cast<void*>(res);
           } else if (flags & MAXINT) {
             *va_arg(ap, intmax_t*) = res;
           } else if (flags & LLONG) {
@@ -685,7 +685,7 @@
             float res = strtof(buf, &p);
             *va_arg(ap, float*) = res;
           }
-          if ((size_t)(p - buf) != width) abort();
+          if (static_cast<size_t>(p - buf) != width) abort();
           nassigned++;
         }
         nread += width;
diff --git a/libc/stdio/vfwscanf.cpp b/libc/stdio/vfwscanf.cpp
index 3df4a87..21d1783 100644
--- a/libc/stdio/vfwscanf.cpp
+++ b/libc/stdio/vfwscanf.cpp
@@ -32,6 +32,7 @@
  */
 
 #include "scanf_common.h"
+
 // An interpretive version of __sccl from vfscanf.c --- a table of all wchar_t values would
 // be a little too expensive, and some kind of compressed version isn't worth the trouble.
 static inline bool in_ccl(wchar_t wc, const wchar_t* ccl) {
@@ -335,7 +336,7 @@
           if (!(flags & SUPPRESS)) p = va_arg(ap, wchar_t*);
           n = 0;
           while (width-- != 0 && (wi = __fgetwc_unlock(fp)) != WEOF) {
-            if (!(flags & SUPPRESS)) *p++ = (wchar_t)wi;
+            if (!(flags & SUPPRESS)) *p++ = static_cast<wchar_t>(wi);
             n++;
           }
           if (n == 0) goto input_failure;
@@ -348,10 +349,10 @@
           while (width != 0 && (wi = __fgetwc_unlock(fp)) != WEOF) {
             if (width >= MB_CUR_MAX && !(flags & SUPPRESS)) {
               nconv = wcrtomb(mbp, wi, &mbs);
-              if (nconv == (size_t)-1) goto input_failure;
+              if (nconv == static_cast<size_t>(-1)) goto input_failure;
             } else {
               nconv = wcrtomb(mbbuf, wi, &mbs);
-              if (nconv == (size_t)-1) goto input_failure;
+              if (nconv == static_cast<size_t>(-1)) goto input_failure;
               if (nconv > width) {
                 __ungetwc(wi, fp);
                 break;
@@ -373,7 +374,7 @@
       case CT_STRING:
         // CT_CCL: scan a (nonempty) character class (sets NOSKIP).
         // CT_STRING: like CCL, but zero-length string OK, & no NOSKIP.
-        if (width == 0) width = (size_t)~0; // 'infinity'.
+        if (width == 0) width = SIZE_MAX; // 'infinity'.
         if ((flags & SUPPRESS) && (flags & LONG)) {
           n = 0;
           while ((wi = __fgetwc_unlock(fp)) != WEOF && width-- != 0 && ((c == CT_CCL && in_ccl(wi, ccl)) || (c == CT_STRING && !iswspace(wi)))) n++;
@@ -381,7 +382,7 @@
         } else if (flags & LONG) {
           p0 = p = va_arg(ap, wchar_t*);
           while ((wi = __fgetwc_unlock(fp)) != WEOF && width-- != 0 && ((c == CT_CCL && in_ccl(wi, ccl)) || (c == CT_STRING && !iswspace(wi)))) {
-            *p++ = (wchar_t)wi;
+            *p++ = static_cast<wchar_t>(wi);
           }
           if (wi != WEOF) __ungetwc(wi, fp);
           n = p - p0;
@@ -392,10 +393,10 @@
           while ((wi = __fgetwc_unlock(fp)) != WEOF && width != 0 && ((c == CT_CCL && in_ccl(wi, ccl)) || (c == CT_STRING && !iswspace(wi)))) {
             if (width >= MB_CUR_MAX && !(flags & SUPPRESS)) {
               nconv = wcrtomb(mbp, wi, &mbs);
-              if (nconv == (size_t)-1) goto input_failure;
+              if (nconv == static_cast<size_t>(-1)) goto input_failure;
             } else {
               nconv = wcrtomb(mbbuf, wi, &mbs);
-              if (nconv == (size_t)-1) goto input_failure;
+              if (nconv == static_cast<size_t>(-1)) goto input_failure;
               if (nconv > width) break;
               if (!(flags & SUPPRESS)) memcpy(mbp, mbbuf, nconv);
             }
@@ -485,7 +486,7 @@
             case 'e':
             case 'f':
               if (base == 0) base = 10;
-              if (base != 16 && (int)(c - '0') >= base) break; /* not legal here */
+              if (base != 16 && static_cast<int>(c - '0') >= base) break; /* not legal here */
               flags &= ~(SIGNOK | PFBOK | PFXOK | NDIGITS);
               goto ok;
 
@@ -523,7 +524,7 @@
           /*
            * c is legal: store it and look at the next.
            */
-          *p++ = (wchar_t)c;
+          *p++ = static_cast<wchar_t>(c);
         }
         /*
          * If we had only a sign, it is no good; push back the sign.
@@ -548,7 +549,7 @@
           else
             res = wcstoumax(buf, NULL, base);
           if (flags & POINTER)
-            *va_arg(ap, void**) = (void*)(uintptr_t)res;
+            *va_arg(ap, void**) = reinterpret_cast<void*>(res);
           else if (flags & MAXINT)
             *va_arg(ap, intmax_t*) = res;
           else if (flags & LLONG)
@@ -587,7 +588,7 @@
             float res = wcstof(buf, &p);
             *va_arg(ap, float*) = res;
           }
-          if (p - buf != (ptrdiff_t)width) abort();
+          if (static_cast<size_t>(p - buf) != width) abort();
           nassigned++;
         }
         nread += width;
diff --git a/linker/NOTICE b/linker/NOTICE
index 7fd1877..9b66b4e 100644
--- a/linker/NOTICE
+++ b/linker/NOTICE
@@ -390,3 +390,31 @@
 
 -------------------------------------------------------------------
 
+Copyright (C) 2024 The Android Open Source Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+-------------------------------------------------------------------
+
diff --git a/linker/linker.cpp b/linker/linker.cpp
index 60c8e31..b0caedd 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -1695,13 +1695,31 @@
   }
 
   // Step 3: pre-link all DT_NEEDED libraries in breadth first order.
+  bool any_memtag_stack = false;
   for (auto&& task : load_tasks) {
     soinfo* si = task->get_soinfo();
     if (!si->is_linked() && !si->prelink_image()) {
       return false;
     }
+    // si->memtag_stack() needs to be called after si->prelink_image() which populates
+    // the dynamic section.
+    if (si->has_min_version(7) && si->memtag_stack()) {
+      any_memtag_stack = true;
+      LD_LOG(kLogDlopen,
+             "... load_library requesting stack MTE for: realpath=\"%s\", soname=\"%s\"",
+             si->get_realpath(), si->get_soname());
+    }
     register_soinfo_tls(si);
   }
+  if (any_memtag_stack) {
+    if (auto* cb = __libc_shared_globals()->memtag_stack_dlopen_callback) {
+      cb();
+    } else {
+      // find_library is used by the initial linking step, so we communicate that we
+      // want memtag_stack enabled to __libc_init_mte.
+      __libc_shared_globals()->initial_memtag_stack = true;
+    }
+  }
 
   // Step 4: Construct the global group. DF_1_GLOBAL bit is force set for LD_PRELOADed libs because
   // they must be added to the global group. Note: The DF_1_GLOBAL bit for a library is normally set
@@ -2213,14 +2231,6 @@
   loading_trace.End();
 
   if (si != nullptr) {
-    if (si->has_min_version(7) && si->memtag_stack()) {
-      LD_LOG(kLogDlopen, "... dlopen enabling MTE for: realpath=\"%s\", soname=\"%s\"",
-             si->get_realpath(), si->get_soname());
-      if (auto* cb = __libc_shared_globals()->memtag_stack_dlopen_callback) {
-        cb();
-      }
-    }
-
     void* handle = si->to_handle();
     LD_LOG(kLogDlopen,
            "... dlopen calling constructors: realpath=\"%s\", soname=\"%s\", handle=%p",
@@ -3354,7 +3364,7 @@
                               "\"%s\" has text relocations",
                               get_realpath());
     add_dlwarning(get_realpath(), "text relocations");
-    if (phdr_table_unprotect_segments(phdr, phnum, load_bias, should_pad_segments_) < 0) {
+    if (phdr_table_unprotect_segments(phdr, phnum, load_bias) < 0) {
       DL_ERR("can't unprotect loadable segments for \"%s\": %s", get_realpath(), strerror(errno));
       return false;
     }
@@ -3370,7 +3380,7 @@
 #if !defined(__LP64__)
   if (has_text_relocations) {
     // All relocations are done, we can protect our segments back to read-only.
-    if (phdr_table_protect_segments(phdr, phnum, load_bias, should_pad_segments_) < 0) {
+    if (phdr_table_protect_segments(phdr, phnum, load_bias) < 0) {
       DL_ERR("can't protect segments for \"%s\": %s",
              get_realpath(), strerror(errno));
       return false;
@@ -3408,7 +3418,7 @@
 }
 
 bool soinfo::protect_relro() {
-  if (phdr_table_protect_gnu_relro(phdr, phnum, load_bias, should_pad_segments_) < 0) {
+  if (phdr_table_protect_gnu_relro(phdr, phnum, load_bias) < 0) {
     DL_ERR("can't enable GNU RELRO protection for \"%s\": %s",
            get_realpath(), strerror(errno));
     return false;
diff --git a/linker/linker_main.cpp b/linker/linker_main.cpp
index 018a5eb..d6592af 100644
--- a/linker/linker_main.cpp
+++ b/linker/linker_main.cpp
@@ -201,7 +201,6 @@
   const ElfW(Phdr)* phdr;
   size_t phdr_count;
   ElfW(Addr) entry_point;
-  bool should_pad_segments;
 };
 
 static ExecutableInfo get_executable_info(const char* arg_path) {
@@ -294,7 +293,6 @@
   result.phdr = elf_reader.loaded_phdr();
   result.phdr_count = elf_reader.phdr_count();
   result.entry_point = elf_reader.entry_point();
-  result.should_pad_segments = elf_reader.should_pad_segments();
   return result;
 }
 
@@ -368,7 +366,6 @@
   somain = si;
   si->phdr = exe_info.phdr;
   si->phnum = exe_info.phdr_count;
-  si->set_should_pad_segments(exe_info.should_pad_segments);
   get_elf_base_from_phdr(si->phdr, si->phnum, &si->base, &si->load_bias);
   si->size = phdr_table_get_load_size(si->phdr, si->phnum);
   si->dynamic = nullptr;
@@ -402,14 +399,11 @@
     auto note_gnu_property = GnuPropertySection(somain);
     if (note_gnu_property.IsBTICompatible() &&
         (phdr_table_protect_segments(somain->phdr, somain->phnum, somain->load_bias,
-                                     somain->should_pad_segments(), &note_gnu_property) < 0)) {
+                                     &note_gnu_property) < 0)) {
       __linker_error("error: can't protect segments for \"%s\": %s", exe_info.path.c_str(),
                      strerror(errno));
     }
   }
-
-  __libc_init_mte(somain->memtag_dynamic_entries(), somain->phdr, somain->phnum, somain->load_bias,
-                  args.argv);
 #endif
 
   // Register the main executable and the linker upfront to have
@@ -499,6 +493,12 @@
     }
     si->increment_ref_count();
   }
+#if defined(__aarch64__)
+  // This has to happen after find_libraries, which will have collected any libraries that
+  // request memtag_stack in their dynamic section.
+  __libc_init_mte(somain->memtag_dynamic_entries(), somain->phdr, somain->phnum, somain->load_bias,
+                  args.argv);
+#endif
 
   linker_finalize_static_tls();
   __libc_init_main_thread_final();
diff --git a/linker/linker_phdr.cpp b/linker/linker_phdr.cpp
index af0ef1d..82b37a4 100644
--- a/linker/linker_phdr.cpp
+++ b/linker/linker_phdr.cpp
@@ -196,7 +196,7 @@
     // For Armv8.5-A loaded executable segments may require PROT_BTI.
     if (note_gnu_property_.IsBTICompatible()) {
       did_load_ = (phdr_table_protect_segments(phdr_table_, phdr_num_, load_bias_,
-                                               should_pad_segments_, &note_gnu_property_) == 0);
+                                               &note_gnu_property_) == 0);
     }
 #endif
   }
@@ -717,13 +717,21 @@
       continue;
     }
 
+    // Some obfuscated ELFs may contain "empty" PT_NOTE program headers that don't
+    // point to any part of the ELF (p_memsz == 0). Skip these since there is
+    // nothing to decode. See: b/324468126
+    if (phdr->p_memsz == 0) {
+      continue;
+    }
+
     // note_fragment is scoped to within the loop so that there is
     // at most 1 PT_NOTE mapped at anytime during this search.
     MappedFileFragment note_fragment;
     if (!note_fragment.Map(fd_, file_offset_, phdr->p_offset, phdr->p_memsz)) {
-      DL_WARN("\"%s\" note mmap failed: %s", name_.c_str(), strerror(errno));
-      // If mmap failed, skip the optimization but don't block ELF loading
-      return true;
+      DL_ERR("\"%s\": PT_NOTE mmap(nullptr, %p, PROT_READ, MAP_PRIVATE, %d, %p) failed: %m",
+             name_.c_str(), reinterpret_cast<void*>(phdr->p_memsz), fd_,
+             reinterpret_cast<void*>(page_start(file_offset_ + phdr->p_offset)));
+      return false;
     }
 
     const ElfW(Nhdr)* note_hdr = nullptr;
@@ -748,36 +756,6 @@
   return true;
 }
 
-static inline void _extend_load_segment_vma(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                             size_t phdr_idx, ElfW(Addr)* p_memsz,
-                                             ElfW(Addr)* p_filesz) {
-  const ElfW(Phdr)* phdr = &phdr_table[phdr_idx];
-  const ElfW(Phdr)* next = nullptr;
-  size_t next_idx = phdr_idx + 1;
-  if (next_idx < phdr_count && phdr_table[next_idx].p_type == PT_LOAD) {
-    next = &phdr_table[next_idx];
-  }
-
-  // If this is the last LOAD segment, no extension is needed
-  if (!next || *p_memsz != *p_filesz) {
-    return;
-  }
-
-  ElfW(Addr) next_start = page_start(next->p_vaddr);
-  ElfW(Addr) curr_end = page_end(phdr->p_vaddr + *p_memsz);
-
-  // If adjacent segment mappings overlap, no extension is needed.
-  if (curr_end >= next_start) {
-    return;
-  }
-
-  // Extend the LOAD segment mapping to be contiguous with that of
-  // the next LOAD segment.
-  ElfW(Addr) extend = next_start - curr_end;
-  *p_memsz += extend;
-  *p_filesz += extend;
-}
-
 bool ElfReader::LoadSegments() {
   for (size_t i = 0; i < phdr_num_; ++i) {
     const ElfW(Phdr)* phdr = &phdr_table_[i];
@@ -786,24 +764,18 @@
       continue;
     }
 
-    ElfW(Addr) p_memsz = phdr->p_memsz;
-    ElfW(Addr) p_filesz = phdr->p_filesz;
-    if (phdr->p_align > kPageSize && should_pad_segments_) {
-      _extend_load_segment_vma(phdr_table_, phdr_num_, i, &p_memsz, &p_filesz);
-    }
-
     // Segment addresses in memory.
     ElfW(Addr) seg_start = phdr->p_vaddr + load_bias_;
-    ElfW(Addr) seg_end = seg_start + p_memsz;
+    ElfW(Addr) seg_end   = seg_start + phdr->p_memsz;
 
     ElfW(Addr) seg_page_start = page_start(seg_start);
     ElfW(Addr) seg_page_end = page_end(seg_end);
 
-    ElfW(Addr) seg_file_end = seg_start + p_filesz;
+    ElfW(Addr) seg_file_end   = seg_start + phdr->p_filesz;
 
     // File offsets.
     ElfW(Addr) file_start = phdr->p_offset;
-    ElfW(Addr) file_end = file_start + p_filesz;
+    ElfW(Addr) file_end   = file_start + phdr->p_filesz;
 
     ElfW(Addr) file_page_start = page_start(file_start);
     ElfW(Addr) file_length = file_end - file_page_start;
@@ -813,12 +785,12 @@
       return false;
     }
 
-    if (file_start + phdr->p_filesz > static_cast<size_t>(file_size_)) {
+    if (file_end > static_cast<size_t>(file_size_)) {
       DL_ERR("invalid ELF file \"%s\" load segment[%zd]:"
           " p_offset (%p) + p_filesz (%p) ( = %p) past end of file (0x%" PRIx64 ")",
           name_.c_str(), i, reinterpret_cast<void*>(phdr->p_offset),
           reinterpret_cast<void*>(phdr->p_filesz),
-          reinterpret_cast<void*>(file_start + phdr->p_filesz), file_size_);
+          reinterpret_cast<void*>(file_end), file_size_);
       return false;
     }
 
@@ -858,18 +830,8 @@
 
     // if the segment is writable, and does not end on a page boundary,
     // zero-fill it until the page limit.
-    //
-    // The intention is to zero the partial page at that may exist at the
-    // end of a file backed mapping. With the extended seg_file_end, this
-    // file offset as calculated from the mapping start can overrun the end
-    // of the file. However pages in that range cannot be touched by userspace
-    // because the kernel will not be able to handle a file map fault past the
-    // extent of the file. No need to try zeroing this untouchable region.
-    // Zero the partial page at the end of the original unextended seg_file_end.
-    ElfW(Addr) seg_file_end_orig = seg_start + phdr->p_filesz;
-    if ((phdr->p_flags & PF_W) != 0 && page_offset(seg_file_end_orig) > 0) {
-      memset(reinterpret_cast<void*>(seg_file_end_orig), 0,
-             kPageSize - page_offset(seg_file_end_orig));
+    if ((phdr->p_flags & PF_W) != 0 && page_offset(seg_file_end) > 0) {
+      memset(reinterpret_cast<void*>(seg_file_end), 0, page_size() - page_offset(seg_file_end));
     }
 
     seg_file_end = page_end(seg_file_end);
@@ -902,23 +864,17 @@
  * phdr_table_protect_segments and phdr_table_unprotect_segments.
  */
 static int _phdr_table_set_load_prot(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                     ElfW(Addr) load_bias, int extra_prot_flags,
-                                     bool should_pad_segments) {
-  for (size_t i = 0; i < phdr_count; ++i) {
-    const ElfW(Phdr)* phdr = &phdr_table[i];
+                                     ElfW(Addr) load_bias, int extra_prot_flags) {
+  const ElfW(Phdr)* phdr = phdr_table;
+  const ElfW(Phdr)* phdr_limit = phdr + phdr_count;
 
+  for (; phdr < phdr_limit; phdr++) {
     if (phdr->p_type != PT_LOAD || (phdr->p_flags & PF_W) != 0) {
       continue;
     }
 
-    ElfW(Addr) p_memsz = phdr->p_memsz;
-    ElfW(Addr) p_filesz = phdr->p_filesz;
-    if (phdr->p_align > kPageSize && should_pad_segments) {
-      _extend_load_segment_vma(phdr_table, phdr_count, i, &p_memsz, &p_filesz);
-    }
-
-    ElfW(Addr) seg_page_start = page_start(phdr->p_vaddr + load_bias);
-    ElfW(Addr) seg_page_end = page_end(phdr->p_vaddr + p_memsz + load_bias);
+    ElfW(Addr) seg_page_start = page_start(phdr->p_vaddr) + load_bias;
+    ElfW(Addr) seg_page_end = page_end(phdr->p_vaddr + phdr->p_memsz) + load_bias;
 
     int prot = PFLAGS_TO_PROT(phdr->p_flags) | extra_prot_flags;
     if ((prot & PROT_WRITE) != 0) {
@@ -953,21 +909,19 @@
  *   phdr_table  -> program header table
  *   phdr_count  -> number of entries in tables
  *   load_bias   -> load bias
- *   should_pad_segments -> Are segments extended to avoid gaps in the memory map
  *   prop        -> GnuPropertySection or nullptr
  * Return:
  *   0 on success, -1 on failure (error code in errno).
  */
 int phdr_table_protect_segments(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                ElfW(Addr) load_bias, bool should_pad_segments,
-                                const GnuPropertySection* prop __unused) {
+                                ElfW(Addr) load_bias, const GnuPropertySection* prop __unused) {
   int prot = 0;
 #if defined(__aarch64__)
   if ((prop != nullptr) && prop->IsBTICompatible()) {
     prot |= PROT_BTI;
   }
 #endif
-  return _phdr_table_set_load_prot(phdr_table, phdr_count, load_bias, prot, should_pad_segments);
+  return _phdr_table_set_load_prot(phdr_table, phdr_count, load_bias, prot);
 }
 
 /* Change the protection of all loaded segments in memory to writable.
@@ -983,53 +937,19 @@
  *   phdr_table  -> program header table
  *   phdr_count  -> number of entries in tables
  *   load_bias   -> load bias
- *   should_pad_segments -> Are segments extended to avoid gaps in the memory map
  * Return:
  *   0 on success, -1 on failure (error code in errno).
  */
 int phdr_table_unprotect_segments(const ElfW(Phdr)* phdr_table,
-                                  size_t phdr_count, ElfW(Addr) load_bias,
-                                  bool should_pad_segments) {
-  return _phdr_table_set_load_prot(phdr_table, phdr_count, load_bias, PROT_WRITE,
-                                   should_pad_segments);
-}
-
-static inline void _extend_gnu_relro_prot_end(const ElfW(Phdr)* relro_phdr,
-                                              const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                              ElfW(Addr) load_bias, ElfW(Addr)* seg_page_end) {
-  // Find the index and phdr of the LOAD containing the GNU_RELRO segment
-  for (size_t index = 0; index < phdr_count; ++index) {
-    const ElfW(Phdr)* phdr = &phdr_table[index];
-
-    if (phdr->p_type == PT_LOAD && phdr->p_vaddr == relro_phdr->p_vaddr) {
-      // If the PT_GNU_RELRO mem size is not at least as large as the corresponding
-      // LOAD segment mem size, we need to protect only a partial region of the
-      // LOAD segment and therefore cannot avoid a VMA split.
-      if (relro_phdr->p_memsz < phdr->p_memsz) {
-        break;
-      }
-
-      ElfW(Addr) p_memsz = phdr->p_memsz;
-      ElfW(Addr) p_filesz = phdr->p_filesz;
-
-      // Attempt extending the VMA (mprotect range). Without extending the range
-      // mprotect will only RO protect a part of the extend RW LOAD segment, which will
-      // leave an extra split RW VMA (the gap).
-      _extend_load_segment_vma(phdr_table, phdr_count, index, &p_memsz, &p_filesz);
-
-      *seg_page_end = page_end(phdr->p_vaddr + p_memsz + load_bias);
-
-      break;
-    }
-  }
+                                  size_t phdr_count, ElfW(Addr) load_bias) {
+  return _phdr_table_set_load_prot(phdr_table, phdr_count, load_bias, PROT_WRITE);
 }
 
 /* Used internally by phdr_table_protect_gnu_relro and
  * phdr_table_unprotect_gnu_relro.
  */
 static int _phdr_table_set_gnu_relro_prot(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                          ElfW(Addr) load_bias, int prot_flags,
-                                          bool should_pad_segments) {
+                                          ElfW(Addr) load_bias, int prot_flags) {
   const ElfW(Phdr)* phdr = phdr_table;
   const ElfW(Phdr)* phdr_limit = phdr + phdr_count;
 
@@ -1054,16 +974,8 @@
     //       the program is likely to fail at runtime. So in effect the
     //       linker must only emit a PT_GNU_RELRO segment if it ensures
     //       that it starts on a page boundary.
-    ElfW(Addr) seg_page_start = page_start(phdr->p_vaddr + load_bias);
-    ElfW(Addr) seg_page_end = page_end(phdr->p_vaddr + phdr->p_memsz + load_bias);
-
-    // Before extending the RO protection, we need to ensure that the segments were extended
-    // by bionic, because the kernel won't map gaps so it usually contains unrelated
-    // mappings which will be incorrectly protected as RO likely leading to
-    // segmentation fault.
-    if (phdr->p_align > kPageSize && should_pad_segments) {
-      _extend_gnu_relro_prot_end(phdr, phdr_table, phdr_count, load_bias, &seg_page_end);
-    }
+    ElfW(Addr) seg_page_start = page_start(phdr->p_vaddr) + load_bias;
+    ElfW(Addr) seg_page_end = page_end(phdr->p_vaddr + phdr->p_memsz) + load_bias;
 
     int ret = mprotect(reinterpret_cast<void*>(seg_page_start),
                        seg_page_end - seg_page_start,
@@ -1088,14 +1000,12 @@
  *   phdr_table  -> program header table
  *   phdr_count  -> number of entries in tables
  *   load_bias   -> load bias
- *   should_pad_segments -> Were segments extended to avoid gaps in the memory map
  * Return:
  *   0 on success, -1 on failure (error code in errno).
  */
-int phdr_table_protect_gnu_relro(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                 ElfW(Addr) load_bias, bool should_pad_segments) {
-  return _phdr_table_set_gnu_relro_prot(phdr_table, phdr_count, load_bias, PROT_READ,
-                                        should_pad_segments);
+int phdr_table_protect_gnu_relro(const ElfW(Phdr)* phdr_table,
+                                 size_t phdr_count, ElfW(Addr) load_bias) {
+  return _phdr_table_set_gnu_relro_prot(phdr_table, phdr_count, load_bias, PROT_READ);
 }
 
 /* Serialize the GNU relro segments to the given file descriptor. This can be
diff --git a/linker/linker_phdr.h b/linker/linker_phdr.h
index 4deed33..e5b87bb 100644
--- a/linker/linker_phdr.h
+++ b/linker/linker_phdr.h
@@ -128,14 +128,13 @@
 size_t phdr_table_get_maximum_alignment(const ElfW(Phdr)* phdr_table, size_t phdr_count);
 
 int phdr_table_protect_segments(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                ElfW(Addr) load_bias, bool should_pad_segments,
-                                const GnuPropertySection* prop = nullptr);
+                                ElfW(Addr) load_bias, const GnuPropertySection* prop = nullptr);
 
 int phdr_table_unprotect_segments(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                  ElfW(Addr) load_bias, bool should_pad_segments);
+                                  ElfW(Addr) load_bias);
 
 int phdr_table_protect_gnu_relro(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                 ElfW(Addr) load_bias, bool should_pad_segments);
+                                 ElfW(Addr) load_bias);
 
 int phdr_table_serialize_gnu_relro(const ElfW(Phdr)* phdr_table, size_t phdr_count,
                                    ElfW(Addr) load_bias, int fd, size_t* file_offset);
diff --git a/linker/linker_relocate.cpp b/linker/linker_relocate.cpp
index 5b58895..952dade 100644
--- a/linker/linker_relocate.cpp
+++ b/linker/linker_relocate.cpp
@@ -187,8 +187,7 @@
   auto protect_segments = [&]() {
     // Make .text executable.
     if (phdr_table_protect_segments(relocator.si->phdr, relocator.si->phnum,
-                                    relocator.si->load_bias,
-                                    relocator.si->should_pad_segments()) < 0) {
+                                    relocator.si->load_bias) < 0) {
       DL_ERR("can't protect segments for \"%s\": %s",
              relocator.si->get_realpath(), strerror(errno));
       return false;
@@ -198,8 +197,7 @@
   auto unprotect_segments = [&]() {
     // Make .text writable.
     if (phdr_table_unprotect_segments(relocator.si->phdr, relocator.si->phnum,
-                                      relocator.si->load_bias,
-                                      relocator.si->should_pad_segments()) < 0) {
+                                      relocator.si->load_bias) < 0) {
       DL_ERR("can't unprotect loadable segments for \"%s\": %s",
              relocator.si->get_realpath(), strerror(errno));
       return false;
diff --git a/tests/Android.bp b/tests/Android.bp
index 0ba91ea..78c2c10 100644
--- a/tests/Android.bp
+++ b/tests/Android.bp
@@ -1127,7 +1127,12 @@
     shared_libs: [
         "libbase",
     ],
-    data_libs: ["libtest_simple_memtag_stack"],
+    data_libs: ["libtest_simple_memtag_stack", "libtest_depends_on_simple_memtag_stack"],
+    data_bins: [
+        "testbinary_depends_on_simple_memtag_stack",
+        "testbinary_depends_on_depends_on_simple_memtag_stack",
+        "testbinary_is_stack_mte_after_dlopen"
+    ],
     header_libs: ["bionic_libc_platform_headers"],
     test_suites: ["device-tests"],
 }
diff --git a/tests/dlext_test.cpp b/tests/dlext_test.cpp
index 6883da9..d078e50 100644
--- a/tests/dlext_test.cpp
+++ b/tests/dlext_test.cpp
@@ -31,7 +31,6 @@
 #include <android-base/test_utils.h>
 
 #include <sys/mman.h>
-#include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
 #include <sys/wait.h>
@@ -2047,11 +2046,6 @@
                                                              -1, 0));
   ASSERT_TRUE(reinterpret_cast<void*>(reserved_addr) != MAP_FAILED);
 
-  struct stat file_stat;
-  int ret = TEMP_FAILURE_RETRY(stat(private_library_absolute_path.c_str(), &file_stat));
-  ASSERT_EQ(ret, 0) << "Failed to stat library";
-  size_t file_size = file_stat.st_size;
-
   for (const auto& rec : maps_to_copy) {
     uintptr_t offset = rec.addr_start - addr_start;
     size_t size = rec.addr_end - rec.addr_start;
@@ -2059,13 +2053,7 @@
     void* map = mmap(addr, size, PROT_READ | PROT_WRITE,
                      MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
     ASSERT_TRUE(map != MAP_FAILED);
-    size_t seg_size = size;
-    // See comment on file map fault in ElfReader::LoadSegments()
-    // bionic/linker/linker_phdr.cpp
-    if (rec.offset + size > file_size) {
-      seg_size = file_size - rec.offset;
-    }
-    memcpy(map, reinterpret_cast<void*>(rec.addr_start), seg_size);
+    memcpy(map, reinterpret_cast<void*>(rec.addr_start), size);
     mprotect(map, size, rec.perms);
   }
 
diff --git a/tests/fdsan_test.cpp b/tests/fdsan_test.cpp
index 016970f..c1e926b 100644
--- a/tests/fdsan_test.cpp
+++ b/tests/fdsan_test.cpp
@@ -33,7 +33,6 @@
 #include <android-base/silent_death_test.h>
 #include <android-base/unique_fd.h>
 
-#define FDSAN_TEST(test_name) TEST_F(FdsanTest, test_name)
 #define EXPECT_FDSAN_DEATH(expression, regex)                                                \
   EXPECT_DEATH((android_fdsan_set_error_level(ANDROID_FDSAN_ERROR_LEVEL_FATAL), expression), \
                (regex))
diff --git a/tests/grp_pwd_test.cpp b/tests/grp_pwd_test.cpp
index d3acf03..16b8d5a 100644
--- a/tests/grp_pwd_test.cpp
+++ b/tests/grp_pwd_test.cpp
@@ -851,6 +851,11 @@
 #endif
 }
 
+TEST(grp, initgroups) {
+  if (getuid() != 0) GTEST_SKIP() << "test requires root";
+  ASSERT_EQ(0, initgroups("root", 0));
+}
+
 #if defined(__BIONIC__)
 static void TestAidNamePrefix(const std::string& file_path) {
   std::string file_contents;
diff --git a/tests/libs/Android.bp b/tests/libs/Android.bp
index 68efbd9..06ee132 100644
--- a/tests/libs/Android.bp
+++ b/tests/libs/Android.bp
@@ -234,7 +234,7 @@
 }
 
 // -----------------------------------------------------------------------------
-// Library used by memtag_stack_dlopen_test tests
+// Libraries and binaries used by memtag_stack_dlopen_test tests
 // -----------------------------------------------------------------------------
 cc_test_library {
     name: "libtest_simple_memtag_stack",
@@ -244,6 +244,50 @@
     srcs: ["dlopen_testlib_simple.cpp"],
 }
 
+cc_test_library {
+    name: "libtest_depends_on_simple_memtag_stack",
+    sanitize: {
+        memtag_stack: false,
+    },
+    shared_libs: [
+        "libtest_simple_memtag_stack",
+    ],
+    srcs: ["dlopen_testlib_depends_on_simple.cpp"],
+}
+
+cc_binary {
+    name: "testbinary_is_stack_mte_after_dlopen",
+    sanitize: {
+        memtag_stack: false,
+        memtag_heap: true,
+    },
+    srcs: ["testbinary_is_stack_mte_after_dlopen.cpp"],
+}
+
+cc_binary {
+    name: "testbinary_depends_on_simple_memtag_stack",
+    sanitize: {
+        memtag_stack: false,
+        memtag_heap: true,
+    },
+    shared_libs: [
+        "libtest_simple_memtag_stack",
+    ],
+    srcs: ["testbinary_is_stack_mte.cpp"],
+}
+
+cc_binary {
+    name: "testbinary_depends_on_depends_on_simple_memtag_stack",
+    sanitize: {
+        memtag_stack: false,
+        memtag_heap: true,
+    },
+    shared_libs: [
+        "libtest_depends_on_simple_memtag_stack",
+    ],
+    srcs: ["testbinary_is_stack_mte.cpp"],
+}
+
 // -----------------------------------------------------------------------------
 // Libraries used by hwasan_test
 // -----------------------------------------------------------------------------
diff --git a/tests/libs/libs_utils.h b/tests/libs/CHECK.h
similarity index 71%
rename from tests/libs/libs_utils.h
rename to tests/libs/CHECK.h
index 7dae241..2575d5b 100644
--- a/tests/libs/libs_utils.h
+++ b/tests/libs/CHECK.h
@@ -14,14 +14,13 @@
  * limitations under the License.
  */
 
-#ifndef LIBS_UTILS_H
-#define LIBS_UTILS_H
+#pragma once
+
+// Tests proper can use libbase, but libraries for testing dlopen()
+// should probably avoid dependencies other than ones we're specifically
+// trying to test.
 
 #include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
 
 #define CHECK(e) \
-    ((e) ? static_cast<void>(0) : __assert2(__FILE__, __LINE__, __PRETTY_FUNCTION__, #e))
-
-#endif  // LIBS_UTILS_H
+  ((e) ? static_cast<void>(0) : __assert2(__FILE__, __LINE__, __PRETTY_FUNCTION__, #e))
diff --git a/tests/libs/cfi_test_helper.cpp b/tests/libs/cfi_test_helper.cpp
index c1a7b6d..71cdc89 100644
--- a/tests/libs/cfi_test_helper.cpp
+++ b/tests/libs/cfi_test_helper.cpp
@@ -14,11 +14,10 @@
  * limitations under the License.
  */
 
-#include <assert.h>
 #include <stdint.h>
 #include <stdlib.h>
 
-#include "libs_utils.h"
+#include "CHECK.h"
 
 // This library is built for all targets, including host tests, so __cfi_slowpath may not be
 // present. But it is only used in the bionic loader tests.
diff --git a/tests/libs/cfi_test_helper2.cpp b/tests/libs/cfi_test_helper2.cpp
index 11a6036..d7cd495 100644
--- a/tests/libs/cfi_test_helper2.cpp
+++ b/tests/libs/cfi_test_helper2.cpp
@@ -14,10 +14,9 @@
  * limitations under the License.
  */
 
-#include <assert.h>
 #include <dlfcn.h>
 
-#include "libs_utils.h"
+#include "CHECK.h"
 
 int main(void) {
   void* handle;
diff --git a/tests/libs/dlopen_b.cpp b/tests/libs/dlopen_b.cpp
index 092c96c..5b36242 100644
--- a/tests/libs/dlopen_b.cpp
+++ b/tests/libs/dlopen_b.cpp
@@ -1,8 +1,9 @@
 #include <dlfcn.h>
-extern "C" void *dlopen_b() {
-  // Work around for http://b/20049306, which isn't going to be fixed.
-  static int defeat_sibling_call_optimization = 0;
 
+// Work around for http://b/20049306, which isn't going to be fixed.
+int defeat_sibling_call_optimization = 0;
+
+extern "C" void* dlopen_b() {
   // This is supposed to succeed because this library has DT_RUNPATH
   // for libtest_dt_runpath_x.so which should be taken into account
   // by dlopen.
diff --git a/tests/libs/libs_utils.h b/tests/libs/dlopen_testlib_depends_on_simple.cpp
similarity index 68%
copy from tests/libs/libs_utils.h
copy to tests/libs/dlopen_testlib_depends_on_simple.cpp
index 7dae241..9e130d4 100644
--- a/tests/libs/libs_utils.h
+++ b/tests/libs/dlopen_testlib_depends_on_simple.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 The Android Open Source Project
+ * Copyright (C) 2024 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,14 +14,11 @@
  * limitations under the License.
  */
 
-#ifndef LIBS_UTILS_H
-#define LIBS_UTILS_H
-
-#include <assert.h>
-#include <stdio.h>
+#include <stdint.h>
 #include <stdlib.h>
 
-#define CHECK(e) \
-    ((e) ? static_cast<void>(0) : __assert2(__FILE__, __LINE__, __PRETTY_FUNCTION__, #e))
+extern "C" bool dlopen_testlib_simple_func();
 
-#endif  // LIBS_UTILS_H
+extern "C" bool dlopen_testlib_call_simple_func() {
+  return dlopen_testlib_simple_func();
+}
diff --git a/tests/libs/preinit_getauxval_test_helper.cpp b/tests/libs/preinit_getauxval_test_helper.cpp
index 2a79b97..53d4cc9 100644
--- a/tests/libs/preinit_getauxval_test_helper.cpp
+++ b/tests/libs/preinit_getauxval_test_helper.cpp
@@ -19,7 +19,7 @@
 #include <unistd.h>
 #include <sys/auxv.h>
 
-#include "libs_utils.h"
+#include "CHECK.h"
 
 static unsigned long g_AT_RANDOM;
 static unsigned long g_AT_PAGESZ;
diff --git a/tests/libs/preinit_syscall_test_helper.cpp b/tests/libs/preinit_syscall_test_helper.cpp
index 9b6b6df..3ca8131 100644
--- a/tests/libs/preinit_syscall_test_helper.cpp
+++ b/tests/libs/preinit_syscall_test_helper.cpp
@@ -19,7 +19,7 @@
 #include <unistd.h>
 #include <sys/auxv.h>
 
-#include "libs_utils.h"
+#include "CHECK.h"
 
 static ssize_t g_result;
 static int g_errno;
diff --git a/tests/libs/stack_tagging_helper.cpp b/tests/libs/stack_tagging_helper.cpp
index 7396dd0..e7e26af 100644
--- a/tests/libs/stack_tagging_helper.cpp
+++ b/tests/libs/stack_tagging_helper.cpp
@@ -28,7 +28,7 @@
 
 #include <bionic/malloc.h>
 
-#include "libs_utils.h"
+#include "CHECK.h"
 
 #if defined(__aarch64__)
 
diff --git a/tests/libs/libs_utils.h b/tests/libs/testbinary_is_stack_mte.cpp
similarity index 62%
copy from tests/libs/libs_utils.h
copy to tests/libs/testbinary_is_stack_mte.cpp
index 7dae241..8dde83c 100644
--- a/tests/libs/libs_utils.h
+++ b/tests/libs/testbinary_is_stack_mte.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 The Android Open Source Project
+ * Copyright (C) 2024 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,14 +14,25 @@
  * limitations under the License.
  */
 
-#ifndef LIBS_UTILS_H
-#define LIBS_UTILS_H
-
-#include <assert.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 
-#define CHECK(e) \
-    ((e) ? static_cast<void>(0) : __assert2(__FILE__, __LINE__, __PRETTY_FUNCTION__, #e))
+#include "../mte_utils.h"
+#include "CHECK.h"
 
-#endif  // LIBS_UTILS_H
+#if defined(__BIONIC__) && defined(__aarch64__)
+
+extern "C" int main(int, char**) {
+  const bool stack_mte_enabled = is_stack_mte_on();  // Sampled before anything is printed.
+  printf("RAN\n");
+  return stack_mte_enabled ? 0 : 1;  // Exit status 0 only when stack MTE is on.
+}
+
+#else
+
+extern "C" int main(int, char**) {
+  printf("RAN\n");
+  return 1;  // Not bionic on aarch64: always exit with a failure status.
+}
+#endif
diff --git a/tests/libs/testbinary_is_stack_mte_after_dlopen.cpp b/tests/libs/testbinary_is_stack_mte_after_dlopen.cpp
new file mode 100644
index 0000000..c5e6868
--- /dev/null
+++ b/tests/libs/testbinary_is_stack_mte_after_dlopen.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2024 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <condition_variable>
+#include <thread>
+
+#include <dlfcn.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "../mte_utils.h"
+#include "CHECK.h"
+
+#if defined(__BIONIC__) && defined(__aarch64__)
+
+enum State { kInit, kThreadStarted, kStackRemapped };
+
+// Finds the /proc/self/maps entry containing the current frame address and stores its bounds
+// in *low and *high; aborts if the file can't be opened or no entry matches. We can't use
+// pthread_getattr_np because that uses the rlimit rather than the actual mapping bounds.
+static void find_main_stack_limits(uintptr_t* low, uintptr_t* high) {
+  uintptr_t startstack = reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
+
+  // Hunt for the region that contains that address; maps entries are half-open, [lo, hi).
+  FILE* fp = fopen("/proc/self/maps", "re");
+  if (fp == nullptr) {
+    abort();
+  }
+  char line[BUFSIZ];
+  while (fgets(line, sizeof(line), fp) != nullptr) {
+    uintptr_t lo, hi;
+    if (sscanf(line, "%" SCNxPTR "-%" SCNxPTR, &lo, &hi) != 2) continue;
+    if (lo <= startstack && startstack < hi) {
+      *low = lo;
+      *high = hi;
+      fclose(fp);
+      return;
+    }
+  }
+  abort();
+}
+
+// Recurses (>= 4 KiB of stack per frame) until the stack mapping starts below |low|, then runs f.
+template <typename Fn>
+uintptr_t fault_new_stack_page(uintptr_t low, Fn f) {
+  uintptr_t new_low, new_high;
+  volatile char buf[4096];
+  buf[4095] = 1;
+  find_main_stack_limits(&new_low, &new_high);
+  if (new_low < low) {
+    f();
+    return new_high;
+  }
+  // The sum is meaningless, but the pending addition should defeat tail-call optimization.
+  return new_low + fault_new_stack_page(low, f);
+}
+extern "C" int main(int argc, char** argv) {
+  if (argc < 2) {
+    return 1;  // argv[1] must be the path of the library to dlopen().
+  }
+  const char* path = argv[1];
+  CHECK(access(path, F_OK) == 0);  // Verify test setup.
+  CHECK(!is_stack_mte_on());  // Precondition: stack MTE is off before the dlopen().
+  std::mutex m;
+  std::condition_variable cv;
+  State state = kInit;  // Guarded by m; changes are signalled through cv.
+
+  bool is_early_thread_mte_on = false;
+  std::thread early_th([&] {  // Thread that already exists when dlopen() is called.
+    {
+      std::lock_guard lk(m);
+      state = kThreadStarted;
+    }
+    cv.notify_one();
+    {
+      std::unique_lock lk(m);
+      cv.wait(lk, [&] { return state == kStackRemapped; });  // Until main's dlopen() is done.
+    }
+    is_early_thread_mte_on = is_stack_mte_on();
+  });
+  {
+    std::unique_lock lk(m);
+    cv.wait(lk, [&] { return state == kThreadStarted; });  // Wait for early_th to start.
+  }
+  void* handle = dlopen(path, RTLD_NOW);
+  {
+    std::lock_guard lk(m);
+    state = kStackRemapped;
+  }
+  cv.notify_one();
+  CHECK(handle != nullptr);
+  CHECK(is_stack_mte_on());  // Stack MTE must be on after the dlopen().
+
+  bool new_stack_page_mte_on = false;  // Set by the callback passed to fault_new_stack_page().
+  uintptr_t low;
+  uintptr_t high;
+  find_main_stack_limits(&low, &high);
+  fault_new_stack_page(low, [&] { new_stack_page_mte_on = is_stack_mte_on(); });
+  CHECK(new_stack_page_mte_on);
+
+  bool is_late_thread_mte_on = false;  // Set by a thread created after the dlopen().
+  std::thread late_th([&] { is_late_thread_mte_on = is_stack_mte_on(); });
+  late_th.join();
+  early_th.join();
+  CHECK(is_late_thread_mte_on);
+  CHECK(is_early_thread_mte_on);
+  printf("RAN\n");
+  return 0;
+}
+
+#else
+extern "C" int main(int, char**) {
+  return 1;  // Not bionic on aarch64: always exit with a failure status.
+}
+#endif
diff --git a/tests/memtag_stack_dlopen_test.cpp b/tests/memtag_stack_dlopen_test.cpp
index 308af1e..68ddb81 100644
--- a/tests/memtag_stack_dlopen_test.cpp
+++ b/tests/memtag_stack_dlopen_test.cpp
@@ -35,113 +35,82 @@
 
 #include <android-base/silent_death_test.h>
 #include <android-base/test_utils.h>
+#include "mte_utils.h"
 #include "utils.h"
 
+TEST(MemtagStackDlopenTest, DependentBinaryGetsMemtagStack) {  // Runs a helper binary in a child.
 #if defined(__BIONIC__) && defined(__aarch64__)
-__attribute__((target("mte"))) bool is_stack_mte_on() {
-  alignas(16) int x = 0;
-  void* p = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(&x) + (1UL << 57));
-  void* p_cpy = p;
-  __builtin_arm_stg(p);
-  p = __builtin_arm_ldg(p);
-  __builtin_arm_stg(&x);
-  return p == p_cpy;
-}
+  if (!running_with_mte()) GTEST_SKIP() << "Test requires MTE.";
+  if (is_stack_mte_on())  // Skip when this process already reports stack MTE.
+    GTEST_SKIP() << "Stack MTE needs to be off for this test. Are you running fullmte?";
 
-// We can't use pthread_getattr_np because that uses the rlimit rather than the actual mapping
-// bounds.
-static void find_main_stack_limits(uintptr_t* low, uintptr_t* high) {
-  uintptr_t startstack = reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
-
-  // Hunt for the region that contains that address.
-  FILE* fp = fopen("/proc/self/maps", "re");
-  if (fp == nullptr) {
-    abort();
-  }
-  char line[BUFSIZ];
-  while (fgets(line, sizeof(line), fp) != nullptr) {
-    uintptr_t lo, hi;
-    if (sscanf(line, "%" SCNxPTR "-%" SCNxPTR, &lo, &hi) == 2) {
-      if (lo <= startstack && startstack <= hi) {
-        *low = lo;
-        *high = hi;
-        fclose(fp);
-        return;
-      }
-    }
-  }
-  abort();
-}
-
-template <typename Fn>
-unsigned int fault_new_stack_page(uintptr_t low, Fn f) {
-  uintptr_t new_low;
-  uintptr_t new_high;
-  volatile char buf[4096];
-  buf[4095] = 1;
-  find_main_stack_limits(&new_low, &new_high);
-  if (new_low < low) {
-    f();
-    return new_high;
-  }
-  // Useless, but should defeat TCO.
-  return new_low + fault_new_stack_page(low, f);
-}
-
+  std::string path =  // Helper binary located next to this test executable.
+      android::base::GetExecutableDirectory() + "/testbinary_depends_on_simple_memtag_stack";
+  ExecTestHelper eth;  // NOTE(review): 0 and RAN passed to Run() are presumably the expected status/output.
+  std::string ld_library_path = "LD_LIBRARY_PATH=" + android::base::GetExecutableDirectory();
+  eth.SetArgs({path.c_str(), nullptr});  // argv holds only the binary path.
+  eth.SetEnv({ld_library_path.c_str(), nullptr});  // LD_LIBRARY_PATH is the only variable passed.
+  eth.Run([&]() { execve(path.c_str(), eth.GetArgs(), eth.GetEnv()); }, 0, "RAN");
+#else
+  GTEST_SKIP() << "requires bionic arm64";
 #endif
+}
 
-enum State { kInit, kThreadStarted, kStackRemapped };
+TEST(MemtagStackDlopenTest, DependentBinaryGetsMemtagStack2) {  // Runs a helper binary in a child.
+#if defined(__BIONIC__) && defined(__aarch64__)
+  if (!running_with_mte()) GTEST_SKIP() << "Test requires MTE.";
+  if (is_stack_mte_on())  // Skip when this process already reports stack MTE.
+    GTEST_SKIP() << "Stack MTE needs to be off for this test. Are you running fullmte?";
+
+  std::string path = android::base::GetExecutableDirectory() +  // Helper binary next to this executable.
+                     "/testbinary_depends_on_depends_on_simple_memtag_stack";
+  ExecTestHelper eth;  // NOTE(review): 0 and RAN passed to Run() are presumably the expected status/output.
+  std::string ld_library_path = "LD_LIBRARY_PATH=" + android::base::GetExecutableDirectory();
+  eth.SetArgs({path.c_str(), nullptr});  // argv holds only the binary path.
+  eth.SetEnv({ld_library_path.c_str(), nullptr});  // LD_LIBRARY_PATH is the only variable passed.
+  eth.Run([&]() { execve(path.c_str(), eth.GetArgs(), eth.GetEnv()); }, 0, "RAN");
+#else
+  GTEST_SKIP() << "requires bionic arm64";
+#endif
+}
 
 TEST(MemtagStackDlopenTest, DlopenRemapsStack) {
 #if defined(__BIONIC__) && defined(__aarch64__)
+  // If this test fails, check the crash output in logcat to see why the test binary died.
   if (!running_with_mte()) GTEST_SKIP() << "Test requires MTE.";
+  if (is_stack_mte_on())  // Skip when this process already reports stack MTE.
+    GTEST_SKIP() << "Stack MTE needs to be off for this test. Are you running fullmte?";
 
-  std::string path = android::base::GetExecutableDirectory() + "/libtest_simple_memtag_stack.so";
-  ASSERT_EQ(0, access(path.c_str(), F_OK));  // Verify test setup.
-  EXPECT_FALSE(is_stack_mte_on());
-  std::mutex m;
-  std::condition_variable cv;
-  State state = kInit;
+  std::string path =  // Helper binary located next to this test executable.
+      android::base::GetExecutableDirectory() + "/testbinary_is_stack_mte_after_dlopen";
+  std::string lib_path =  // Handed to the helper binary as its second argv entry.
+      android::base::GetExecutableDirectory() + "/libtest_simple_memtag_stack.so";
+  ExecTestHelper eth;  // NOTE(review): 0 and RAN passed to Run() are presumably the expected status/output.
+  std::string ld_library_path = "LD_LIBRARY_PATH=" + android::base::GetExecutableDirectory();
+  eth.SetArgs({path.c_str(), lib_path.c_str(), nullptr});
+  eth.SetEnv({ld_library_path.c_str(), nullptr});  // LD_LIBRARY_PATH is the only variable passed.
+  eth.Run([&]() { execve(path.c_str(), eth.GetArgs(), eth.GetEnv()); }, 0, "RAN");
+#else
+  GTEST_SKIP() << "requires bionic arm64";
+#endif
+}
 
-  bool is_early_thread_mte_on = false;
-  std::thread early_th([&] {
-    {
-      std::lock_guard lk(m);
-      state = kThreadStarted;
-    }
-    cv.notify_one();
-    {
-      std::unique_lock lk(m);
-      cv.wait(lk, [&] { return state == kStackRemapped; });
-    }
-    is_early_thread_mte_on = is_stack_mte_on();
-  });
-  {
-    std::unique_lock lk(m);
-    cv.wait(lk, [&] { return state == kThreadStarted; });
-  }
-  void* handle = dlopen(path.c_str(), RTLD_NOW);
-  {
-    std::lock_guard lk(m);
-    state = kStackRemapped;
-  }
-  cv.notify_one();
-  ASSERT_NE(handle, nullptr);
-  EXPECT_TRUE(is_stack_mte_on());
+TEST(MemtagStackDlopenTest, DlopenRemapsStack2) {
+#if defined(__BIONIC__) && defined(__aarch64__)
+  // If this test is failing, look at crash logcat for why the test binary died.
+  if (!running_with_mte()) GTEST_SKIP() << "Test requires MTE.";
+  if (is_stack_mte_on())
+    GTEST_SKIP() << "Stack MTE needs to be off for this test. Are you running fullmte?";
 
-  bool new_stack_page_mte_on = false;
-  uintptr_t low;
-  uintptr_t high;
-  find_main_stack_limits(&low, &high);
-  fault_new_stack_page(low, [&] { new_stack_page_mte_on = is_stack_mte_on(); });
-  EXPECT_TRUE(new_stack_page_mte_on);
-
-  bool is_late_thread_mte_on = false;
-  std::thread late_th([&] { is_late_thread_mte_on = is_stack_mte_on(); });
-  late_th.join();
-  early_th.join();
-  EXPECT_TRUE(is_early_thread_mte_on);
-  EXPECT_TRUE(is_late_thread_mte_on);
+  std::string path =
+      android::base::GetExecutableDirectory() + "/testbinary_is_stack_mte_after_dlopen";
+  std::string lib_path =
+      android::base::GetExecutableDirectory() + "/libtest_depends_on_simple_memtag_stack.so";
+  ExecTestHelper eth;
+  std::string ld_library_path = "LD_LIBRARY_PATH=" + android::base::GetExecutableDirectory();
+  eth.SetArgs({path.c_str(), lib_path.c_str(), nullptr});
+  eth.SetEnv({ld_library_path.c_str(), nullptr});
+  eth.Run([&]() { execve(path.c_str(), eth.GetArgs(), eth.GetEnv()); }, 0, "RAN");
 #else
   GTEST_SKIP() << "requires bionic arm64";
 #endif
diff --git a/tests/mte_utils.h b/tests/mte_utils.h
new file mode 100644
index 0000000..0f18442
--- /dev/null
+++ b/tests/mte_utils.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2024 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if defined(__BIONIC__) && defined(__aarch64__)
+
+__attribute__((target("mte"))) static bool is_stack_mte_on() {  // True if a changed pointer survives stg + ldg.
+  alignas(16) int x = 0;  // Scratch slot on the calling thread's stack, 16-byte aligned.
+  void* p = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(&x) + (1UL << 57));
+  void* p_cpy = p;  // p is &x plus 1 << 57; presumably that changes the pointer's MTE tag (confirm).
+  __builtin_arm_stg(p);  // Presumably stores p's tag as the slot's allocation tag (see ACLE; confirm).
+  p = __builtin_arm_ldg(p);  // Presumably reloads the slot's allocation tag into p (confirm).
+  __builtin_arm_stg(&x);  // Same builtin applied to the unmodified &x; presumably restores x's tag.
+  return p == p_cpy;  // Equal only when ldg handed back the same pointer value that was stored.
+}
+
+#endif
diff --git a/tests/unistd_test.cpp b/tests/unistd_test.cpp
index e9a3080..88f5851 100644
--- a/tests/unistd_test.cpp
+++ b/tests/unistd_test.cpp
@@ -968,7 +968,7 @@
   VERIFY_SYSCONF_POSIX_VERSION(_SC_CPUTIME);
   VERIFY_SYSCONF_POSITIVE(_SC_EXPR_NEST_MAX);
   VERIFY_SYSCONF_POSITIVE(_SC_LINE_MAX);
-  VERIFY_SYSCONF_POSITIVE(_SC_NGROUPS_MAX);
+  VerifySysconf(_SC_NGROUPS_MAX, "_SC_NGROUPS_MAX", [](long v){return v >= 0 && v <= NGROUPS_MAX;});
   VERIFY_SYSCONF_POSITIVE(_SC_OPEN_MAX);
   VERIFY_SYSCONF_POSITIVE(_SC_PASS_MAX);
   VERIFY_SYSCONF_POSIX_VERSION(_SC_2_C_BIND);