Merge "libc/Android.bp: merge libc_openbsd_ndk." into main
diff --git a/libc/bionic/heap_tagging.cpp b/libc/bionic/heap_tagging.cpp
index 48ec955..0c1e506 100644
--- a/libc/bionic/heap_tagging.cpp
+++ b/libc/bionic/heap_tagging.cpp
@@ -57,6 +57,7 @@
         break;
       case M_HEAP_TAGGING_LEVEL_SYNC:
       case M_HEAP_TAGGING_LEVEL_ASYNC:
+        atomic_store(&globals->memtag, true);
         atomic_store(&globals->memtag_stack, __libc_shared_globals()->initial_memtag_stack);
         break;
       default:
@@ -113,6 +114,7 @@
           globals->heap_pointer_tag = static_cast<uintptr_t>(0xffull << UNTAG_SHIFT);
         }
         atomic_store(&globals->memtag_stack, false);
+        atomic_store(&globals->memtag, false);
       });
 
       if (heap_tagging_level != M_HEAP_TAGGING_LEVEL_TBI) {
diff --git a/libc/bionic/libc_init_dynamic.cpp b/libc/bionic/libc_init_dynamic.cpp
index c61810e..295484b 100644
--- a/libc/bionic/libc_init_dynamic.cpp
+++ b/libc/bionic/libc_init_dynamic.cpp
@@ -39,11 +39,12 @@
  *   all dynamic linking has been performed.
  */
 
+#include <elf.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdint.h>
-#include <elf.h>
+#include "bionic/pthread_internal.h"
 #include "libc_init_common.h"
 
 #include "private/bionic_defs.h"
@@ -59,6 +60,10 @@
   extern int __cxa_atexit(void (*)(void *), void *, void *);
 };
 
+void memtag_stack_dlopen_callback() {
+  __pthread_internal_remap_stack_with_mte();
+}
+
 // Use an initializer so __libc_sysinfo will have a fallback implementation
 // while .preinit_array constructors run.
 #if defined(__i386__)
@@ -156,6 +161,10 @@
 
   __libc_init_mte_late();
 
+  // This roundabout way is needed so we don't use the copy of this function in the static libc
+  // linked into the linker, whose globals would not affect the process.
+  __libc_shared_globals()->memtag_stack_dlopen_callback = memtag_stack_dlopen_callback;
+
   exit(slingshot(args.argc - __libc_shared_globals()->initial_linker_arg_count,
                  args.argv + __libc_shared_globals()->initial_linker_arg_count,
                  args.envp));
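The indirection here matters because the linker is statically linked against its own private copy
of libc: calling __pthread_internal_remap_stack_with_mte() directly from dlopen() would only walk
the linker's thread list and globals, not the application's. Publishing the function pointer
through __libc_shared_globals() lets the linker call into the libc.so copy instead; the linker
side (see the linker.cpp hunk further down in this change) reduces to:

    if (auto* cb = __libc_shared_globals()->memtag_stack_dlopen_callback) {
      cb();  // runs memtag_stack_dlopen_callback inside the application's libc.so
    }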
diff --git a/libc/bionic/pthread_attr.cpp b/libc/bionic/pthread_attr.cpp
index de4cc9e..f6c0401 100644
--- a/libc/bionic/pthread_attr.cpp
+++ b/libc/bionic/pthread_attr.cpp
@@ -155,36 +155,6 @@
   return 0;
 }
 
-static uintptr_t __get_main_stack_startstack() {
-  FILE* fp = fopen("/proc/self/stat", "re");
-  if (fp == nullptr) {
-    async_safe_fatal("couldn't open /proc/self/stat: %m");
-  }
-
-  char line[BUFSIZ];
-  if (fgets(line, sizeof(line), fp) == nullptr) {
-    async_safe_fatal("couldn't read /proc/self/stat: %m");
-  }
-
-  fclose(fp);
-
-  // See man 5 proc. There's no reason comm can't contain ' ' or ')',
-  // so we search backwards for the end of it. We're looking for this field:
-  //
-  //  startstack %lu (28) The address of the start (i.e., bottom) of the stack.
-  uintptr_t startstack = 0;
-  const char* end_of_comm = strrchr(line, ')');
-  if (sscanf(end_of_comm + 1, " %*c "
-             "%*d %*d %*d %*d %*d "
-             "%*u %*u %*u %*u %*u %*u %*u "
-             "%*d %*d %*d %*d %*d %*d "
-             "%*u %*u %*d %*u %*u %*u %" SCNuPTR, &startstack) != 1) {
-    async_safe_fatal("couldn't parse /proc/self/stat");
-  }
-
-  return startstack;
-}
-
 static int __pthread_attr_getstack_main_thread(void** stack_base, size_t* stack_size) {
   ErrnoRestorer errno_restorer;
 
@@ -198,28 +168,11 @@
   if (stack_limit.rlim_cur == RLIM_INFINITY) {
     stack_limit.rlim_cur = 8 * 1024 * 1024;
   }
-
-  // Ask the kernel where our main thread's stack started.
-  uintptr_t startstack = __get_main_stack_startstack();
-
-  // Hunt for the region that contains that address.
-  FILE* fp = fopen("/proc/self/maps", "re");
-  if (fp == nullptr) {
-    async_safe_fatal("couldn't open /proc/self/maps: %m");
-  }
-  char line[BUFSIZ];
-  while (fgets(line, sizeof(line), fp) != nullptr) {
-    uintptr_t lo, hi;
-    if (sscanf(line, "%" SCNxPTR "-%" SCNxPTR, &lo, &hi) == 2) {
-      if (lo <= startstack && startstack <= hi) {
-        *stack_size = stack_limit.rlim_cur;
-        *stack_base = reinterpret_cast<void*>(hi - *stack_size);
-        fclose(fp);
-        return 0;
-      }
-    }
-  }
-  async_safe_fatal("stack not found in /proc/self/maps");
+  uintptr_t lo, hi;
+  __find_main_stack_limits(&lo, &hi);
+  *stack_size = stack_limit.rlim_cur;
+  *stack_base = reinterpret_cast<void*>(hi - *stack_size);
+  return 0;
 }
 
 __BIONIC_WEAK_FOR_NATIVE_BRIDGE
diff --git a/libc/bionic/pthread_internal.cpp b/libc/bionic/pthread_internal.cpp
index 6a7ee2f..bfe2f98 100644
--- a/libc/bionic/pthread_internal.cpp
+++ b/libc/bionic/pthread_internal.cpp
@@ -40,6 +40,7 @@
 #include "private/ErrnoRestorer.h"
 #include "private/ScopedRWLock.h"
 #include "private/bionic_futex.h"
+#include "private/bionic_globals.h"
 #include "private/bionic_tls.h"
 
 static pthread_internal_t* g_thread_list = nullptr;
@@ -119,6 +120,89 @@
   return nullptr;
 }
 
+static uintptr_t __get_main_stack_startstack() {
+  FILE* fp = fopen("/proc/self/stat", "re");
+  if (fp == nullptr) {
+    async_safe_fatal("couldn't open /proc/self/stat: %m");
+  }
+
+  char line[BUFSIZ];
+  if (fgets(line, sizeof(line), fp) == nullptr) {
+    async_safe_fatal("couldn't read /proc/self/stat: %m");
+  }
+
+  fclose(fp);
+
+  // See man 5 proc. There's no reason comm can't contain ' ' or ')',
+  // so we search backwards for the end of it. We're looking for this field:
+  //
+  //  startstack %lu (28) The address of the start (i.e., bottom) of the stack.
+  uintptr_t startstack = 0;
+  const char* end_of_comm = strrchr(line, ')');
+  if (sscanf(end_of_comm + 1,
+             " %*c "
+             "%*d %*d %*d %*d %*d "
+             "%*u %*u %*u %*u %*u %*u %*u "
+             "%*d %*d %*d %*d %*d %*d "
+             "%*u %*u %*d %*u %*u %*u %" SCNuPTR,
+             &startstack) != 1) {
+    async_safe_fatal("couldn't parse /proc/self/stat");
+  }
+
+  return startstack;
+}
+
+void __find_main_stack_limits(uintptr_t* low, uintptr_t* high) {
+  // Ask the kernel where our main thread's stack started.
+  uintptr_t startstack = __get_main_stack_startstack();
+
+  // Hunt for the region that contains that address.
+  FILE* fp = fopen("/proc/self/maps", "re");
+  if (fp == nullptr) {
+    async_safe_fatal("couldn't open /proc/self/maps: %m");
+  }
+  char line[BUFSIZ];
+  while (fgets(line, sizeof(line), fp) != nullptr) {
+    uintptr_t lo, hi;
+    if (sscanf(line, "%" SCNxPTR "-%" SCNxPTR, &lo, &hi) == 2) {
+      if (lo <= startstack && startstack <= hi) {
+        *low = lo;
+        *high = hi;
+        fclose(fp);
+        return;
+      }
+    }
+  }
+  async_safe_fatal("stack not found in /proc/self/maps");
+}
+
+void __pthread_internal_remap_stack_with_mte() {
+#if defined(__aarch64__)
+  // If the process doesn't have MTE enabled, we don't need to do anything.
+  if (!__libc_globals->memtag) return;
+  bool prev = true;
+  __libc_globals.mutate(
+      [&prev](libc_globals* globals) { prev = atomic_exchange(&globals->memtag_stack, true); });
+  if (prev) return;
+  uintptr_t lo, hi;
+  __find_main_stack_limits(&lo, &hi);
+
+  if (mprotect(reinterpret_cast<void*>(lo), hi - lo,
+               PROT_READ | PROT_WRITE | PROT_MTE | PROT_GROWSDOWN)) {
+    async_safe_fatal("error: failed to set PROT_MTE on main thread");
+  }
+  ScopedWriteLock creation_locker(&g_thread_creation_lock);
+  ScopedReadLock list_locker(&g_thread_list_lock);
+  for (pthread_internal_t* t = g_thread_list; t != nullptr; t = t->next) {
+    if (t->terminating || t->is_main()) continue;
+    if (mprotect(t->mmap_base_unguarded, t->mmap_size_unguarded,
+                 PROT_READ | PROT_WRITE | PROT_MTE)) {
+      async_safe_fatal("error: failed to set PROT_MTE on thread: %d", t->tid);
+    }
+  }
+#endif
+}
+
 bool android_run_on_all_threads(bool (*func)(void*), void* arg) {
   // Take the locks in this order to avoid inversion (pthread_create ->
   // __pthread_internal_add).
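For reference, the conversions in the sscanf format above map to these proc(5) fields (comm is
field 2; parsing resumes right after its closing ')'):

    %*c                        -> (3)  state
    %*d x5                     -> (4)  ppid   .. (8)  tpgid
    %*u x7                     -> (9)  flags  .. (15) stime
    %*d x6                     -> (16) cutime .. (21) itrealvalue
    %*u %*u %*d %*u %*u %*u    -> (22) starttime .. (27) endcode
    %" SCNuPTR "               -> (28) startstack, the bottom of the main thread's stack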
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 3b9e6a4..091f711 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -178,6 +178,7 @@
   bionic_tls* bionic_tls;
 
   int errno_value;
+  bool is_main() { return start_routine == nullptr; }
 };
 
 struct ThreadMapping {
@@ -207,6 +208,7 @@
 __LIBC_HIDDEN__ pid_t __pthread_internal_gettid(pthread_t pthread_id, const char* caller);
 __LIBC_HIDDEN__ void __pthread_internal_remove(pthread_internal_t* thread);
 __LIBC_HIDDEN__ void __pthread_internal_remove_and_free(pthread_internal_t* thread);
+__LIBC_HIDDEN__ void __find_main_stack_limits(uintptr_t* low, uintptr_t* high);
 
 static inline __always_inline bionic_tcb* __get_bionic_tcb() {
   return reinterpret_cast<bionic_tcb*>(&__get_tls()[MIN_TLS_SLOT]);
@@ -266,6 +268,9 @@
 __LIBC_HIDDEN__ extern void __bionic_atfork_run_child();
 __LIBC_HIDDEN__ extern void __bionic_atfork_run_parent();
 
+// Re-map the stacks of all existing threads, and of subsequently launched threads, with PROT_MTE.
+__LIBC_HIDDEN__ void __pthread_internal_remap_stack_with_mte();
+
 extern "C" bool android_run_on_all_threads(bool (*func)(void*), void* arg);
 
 extern pthread_rwlock_t g_thread_creation_lock;
diff --git a/libc/private/bionic_globals.h b/libc/private/bionic_globals.h
index 8ea7d4d..08a61f0 100644
--- a/libc/private/bionic_globals.h
+++ b/libc/private/bionic_globals.h
@@ -50,6 +50,7 @@
   uintptr_t heap_pointer_tag;
   _Atomic(bool) memtag_stack;
   _Atomic(bool) decay_time_enabled;
+  _Atomic(bool) memtag;
 
   // In order to allow a complete switch between dispatch tables without
   // the need for copying each function by function in the structure,
@@ -135,6 +136,8 @@
   HeapTaggingLevel initial_heap_tagging_level = M_HEAP_TAGGING_LEVEL_NONE;
   bool initial_memtag_stack = false;
   int64_t heap_tagging_upgrade_timer_sec = 0;
+
+  void (*memtag_stack_dlopen_callback)() = nullptr;
 };
 
 __LIBC_HIDDEN__ libc_shared_globals* __libc_shared_globals();
diff --git a/linker/linker.cpp b/linker/linker.cpp
index a12388c..60c8e31 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -2213,6 +2213,14 @@
   loading_trace.End();
 
   if (si != nullptr) {
+    if (si->has_min_version(7) && si->memtag_stack()) {
+      LD_LOG(kLogDlopen, "... dlopen enabling MTE for: realpath=\"%s\", soname=\"%s\"",
+             si->get_realpath(), si->get_soname());
+      if (auto* cb = __libc_shared_globals()->memtag_stack_dlopen_callback) {
+        cb();
+      }
+    }
+
     void* handle = si->to_handle();
     LD_LOG(kLogDlopen,
            "... dlopen calling constructors: realpath=\"%s\", soname=\"%s\", handle=%p",
@@ -3346,7 +3354,7 @@
                               "\"%s\" has text relocations",
                               get_realpath());
     add_dlwarning(get_realpath(), "text relocations");
-    if (phdr_table_unprotect_segments(phdr, phnum, load_bias) < 0) {
+    if (phdr_table_unprotect_segments(phdr, phnum, load_bias, should_pad_segments_) < 0) {
       DL_ERR("can't unprotect loadable segments for \"%s\": %s", get_realpath(), strerror(errno));
       return false;
     }
@@ -3362,7 +3370,7 @@
 #if !defined(__LP64__)
   if (has_text_relocations) {
     // All relocations are done, we can protect our segments back to read-only.
-    if (phdr_table_protect_segments(phdr, phnum, load_bias) < 0) {
+    if (phdr_table_protect_segments(phdr, phnum, load_bias, should_pad_segments_) < 0) {
       DL_ERR("can't protect segments for \"%s\": %s",
              get_realpath(), strerror(errno));
       return false;
@@ -3400,7 +3408,7 @@
 }
 
 bool soinfo::protect_relro() {
-  if (phdr_table_protect_gnu_relro(phdr, phnum, load_bias) < 0) {
+  if (phdr_table_protect_gnu_relro(phdr, phnum, load_bias, should_pad_segments_) < 0) {
     DL_ERR("can't enable GNU RELRO protection for \"%s\": %s",
            get_realpath(), strerror(errno));
     return false;
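In effect, a process that is already running with MTE (heap tagging SYNC or ASYNC) has its stacks
upgraded the first time it loads a library that asks for stack MTE. A hypothetical caller (the
library name below is made up; the real coverage is tests/memtag_stack_dlopen_test.cpp added later
in this change):

    // Before: stacks are plain PROT_READ|PROT_WRITE mappings.
    void* handle = dlopen("libneeds_stack_mte.so", RTLD_NOW);
    // After: if the library requested memtag_stack and the process has MTE enabled,
    // the linker has invoked memtag_stack_dlopen_callback, so the main stack and the
    // stacks of all existing and future threads are now mapped with PROT_MTE.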
diff --git a/linker/linker_main.cpp b/linker/linker_main.cpp
index 5f5eba4..018a5eb 100644
--- a/linker/linker_main.cpp
+++ b/linker/linker_main.cpp
@@ -201,6 +201,7 @@
   const ElfW(Phdr)* phdr;
   size_t phdr_count;
   ElfW(Addr) entry_point;
+  bool should_pad_segments;
 };
 
 static ExecutableInfo get_executable_info(const char* arg_path) {
@@ -293,6 +294,7 @@
   result.phdr = elf_reader.loaded_phdr();
   result.phdr_count = elf_reader.phdr_count();
   result.entry_point = elf_reader.entry_point();
+  result.should_pad_segments = elf_reader.should_pad_segments();
   return result;
 }
 
@@ -366,6 +368,7 @@
   somain = si;
   si->phdr = exe_info.phdr;
   si->phnum = exe_info.phdr_count;
+  si->set_should_pad_segments(exe_info.should_pad_segments);
   get_elf_base_from_phdr(si->phdr, si->phnum, &si->base, &si->load_bias);
   si->size = phdr_table_get_load_size(si->phdr, si->phnum);
   si->dynamic = nullptr;
@@ -399,7 +402,7 @@
     auto note_gnu_property = GnuPropertySection(somain);
     if (note_gnu_property.IsBTICompatible() &&
         (phdr_table_protect_segments(somain->phdr, somain->phnum, somain->load_bias,
-                                     &note_gnu_property) < 0)) {
+                                     somain->should_pad_segments(), &note_gnu_property) < 0)) {
       __linker_error("error: can't protect segments for \"%s\": %s", exe_info.path.c_str(),
                      strerror(errno));
     }
diff --git a/linker/linker_phdr.cpp b/linker/linker_phdr.cpp
index 8925e62..af0ef1d 100644
--- a/linker/linker_phdr.cpp
+++ b/linker/linker_phdr.cpp
@@ -196,7 +196,7 @@
     // For Armv8.5-A loaded executable segments may require PROT_BTI.
     if (note_gnu_property_.IsBTICompatible()) {
       did_load_ = (phdr_table_protect_segments(phdr_table_, phdr_num_, load_bias_,
-                                               &note_gnu_property_) == 0);
+                                               should_pad_segments_, &note_gnu_property_) == 0);
     }
 #endif
   }
@@ -748,6 +748,36 @@
   return true;
 }
 
+static inline void _extend_load_segment_vma(const ElfW(Phdr)* phdr_table, size_t phdr_count,
+                                             size_t phdr_idx, ElfW(Addr)* p_memsz,
+                                             ElfW(Addr)* p_filesz) {
+  const ElfW(Phdr)* phdr = &phdr_table[phdr_idx];
+  const ElfW(Phdr)* next = nullptr;
+  size_t next_idx = phdr_idx + 1;
+  if (next_idx < phdr_count && phdr_table[next_idx].p_type == PT_LOAD) {
+    next = &phdr_table[next_idx];
+  }
+
+  // If this is the last LOAD segment, or the segment ends in a .bss (p_memsz != p_filesz),
+  // no extension is needed.
+  if (!next || *p_memsz != *p_filesz) {
+    return;
+  }
+
+  ElfW(Addr) next_start = page_start(next->p_vaddr);
+  ElfW(Addr) curr_end = page_end(phdr->p_vaddr + *p_memsz);
+
+  // If adjacent segment mappings overlap, no extension is needed.
+  if (curr_end >= next_start) {
+    return;
+  }
+
+  // Extend the LOAD segment mapping to be contiguous with that of
+  // the next LOAD segment.
+  ElfW(Addr) extend = next_start - curr_end;
+  *p_memsz += extend;
+  *p_filesz += extend;
+}
+
 bool ElfReader::LoadSegments() {
   for (size_t i = 0; i < phdr_num_; ++i) {
     const ElfW(Phdr)* phdr = &phdr_table_[i];
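A worked example of _extend_load_segment_vma() with illustrative numbers (4 KiB runtime pages,
p_align = 16 KiB, so the extension path above is taken when the ELF was built with padded
segments):

    LOAD[0]: p_vaddr = 0x0000, p_filesz = p_memsz = 0x5000
    LOAD[1]: p_vaddr = 0x8000   (the next 16 KiB-aligned slot)

    curr_end   = page_end(0x0000 + 0x5000) = 0x5000
    next_start = page_start(0x8000)        = 0x8000
    extend     = 0x8000 - 0x5000           = 0x3000   (three 4 KiB pages)

LOAD[0]'s p_memsz/p_filesz are grown by 0x3000, so its mapping ends exactly where LOAD[1]'s
begins and no gap is left between the two VMAs.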
@@ -756,18 +786,24 @@
       continue;
     }
 
+    ElfW(Addr) p_memsz = phdr->p_memsz;
+    ElfW(Addr) p_filesz = phdr->p_filesz;
+    if (phdr->p_align > kPageSize && should_pad_segments_) {
+      _extend_load_segment_vma(phdr_table_, phdr_num_, i, &p_memsz, &p_filesz);
+    }
+
     // Segment addresses in memory.
     ElfW(Addr) seg_start = phdr->p_vaddr + load_bias_;
-    ElfW(Addr) seg_end   = seg_start + phdr->p_memsz;
+    ElfW(Addr) seg_end = seg_start + p_memsz;
 
     ElfW(Addr) seg_page_start = page_start(seg_start);
     ElfW(Addr) seg_page_end = page_end(seg_end);
 
-    ElfW(Addr) seg_file_end   = seg_start + phdr->p_filesz;
+    ElfW(Addr) seg_file_end = seg_start + p_filesz;
 
     // File offsets.
     ElfW(Addr) file_start = phdr->p_offset;
-    ElfW(Addr) file_end   = file_start + phdr->p_filesz;
+    ElfW(Addr) file_end = file_start + p_filesz;
 
     ElfW(Addr) file_page_start = page_start(file_start);
     ElfW(Addr) file_length = file_end - file_page_start;
@@ -777,12 +813,12 @@
       return false;
     }
 
-    if (file_end > static_cast<size_t>(file_size_)) {
+    if (file_start + phdr->p_filesz > static_cast<size_t>(file_size_)) {
       DL_ERR("invalid ELF file \"%s\" load segment[%zd]:"
           " p_offset (%p) + p_filesz (%p) ( = %p) past end of file (0x%" PRIx64 ")",
           name_.c_str(), i, reinterpret_cast<void*>(phdr->p_offset),
           reinterpret_cast<void*>(phdr->p_filesz),
-          reinterpret_cast<void*>(file_end), file_size_);
+          reinterpret_cast<void*>(file_start + phdr->p_filesz), file_size_);
       return false;
     }
 
@@ -822,8 +858,18 @@
 
     // if the segment is writable, and does not end on a page boundary,
     // zero-fill it until the page limit.
-    if ((phdr->p_flags & PF_W) != 0 && page_offset(seg_file_end) > 0) {
-      memset(reinterpret_cast<void*>(seg_file_end), 0, page_size() - page_offset(seg_file_end));
+    //
+    // The intention is to zero the partial page that may exist at the
+    // end of a file backed mapping. With the extended seg_file_end, this
+    // file offset as calculated from the mapping start can overrun the end
+    // of the file. However pages in that range cannot be touched by userspace
+    // because the kernel will not be able to handle a file map fault past the
+    // extent of the file. No need to try zeroing this untouchable region.
+    // Zero the partial page at the end of the original unextended seg_file_end.
+    ElfW(Addr) seg_file_end_orig = seg_start + phdr->p_filesz;
+    if ((phdr->p_flags & PF_W) != 0 && page_offset(seg_file_end_orig) > 0) {
+      memset(reinterpret_cast<void*>(seg_file_end_orig), 0,
+             kPageSize - page_offset(seg_file_end_orig));
     }
 
     seg_file_end = page_end(seg_file_end);
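A small illustration of why the zero-fill above uses the original, unextended end (hypothetical
numbers): with p_offset = 0x1000 and p_filesz = 0x1800 the segment's file data ends at file
offset 0x2800; after a 0x3000 extension the mapping's seg_file_end corresponds to file offset
0x5800. If the file ends before 0x5800, the kernel cannot satisfy a file-backed fault for those
trailing pages, so they can never be touched and there is nothing to zero there; only the partial
page right after the original 0x2800 boundary needs zero-filling.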
@@ -856,17 +902,23 @@
  * phdr_table_protect_segments and phdr_table_unprotect_segments.
  */
 static int _phdr_table_set_load_prot(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                     ElfW(Addr) load_bias, int extra_prot_flags) {
-  const ElfW(Phdr)* phdr = phdr_table;
-  const ElfW(Phdr)* phdr_limit = phdr + phdr_count;
+                                     ElfW(Addr) load_bias, int extra_prot_flags,
+                                     bool should_pad_segments) {
+  for (size_t i = 0; i < phdr_count; ++i) {
+    const ElfW(Phdr)* phdr = &phdr_table[i];
 
-  for (; phdr < phdr_limit; phdr++) {
     if (phdr->p_type != PT_LOAD || (phdr->p_flags & PF_W) != 0) {
       continue;
     }
 
-    ElfW(Addr) seg_page_start = page_start(phdr->p_vaddr) + load_bias;
-    ElfW(Addr) seg_page_end = page_end(phdr->p_vaddr + phdr->p_memsz) + load_bias;
+    ElfW(Addr) p_memsz = phdr->p_memsz;
+    ElfW(Addr) p_filesz = phdr->p_filesz;
+    if (phdr->p_align > kPageSize && should_pad_segments) {
+      _extend_load_segment_vma(phdr_table, phdr_count, i, &p_memsz, &p_filesz);
+    }
+
+    ElfW(Addr) seg_page_start = page_start(phdr->p_vaddr + load_bias);
+    ElfW(Addr) seg_page_end = page_end(phdr->p_vaddr + p_memsz + load_bias);
 
     int prot = PFLAGS_TO_PROT(phdr->p_flags) | extra_prot_flags;
     if ((prot & PROT_WRITE) != 0) {
@@ -901,19 +953,21 @@
  *   phdr_table  -> program header table
  *   phdr_count  -> number of entries in tables
  *   load_bias   -> load bias
+ *   should_pad_segments -> Are segments extended to avoid gaps in the memory map
  *   prop        -> GnuPropertySection or nullptr
  * Return:
  *   0 on success, -1 on failure (error code in errno).
  */
 int phdr_table_protect_segments(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                ElfW(Addr) load_bias, const GnuPropertySection* prop __unused) {
+                                ElfW(Addr) load_bias, bool should_pad_segments,
+                                const GnuPropertySection* prop __unused) {
   int prot = 0;
 #if defined(__aarch64__)
   if ((prop != nullptr) && prop->IsBTICompatible()) {
     prot |= PROT_BTI;
   }
 #endif
-  return _phdr_table_set_load_prot(phdr_table, phdr_count, load_bias, prot);
+  return _phdr_table_set_load_prot(phdr_table, phdr_count, load_bias, prot, should_pad_segments);
 }
 
 /* Change the protection of all loaded segments in memory to writable.
@@ -929,19 +983,53 @@
  *   phdr_table  -> program header table
  *   phdr_count  -> number of entries in tables
  *   load_bias   -> load bias
+ *   should_pad_segments -> Are segments extended to avoid gaps in the memory map
  * Return:
  *   0 on success, -1 on failure (error code in errno).
  */
 int phdr_table_unprotect_segments(const ElfW(Phdr)* phdr_table,
-                                  size_t phdr_count, ElfW(Addr) load_bias) {
-  return _phdr_table_set_load_prot(phdr_table, phdr_count, load_bias, PROT_WRITE);
+                                  size_t phdr_count, ElfW(Addr) load_bias,
+                                  bool should_pad_segments) {
+  return _phdr_table_set_load_prot(phdr_table, phdr_count, load_bias, PROT_WRITE,
+                                   should_pad_segments);
+}
+
+static inline void _extend_gnu_relro_prot_end(const ElfW(Phdr)* relro_phdr,
+                                              const ElfW(Phdr)* phdr_table, size_t phdr_count,
+                                              ElfW(Addr) load_bias, ElfW(Addr)* seg_page_end) {
+  // Find the index and phdr of the LOAD containing the GNU_RELRO segment
+  for (size_t index = 0; index < phdr_count; ++index) {
+    const ElfW(Phdr)* phdr = &phdr_table[index];
+
+    if (phdr->p_type == PT_LOAD && phdr->p_vaddr == relro_phdr->p_vaddr) {
+      // If the PT_GNU_RELRO mem size is not at least as large as the corresponding
+      // LOAD segment mem size, we need to protect only a partial region of the
+      // LOAD segment and therefore cannot avoid a VMA split.
+      if (relro_phdr->p_memsz < phdr->p_memsz) {
+        break;
+      }
+
+      ElfW(Addr) p_memsz = phdr->p_memsz;
+      ElfW(Addr) p_filesz = phdr->p_filesz;
+
+      // Attempt extending the VMA (mprotect range). Without extending the range
+      // mprotect will only RO-protect part of the extended RW LOAD segment, which will
+      // leave an extra split RW VMA (the gap).
+      _extend_load_segment_vma(phdr_table, phdr_count, index, &p_memsz, &p_filesz);
+
+      *seg_page_end = page_end(phdr->p_vaddr + p_memsz + load_bias);
+
+      break;
+    }
+  }
 }
 
 /* Used internally by phdr_table_protect_gnu_relro and
  * phdr_table_unprotect_gnu_relro.
  */
 static int _phdr_table_set_gnu_relro_prot(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                          ElfW(Addr) load_bias, int prot_flags) {
+                                          ElfW(Addr) load_bias, int prot_flags,
+                                          bool should_pad_segments) {
   const ElfW(Phdr)* phdr = phdr_table;
   const ElfW(Phdr)* phdr_limit = phdr + phdr_count;
 
@@ -966,8 +1054,16 @@
     //       the program is likely to fail at runtime. So in effect the
     //       linker must only emit a PT_GNU_RELRO segment if it ensures
     //       that it starts on a page boundary.
-    ElfW(Addr) seg_page_start = page_start(phdr->p_vaddr) + load_bias;
-    ElfW(Addr) seg_page_end = page_end(phdr->p_vaddr + phdr->p_memsz) + load_bias;
+    ElfW(Addr) seg_page_start = page_start(phdr->p_vaddr + load_bias);
+    ElfW(Addr) seg_page_end = page_end(phdr->p_vaddr + phdr->p_memsz + load_bias);
+
+    // Before extending the RO protection, we need to ensure that the segments were extended
+    // by bionic: the kernel does not map the gaps between segments, so they usually contain
+    // unrelated mappings, which would be incorrectly protected as RO and would likely lead
+    // to a segmentation fault.
+    if (phdr->p_align > kPageSize && should_pad_segments) {
+      _extend_gnu_relro_prot_end(phdr, phdr_table, phdr_count, load_bias, &seg_page_end);
+    }
 
     int ret = mprotect(reinterpret_cast<void*>(seg_page_start),
                        seg_page_end - seg_page_start,
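Concretely (continuing the illustrative numbers from the LoadSegments() example above): if an RW
LOAD segment with p_memsz 0x5000 was extended to 0x8000 at load time and PT_GNU_RELRO covers the
full original 0x5000, then without _extend_gnu_relro_prot_end() the mprotect would split the
mapping:

    without extension:   [0x0000, 0x5000) r--   +   [0x5000, 0x8000) rw-   (extra VMA)
    with extension:      [0x0000, 0x8000) r--                              (single VMA)

The should_pad_segments/p_align guard above is what makes this safe: if bionic did not create the
padding, the range past 0x5000 could hold unrelated mappings that must not be made read-only.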
@@ -992,12 +1088,14 @@
  *   phdr_table  -> program header table
  *   phdr_count  -> number of entries in tables
  *   load_bias   -> load bias
+ *   should_pad_segments -> Were segments extended to avoid gaps in the memory map
  * Return:
  *   0 on success, -1 on failure (error code in errno).
  */
-int phdr_table_protect_gnu_relro(const ElfW(Phdr)* phdr_table,
-                                 size_t phdr_count, ElfW(Addr) load_bias) {
-  return _phdr_table_set_gnu_relro_prot(phdr_table, phdr_count, load_bias, PROT_READ);
+int phdr_table_protect_gnu_relro(const ElfW(Phdr)* phdr_table, size_t phdr_count,
+                                 ElfW(Addr) load_bias, bool should_pad_segments) {
+  return _phdr_table_set_gnu_relro_prot(phdr_table, phdr_count, load_bias, PROT_READ,
+                                        should_pad_segments);
 }
 
 /* Serialize the GNU relro segments to the given file descriptor. This can be
diff --git a/linker/linker_phdr.h b/linker/linker_phdr.h
index e5b87bb..4deed33 100644
--- a/linker/linker_phdr.h
+++ b/linker/linker_phdr.h
@@ -128,13 +128,14 @@
 size_t phdr_table_get_maximum_alignment(const ElfW(Phdr)* phdr_table, size_t phdr_count);
 
 int phdr_table_protect_segments(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                ElfW(Addr) load_bias, const GnuPropertySection* prop = nullptr);
+                                ElfW(Addr) load_bias, bool should_pad_segments,
+                                const GnuPropertySection* prop = nullptr);
 
 int phdr_table_unprotect_segments(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                  ElfW(Addr) load_bias);
+                                  ElfW(Addr) load_bias, bool should_pad_segments);
 
 int phdr_table_protect_gnu_relro(const ElfW(Phdr)* phdr_table, size_t phdr_count,
-                                 ElfW(Addr) load_bias);
+                                 ElfW(Addr) load_bias, bool should_pad_segments);
 
 int phdr_table_serialize_gnu_relro(const ElfW(Phdr)* phdr_table, size_t phdr_count,
                                    ElfW(Addr) load_bias, int fd, size_t* file_offset);
diff --git a/linker/linker_relocate.cpp b/linker/linker_relocate.cpp
index 952dade..5b58895 100644
--- a/linker/linker_relocate.cpp
+++ b/linker/linker_relocate.cpp
@@ -187,7 +187,8 @@
   auto protect_segments = [&]() {
     // Make .text executable.
     if (phdr_table_protect_segments(relocator.si->phdr, relocator.si->phnum,
-                                    relocator.si->load_bias) < 0) {
+                                    relocator.si->load_bias,
+                                    relocator.si->should_pad_segments()) < 0) {
       DL_ERR("can't protect segments for \"%s\": %s",
              relocator.si->get_realpath(), strerror(errno));
       return false;
@@ -197,7 +198,8 @@
   auto unprotect_segments = [&]() {
     // Make .text writable.
     if (phdr_table_unprotect_segments(relocator.si->phdr, relocator.si->phnum,
-                                      relocator.si->load_bias) < 0) {
+                                      relocator.si->load_bias,
+                                      relocator.si->should_pad_segments()) < 0) {
       DL_ERR("can't unprotect loadable segments for \"%s\": %s",
              relocator.si->get_realpath(), strerror(errno));
       return false;
diff --git a/tests/Android.bp b/tests/Android.bp
index a62abab..0ba91ea 100644
--- a/tests/Android.bp
+++ b/tests/Android.bp
@@ -1108,6 +1108,31 @@
 }
 
 cc_test {
+    name: "memtag_stack_dlopen_test",
+    enabled: false,
+    // This does not use bionic_tests_defaults because it is not supported on
+    // host.
+    arch: {
+        arm64: {
+            enabled: true,
+        },
+    },
+    sanitize: {
+        memtag_heap: true,
+        memtag_stack: false,
+    },
+    srcs: [
+        "memtag_stack_dlopen_test.cpp",
+    ],
+    shared_libs: [
+        "libbase",
+    ],
+    data_libs: ["libtest_simple_memtag_stack"],
+    header_libs: ["bionic_libc_platform_headers"],
+    test_suites: ["device-tests"],
+}
+
+cc_test {
     name: "bionic-stress-tests",
     defaults: [
         "bionic_tests_defaults",
diff --git a/tests/dlext_test.cpp b/tests/dlext_test.cpp
index d078e50..6883da9 100644
--- a/tests/dlext_test.cpp
+++ b/tests/dlext_test.cpp
@@ -31,6 +31,7 @@
 #include <android-base/test_utils.h>
 
 #include <sys/mman.h>
+#include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
 #include <sys/wait.h>
@@ -2046,6 +2047,11 @@
                                                              -1, 0));
   ASSERT_TRUE(reinterpret_cast<void*>(reserved_addr) != MAP_FAILED);
 
+  struct stat file_stat;
+  int ret = TEMP_FAILURE_RETRY(stat(private_library_absolute_path.c_str(), &file_stat));
+  ASSERT_EQ(ret, 0) << "Failed to stat library";
+  size_t file_size = file_stat.st_size;
+
   for (const auto& rec : maps_to_copy) {
     uintptr_t offset = rec.addr_start - addr_start;
     size_t size = rec.addr_end - rec.addr_start;
@@ -2053,7 +2059,13 @@
     void* map = mmap(addr, size, PROT_READ | PROT_WRITE,
                      MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
     ASSERT_TRUE(map != MAP_FAILED);
-    memcpy(map, reinterpret_cast<void*>(rec.addr_start), size);
+    size_t seg_size = size;
+    // See the comment about file map faults past the end of the file in
+    // ElfReader::LoadSegments() in bionic/linker/linker_phdr.cpp.
+    if (rec.offset + size > file_size) {
+      seg_size = file_size - rec.offset;
+    }
+    memcpy(map, reinterpret_cast<void*>(rec.addr_start), seg_size);
     mprotect(map, size, rec.perms);
   }
 
diff --git a/tests/libs/Android.bp b/tests/libs/Android.bp
index 039d1e1..68efbd9 100644
--- a/tests/libs/Android.bp
+++ b/tests/libs/Android.bp
@@ -234,6 +234,17 @@
 }
 
 // -----------------------------------------------------------------------------
+// Library used by memtag_stack_dlopen_test tests
+// -----------------------------------------------------------------------------
+cc_test_library {
+    name: "libtest_simple_memtag_stack",
+    sanitize: {
+        memtag_stack: true,
+    },
+    srcs: ["dlopen_testlib_simple.cpp"],
+}
+
+// -----------------------------------------------------------------------------
 // Libraries used by hwasan_test
 // -----------------------------------------------------------------------------
 cc_test_library {
diff --git a/tests/memtag_stack_dlopen_test.cpp b/tests/memtag_stack_dlopen_test.cpp
new file mode 100644
index 0000000..308af1e
--- /dev/null
+++ b/tests/memtag_stack_dlopen_test.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <dlfcn.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <gtest/gtest.h>
+
+#include <android-base/silent_death_test.h>
+#include <android-base/test_utils.h>
+#include "utils.h"
+
+#if defined(__BIONIC__) && defined(__aarch64__)
+__attribute__((target("mte"))) bool is_stack_mte_on() {
+  alignas(16) int x = 0;
+  void* p = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(&x) + (1UL << 57));
+  void* p_cpy = p;
+  __builtin_arm_stg(p);
+  p = __builtin_arm_ldg(p);
+  __builtin_arm_stg(&x);
+  return p == p_cpy;
+}
+
+// We can't use pthread_getattr_np because that uses the rlimit rather than the actual mapping
+// bounds.
+static void find_main_stack_limits(uintptr_t* low, uintptr_t* high) {
+  uintptr_t startstack = reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
+
+  // Hunt for the region that contains that address.
+  FILE* fp = fopen("/proc/self/maps", "re");
+  if (fp == nullptr) {
+    abort();
+  }
+  char line[BUFSIZ];
+  while (fgets(line, sizeof(line), fp) != nullptr) {
+    uintptr_t lo, hi;
+    if (sscanf(line, "%" SCNxPTR "-%" SCNxPTR, &lo, &hi) == 2) {
+      if (lo <= startstack && startstack <= hi) {
+        *low = lo;
+        *high = hi;
+        fclose(fp);
+        return;
+      }
+    }
+  }
+  abort();
+}
+
+template <typename Fn>
+unsigned int fault_new_stack_page(uintptr_t low, Fn f) {
+  uintptr_t new_low;
+  uintptr_t new_high;
+  volatile char buf[4096];
+  buf[4095] = 1;
+  find_main_stack_limits(&new_low, &new_high);
+  if (new_low < low) {
+    f();
+    return new_high;
+  }
+  // Useless, but should defeat TCO.
+  return new_low + fault_new_stack_page(low, f);
+}
+
+#endif
+
+enum State { kInit, kThreadStarted, kStackRemapped };
+
+TEST(MemtagStackDlopenTest, DlopenRemapsStack) {
+#if defined(__BIONIC__) && defined(__aarch64__)
+  if (!running_with_mte()) GTEST_SKIP() << "Test requires MTE.";
+
+  std::string path = android::base::GetExecutableDirectory() + "/libtest_simple_memtag_stack.so";
+  ASSERT_EQ(0, access(path.c_str(), F_OK));  // Verify test setup.
+  EXPECT_FALSE(is_stack_mte_on());
+  std::mutex m;
+  std::condition_variable cv;
+  State state = kInit;
+
+  bool is_early_thread_mte_on = false;
+  std::thread early_th([&] {
+    {
+      std::lock_guard lk(m);
+      state = kThreadStarted;
+    }
+    cv.notify_one();
+    {
+      std::unique_lock lk(m);
+      cv.wait(lk, [&] { return state == kStackRemapped; });
+    }
+    is_early_thread_mte_on = is_stack_mte_on();
+  });
+  {
+    std::unique_lock lk(m);
+    cv.wait(lk, [&] { return state == kThreadStarted; });
+  }
+  void* handle = dlopen(path.c_str(), RTLD_NOW);
+  {
+    std::lock_guard lk(m);
+    state = kStackRemapped;
+  }
+  cv.notify_one();
+  ASSERT_NE(handle, nullptr);
+  EXPECT_TRUE(is_stack_mte_on());
+
+  bool new_stack_page_mte_on = false;
+  uintptr_t low;
+  uintptr_t high;
+  find_main_stack_limits(&low, &high);
+  fault_new_stack_page(low, [&] { new_stack_page_mte_on = is_stack_mte_on(); });
+  EXPECT_TRUE(new_stack_page_mte_on);
+
+  bool is_late_thread_mte_on = false;
+  std::thread late_th([&] { is_late_thread_mte_on = is_stack_mte_on(); });
+  late_th.join();
+  early_th.join();
+  EXPECT_TRUE(is_early_thread_mte_on);
+  EXPECT_TRUE(is_late_thread_mte_on);
+#else
+  GTEST_SKIP() << "requires bionic arm64";
+#endif
+}