bionic: Optimize TLS memory by isolating per-thread libgen buffers to dedicated pages

Previously, `struct bionic_tls` embedded two 4KB `basename`/`dirname`
buffers (MAXPATHLEN=4096), adding 8KB to every thread's `bionic_tls`
size, even if these libgen functions were unused.

This commit refactors this approach:
- Removes these embedded arrays from `struct bionic_tls`.
- Consolidates them into a new `struct libgen_buffers`.
- Adds `libgen_buffers_ptr` to `struct bionic_tls` to reference an
  instance of `struct libgen_buffers`.
- Allocates this `libgen_buffers` instance for each thread in a
  dedicated, page-aligned memory region separate from the main
  `bionic_tls` data.

This changes the conceptual memory layout of each thread's mapping on a
16KB page-size system as follows:

  BEFORE (Libgen buffers ~8KB embedded in main Static TLS region):

  Higher Address
  ┌───────────────────────────────────────────────────┐
  │ ~ ~ ~ ~ ~ ~ ~ ~ Page Boundary ~ ~ ~ ~ ~ ~ ~ ~ ~   │ (End of mmap)
  ├───────────────────────────────────────────────────┤
  │                                                   │
  │                 PTHREAD_GUARD                     │
  │                                                   │
  ├───────────────────────────────────────────────────┤
  │ ~ ~ ~ ~ ~ ~ ~ ~ Page Boundary ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ │
  ├───────────────────────────────────────────────────┤
  │                                                   │ ┐
  │           Main Static TLS Region                  │ │
  │           (TCB, bionic_tls members including      │ │
  │            embedded ~8KB Libgen Buffers,          │ │
  │            other static TLS variables)            │ │ The whole
  │                                                   │ │ page including
  │                 ┌─────────────────┐               │ │ the libgen
  │                 │ Libgen Buffers  │               │ │ buffers is
  │                 │     (~8KB)      │               │ │ faulted
  │                 └─────────────────┘               │ │ in.
  │                                                   │ │
  │                                                   │ │
  ├───────────────────────────────────────────────────┤ │
  │                                                   │ │
  │               Stack (extends downwards)           │ │
  │                                                   │ ┘
  ├───────────────────────────────────────────────────┤
  │ ~ ~ ~ ~ ~ ~ ~ ~ Page Boundary ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ │
  ├───────────────────────────────────────────────────┤
  Lower Address

  AFTER (Libgen buffers isolated in own page-aligned region for
  pthreads):

  Higher Address
  ┌───────────────────────────────────────────────────┐
  │ ~ ~ ~ ~ ~ ~ ~ ~ Page Boundary ~ ~ ~ ~ ~ ~ ~ ~ ~   │ (End of mmap)
  ├───────────────────────────────────────────────────┤
  │                                                   │
  │                 PTHREAD_GUARD                     │
  │                                                   │
  ├───────────────────────────────────────────────────┤
  │ ~ ~ ~ ~ ~ ~ ~ ~ Page Boundary ~ ~ ~ ~ ~ ~ ~ ~ ~   │
  ├───────────────────────────────────────────────────┤ ┐
  │                                                   │ │ Dedicated
  │            Libgen Buffers Region                  │ │ libgen buffer
  │    (hosts `struct libgen_buffers`, ~8KB data      │ │ pages.
  │     plus padding to fill page(s))                 │ │ Faulted ONLY
  │                                                   │ │ if used.
  │                                                   │ ┘
  ├───────────────────────────────────────────────────┤
  │ ~ ~ ~ ~ ~ ~ ~ ~ Page Boundary ~ ~ ~ ~ ~ ~ ~ ~ ~   │
  ├───────────────────────────────────────────────────┤
  │                                                   │ ┐
  │        Main Static TLS Region (Smaller)           │ │
  │    (TCB, `bionic_tls` members, etc.)              │ │ Faulted
  │    (Does NOT include libgen buffers)              │ │ independently.
  │                                                   │ │
  ├───────────────────────────────────────────────────┤ │
  │                                                   │ │
  │                     Stack                         │ │
  │             (extends downwards)                   │ │
  │                                                   │ │
  │                                                   │ │
  │                                                   │ ┘
  │                                                   │
  ├───────────────────────────────────────────────────┤
  │ ~ ~ ~ ~ ~ ~ ~ ~ Page Boundary ~ ~ ~ ~ ~ ~ ~ ~ ~   │
  ├───────────────────────────────────────────────────┤
  Lower Address

Benefit: many threads never call `basename()`/`dirname()`. Isolating
their buffers means the physical pages backing them are faulted in only
on demand, when one of these functions is first called, thereby
reducing memory pressure. The `bionic_tls` struct itself is also now
~8KB smaller. On large-page systems (e.g., 16KB), this leaves more room
for initial stack growth within the page already faulted in for TLS
data, potentially avoiding an immediate additional page fault from
stack expansion.

Measurements: Total RSS of `stack_and_tls` for all processes on Pixel 8
(4KB/16KB kernels), post-boot. Averages over 5 reboots, taken after
RSS fluctuation was <100KB/min.

```
adb shell "cat /proc/*/smaps" | grep -A 4 anon:stack_and_tls |
grep Rss | awk '{s+=$2} END {print s}'
```

For 16KB:
Without this change: 69.67MB (Std dev ~ 0.94MB)
With this change: 45.72MB (Std dev ~ 2.27MB)

For 4KB:
Without this change: 35.71MB (Std dev ~ 0.85MB)
With this change: 34.41MB (Std dev ~ 0.29MB)

The memory savings are more pronounced on 16KB systems. Previously, the
8KB embedded libgen buffers shared a 16KB page with all other active
Static TLS data, causing the entire page to be faulted in even if
libgen functions were unused.

On 4KB systems, the savings are less significant. This is likely
because:
(1) An 8KB block always contains at least one whole 4KB page, even if
    its start is not page-aligned. When the libgen buffers were unused,
    that page was never faulted in, so part of the buffers already
    cost no RSS before this change.
(2) A 4KB page size means less likelihood of page sharing between the
    libgen buffers and other active TLS members. Consequently, the
    chance of these buffer pages being faulted in due to "collateral"
    access was already lower compared to the 16KB case.

Test: atest -c CtsBionicTestCases
Bug: 382034899
Change-Id: Ia83a3648f720227a306982891a3f2e4f4af8d084
diff --git a/libc/bionic/__libc_init_main_thread.cpp b/libc/bionic/__libc_init_main_thread.cpp
index 0d557f1..b47fed2 100644
--- a/libc/bionic/__libc_init_main_thread.cpp
+++ b/libc/bionic/__libc_init_main_thread.cpp
@@ -156,12 +156,14 @@
   const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout;
   auto new_tcb = reinterpret_cast<bionic_tcb*>(mapping.static_tls + layout.offset_bionic_tcb());
   auto new_tls = reinterpret_cast<bionic_tls*>(mapping.static_tls + layout.offset_bionic_tls());
+  auto new_lb = reinterpret_cast<libgen_buffers*>(mapping.libgen_buffers);
 
   __init_static_tls(mapping.static_tls);
   new_tcb->copy_from_bootstrap(temp_tcb);
   new_tls->copy_from_bootstrap(temp_tls);
   __init_tcb(new_tcb, &main_thread);
   __init_bionic_tls_ptrs(new_tcb, new_tls);
+  __init_libgen_buffers_ptr(new_tls, new_lb);
 
   main_thread.mmap_base = mapping.mmap_base;
   main_thread.mmap_size = mapping.mmap_size;
diff --git a/libc/bionic/libgen.cpp b/libc/bionic/libgen.cpp
index b952822..f02e68a 100644
--- a/libc/bionic/libgen.cpp
+++ b/libc/bionic/libgen.cpp
@@ -158,13 +158,13 @@
 }
 
 char* basename(const char* path) {
-  char* buf = __get_bionic_tls().basename_buf;
-  int rc = __basename_r(path, buf, sizeof(__get_bionic_tls().basename_buf));
+  char* buf = (__get_bionic_tls().libgen_buffers_ptr)->basename_buf;
+  int rc = __basename_r(path, buf, sizeof((__get_bionic_tls().libgen_buffers_ptr)->basename_buf));
   return (rc < 0) ? nullptr : buf;
 }
 
 char* dirname(const char* path) {
-  char* buf = __get_bionic_tls().dirname_buf;
-  int rc = __dirname_r(path, buf, sizeof(__get_bionic_tls().dirname_buf));
+  char* buf = (__get_bionic_tls().libgen_buffers_ptr)->dirname_buf;
+  int rc = __dirname_r(path, buf, sizeof((__get_bionic_tls().libgen_buffers_ptr)->dirname_buf));
   return (rc < 0) ? nullptr : buf;
 }
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 1bd2da7..46afb18 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -70,22 +70,30 @@
   tcb->tls_slot(TLS_SLOT_BIONIC_TLS) = tls;
 }
 
+void __init_libgen_buffers_ptr(bionic_tls* tls, libgen_buffers* lb) {
+  tls->libgen_buffers_ptr = lb;
+}
+
+static inline size_t get_temp_bionic_tls_size() {
+  return __BIONIC_ALIGN(sizeof(bionic_tls) + sizeof(libgen_buffers), page_size());
+}
+
 // Allocate a temporary bionic_tls that the dynamic linker's main thread can
 // use while it's loading the initial set of ELF modules.
 bionic_tls* __allocate_temp_bionic_tls() {
-  size_t allocation_size = __BIONIC_ALIGN(sizeof(bionic_tls), page_size());
-  void* allocation = mmap(nullptr, allocation_size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          -1, 0);
+  void* allocation = mmap(nullptr, get_temp_bionic_tls_size(), PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   if (allocation == MAP_FAILED) {
     async_safe_fatal("failed to allocate bionic_tls: %m");
   }
-  return static_cast<bionic_tls*>(allocation);
+  bionic_tls* tls = static_cast<bionic_tls*>(allocation);
+  tls->libgen_buffers_ptr =
+      reinterpret_cast<libgen_buffers*>(static_cast<char*>(allocation) + sizeof(bionic_tls));
+  return tls;
 }
 
 void __free_temp_bionic_tls(bionic_tls* tls) {
-  munmap(tls, __BIONIC_ALIGN(sizeof(bionic_tls), page_size()));
+  munmap(tls, get_temp_bionic_tls_size());
 }
 
 static void __init_alternate_signal_stack(pthread_internal_t* thread) {
@@ -216,11 +224,14 @@
 ThreadMapping __allocate_thread_mapping(size_t stack_size, size_t stack_guard_size) {
   const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout;
 
-  // Allocate in order: stack guard, stack, static TLS, guard page.
+  // Allocate in order: stack guard, stack, static TLS, libgen buffers, guard page.
   size_t mmap_size;
   if (__builtin_add_overflow(stack_size, stack_guard_size, &mmap_size)) return {};
   if (__builtin_add_overflow(mmap_size, layout.size(), &mmap_size)) return {};
   if (__builtin_add_overflow(mmap_size, PTHREAD_GUARD_SIZE, &mmap_size)) return {};
+  // Add space for the dedicated libgen buffers page(s).
+  size_t libgen_buffers_padded_size = __BIONIC_ALIGN(sizeof(libgen_buffers), page_size());
+  if (__builtin_add_overflow(mmap_size, libgen_buffers_padded_size, &mmap_size)) return {};
 
   // Align the result to a page size.
   const size_t unaligned_size = mmap_size;
@@ -255,12 +266,21 @@
     return {};
   }
 
+  // Layout of the mmap-ed region, from the highest address down to the lowest:
+  //
+  // [ PTHREAD_GUARD_SIZE ]
+  // [ libgen_buffers_padded_size (for dedicated page(s) for libgen buffers) ]
+  // [ layout.size() (for static TLS) ]
+  // [ stack_size ]
+  // [ stack_guard_size ]
+
   ThreadMapping result = {};
   result.mmap_base = space;
   result.mmap_size = mmap_size;
   result.mmap_base_unguarded = space + stack_guard_size;
   result.mmap_size_unguarded = mmap_size - stack_guard_size - PTHREAD_GUARD_SIZE;
-  result.static_tls = space + mmap_size - PTHREAD_GUARD_SIZE - layout.size();
+  result.libgen_buffers = space + mmap_size - PTHREAD_GUARD_SIZE - libgen_buffers_padded_size;
+  result.static_tls = result.libgen_buffers - layout.size();
   result.stack_base = space;
   result.stack_top = result.static_tls;
   return result;
@@ -309,6 +329,7 @@
   const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout;
   auto tcb = reinterpret_cast<bionic_tcb*>(mapping.static_tls + layout.offset_bionic_tcb());
   auto tls = reinterpret_cast<bionic_tls*>(mapping.static_tls + layout.offset_bionic_tls());
+  auto lb = reinterpret_cast<libgen_buffers*>(mapping.libgen_buffers);
 
   // Initialize TLS memory.
   __init_static_tls(mapping.static_tls);
@@ -316,6 +337,7 @@
   __init_tcb_dtv(tcb);
   __init_tcb_stack_guard(tcb);
   __init_bionic_tls_ptrs(tcb, tls);
+  __init_libgen_buffers_ptr(tls, lb);
 
   attr->stack_size = stack_top - static_cast<char*>(attr->stack_base);
   thread->attr = *attr;
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index ae9a791..c6779eb 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -195,12 +195,14 @@
   char* static_tls;
   char* stack_base;
   char* stack_top;
+  char* libgen_buffers;
 };
 
 __LIBC_HIDDEN__ void __init_tcb(bionic_tcb* tcb, pthread_internal_t* thread);
 __LIBC_HIDDEN__ void __init_tcb_stack_guard(bionic_tcb* tcb);
 __LIBC_HIDDEN__ void __init_tcb_dtv(bionic_tcb* tcb);
 __LIBC_HIDDEN__ void __init_bionic_tls_ptrs(bionic_tcb* tcb, bionic_tls* tls);
+__LIBC_HIDDEN__ void __init_libgen_buffers_ptr(bionic_tls* tls, libgen_buffers* lb);
 __LIBC_HIDDEN__ bionic_tls* __allocate_temp_bionic_tls();
 __LIBC_HIDDEN__ void __free_temp_bionic_tls(bionic_tls* tls);
 __LIBC_HIDDEN__ void __init_additional_stacks(pthread_internal_t*);
diff --git a/libc/private/bionic_tls.h b/libc/private/bionic_tls.h
index 53fe3d5..8de61a6 100644
--- a/libc/private/bionic_tls.h
+++ b/libc/private/bionic_tls.h
@@ -106,15 +106,27 @@
   void* data;
 };
 
-// ~3 pages. This struct is allocated as static TLS memory (i.e. at a fixed
-// offset from the thread pointer).
+// Defines the memory layout for the TLS buffers used by basename() and
+// dirname() in libgen.h.
+//
+// This struct is separated out from bionic TLS to ensure that the libgen
+// buffers, when mapped, occupy their own set of memory pages distinct
+// from the primary bionic_tls structure. This helps improve memory usage
+// if libgen functions are not heavily used, especially on 16KB page size
+// systems.
+struct libgen_buffers {
+  char basename_buf[MAXPATHLEN];
+  char dirname_buf[MAXPATHLEN];
+};
+
+// This struct is allocated as static TLS memory (i.e. at a fixed offset
+// from the thread pointer).
 struct bionic_tls {
   pthread_key_data_t key_data[BIONIC_PTHREAD_KEY_COUNT];
 
   locale_t locale;
 
-  char basename_buf[MAXPATHLEN];
-  char dirname_buf[MAXPATHLEN];
+  libgen_buffers* libgen_buffers_ptr;
 
   mntent mntent_buf;
   char mntent_strings[BUFSIZ];