[riscv][bionic] Prototype TLS Descriptor support

Add basic assembly stubs for TLS Descriptor support in the dynamic
linker, and enable several code paths related to TLSDESC for RISC-V.

Note: This patch requires an updated toolchain that supports TLSDESC
for RISC-V, and the `-mtls-dialect=` compiler option specifically.

Test: adb shell /data/nativetest64/bionic-unit-tests/bionic-unit-tests --gtest_filter=*tls*
Bug: 322984914
Change-Id: I74bd0fa216b44b4ca2c5a5a6aec37b3fc47b00d9
diff --git a/libc/private/bionic_elf_dtv_offset.h b/libc/private/bionic_elf_dtv_offset.h
new file mode 100644
index 0000000..8d9f3b9
--- /dev/null
+++ b/libc/private/bionic_elf_dtv_offset.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2024 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#pragma once
+
+#if defined(__riscv)
+// TLS_DTV_OFFSET is a constant used in relocation fields, defined in the RISC-V ELF
+// Specification[1]. The front of the TCB contains a pointer to the DTV, and each pointer
+// in the DTV points 0x800 past the start of a TLS block to make full use of the range of
+// load/store instructions; refer to [2].
+//
+// [1]: RISC-V ELF Specification.
+// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#constants
+// [2]: Documentation of TLS data structures
+// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/issues/53
+#define TLS_DTV_OFFSET 0x800
+#else
+#define TLS_DTV_OFFSET 0
+#endif
diff --git a/libc/private/bionic_elf_tls.h b/libc/private/bionic_elf_tls.h
index 8bd5bc5..04297ad 100644
--- a/libc/private/bionic_elf_tls.h
+++ b/libc/private/bionic_elf_tls.h
@@ -34,6 +34,8 @@
 #include <stdint.h>
 #include <sys/cdefs.h>
 
+#include "bionic_elf_dtv_offset.h"
+
 __LIBC_HIDDEN__ extern _Atomic(size_t) __libc_tls_generation_copy;
 
 struct TlsAlign {
@@ -227,17 +229,3 @@
 void __free_dynamic_tls(bionic_tcb* tcb);
 void __notify_thread_exit_callbacks();
 
-#if defined(__riscv)
-// TLS_DTV_OFFSET is a constant used in relocation fields, defined in RISC-V ELF Specification[1]
-// The front of the TCB contains a pointer to the DTV, and each pointer in DTV
-// points to 0x800 past the start of a TLS block to make full use of the range
-// of load/store instructions, refer to [2].
-//
-// [1]: RISC-V ELF Specification.
-// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#constants
-// [2]: Documentation of TLS data structures
-// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/issues/53
-#define TLS_DTV_OFFSET 0x800
-#else
-#define TLS_DTV_OFFSET 0
-#endif
diff --git a/linker/Android.bp b/linker/Android.bp
index 1ede380..78109e8 100644
--- a/linker/Android.bp
+++ b/linker/Android.bp
@@ -231,6 +231,7 @@
     name: "linker_sources_riscv64",
     srcs: [
         "arch/riscv64/begin.S",
+        "arch/riscv64/tlsdesc_resolver.S",
     ],
 }
 
diff --git a/linker/arch/riscv64/tlsdesc_resolver.S b/linker/arch/riscv64/tlsdesc_resolver.S
new file mode 100644
index 0000000..fedc926
--- /dev/null
+++ b/linker/arch/riscv64/tlsdesc_resolver.S
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2024 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <platform/bionic/tls_defines.h>
+#include <private/bionic_asm.h>
+#include <private/bionic_elf_dtv_offset.h>
+
+#ifndef TLS_DTV_OFFSET
+  #error "TLS_DTV_OFFSET not defined"
+#endif
+
+.globl __tls_get_addr
+
+// Store reg into stack slot idx (8 bytes per slot); pass f as the third argument to emit fsd.
+.macro spill reg, idx, f=
+  \f\()sd \reg, \idx*8(sp)
+  .cfi_rel_offset \reg, (\idx)*8  // record where reg was saved, relative to the current CFA register
+.endm
+
+// Load reg back from stack slot idx; pass f as the third argument to emit fld.
+.macro reload reg, idx, f=
+  \f\()ld \reg, \idx*8(sp)
+  .cfi_same_value \reg  // reg is live in the register again, no longer described as saved
+.endm
+
+.macro spill_vector_regs
+  csrr a3, vlenb    // a3 = vlenb CSR (vector register length in bytes); clobbers a3
+  slli a3, a3, 3    // a3 = 8 * vlenb, the size of one group of eight registers
+  sub sp, sp, a3
+  vs8r.v v0, (sp)   // v0-v7
+  sub sp, sp, a3
+  vs8r.v v8, (sp)   // v8-v15
+  sub sp, sp, a3
+  vs8r.v v16, (sp)  // v16-v23
+  sub sp, sp, a3
+  vs8r.v v24, (sp)  // v24-v31; sp is now 32 * vlenb below its value on entry
+.endm
+
+.macro reload_vector_regs
+  csrr a3, vlenb    // a3 = vlenb CSR (vector register length in bytes); clobbers a3
+  slli a3, a3, 3    // a3 = 8 * vlenb, the size of one group of eight registers
+  vl8r.v v24, (sp)  // groups are popped in the reverse of the order spill_vector_regs pushed them
+  add sp, sp, a3
+  vl8r.v v16, (sp)
+  add sp, sp, a3
+  vl8r.v v8, (sp)
+  add sp, sp, a3
+  vl8r.v v0, (sp)
+  add sp, sp, a3    // sp is back to its value before spill_vector_regs
+.endm
+
+// Invoke op once per saved register (35 total, slots 1-35); the max argument is unused.
+.macro for_each_saved_reg op max
+  \op ra, 1   // slot 0 is never used; a0 is not listed because it carries the result
+  \op a1, 2
+  \op a2, 3
+  \op a3, 4
+  \op a4, 5
+  \op a5, 6
+  \op a6, 7
+  \op a7, 8
+  \op t0, 9
+  \op t1, 10
+  \op t2, 11
+  \op t3, 12
+  \op t4, 13
+  \op t5, 14
+  \op t6, 15
+  // floating-point registers: the third argument f makes op use its fsd/fld form
+  \op ft0, 16, f
+  \op ft1, 17, f
+  \op ft2, 18, f
+  \op ft3, 19, f
+  \op ft4, 20, f
+  \op ft5, 21, f
+  \op ft6, 22, f
+  \op ft7, 23, f
+  \op ft8, 24, f
+  \op ft9, 25, f
+  \op ft10, 26, f
+  \op ft11, 27, f
+  \op fa0, 28, f
+  \op fa1, 29, f
+  \op fa2, 30, f
+  \op fa3, 31, f
+  \op fa4, 32, f
+  \op fa5, 33, f
+  \op fa6, 34, f
+  \op fa7, 35, f
+.endm
+
+// These resolver functions must preserve every register except a0. They set a0
+// to the offset of the TLS symbol relative to the thread pointer.
+
+ENTRY_PRIVATE(tlsdesc_resolver_static)
+  ld a0, 8(a0)  // a0 = word at offset 8 of the descriptor (presumably TlsDescriptor::arg -- confirm)
+  jr t0         // return through t0; ra is left untouched
+END(tlsdesc_resolver_static)
+
+ENTRY_PRIVATE(tlsdesc_resolver_dynamic)
+  // a0 = TlsDescriptor*. a1-a3 are scratch; a 4th slot keeps the frame a multiple of 16 bytes.
+  addi sp, sp, -4*8
+  .cfi_def_cfa_offset 4*8
+  spill a1, 1
+  spill a2, 2
+  spill a3, 3
+
+  ld a2, (TLS_SLOT_DTV * 8)(tp) // a2 = &DTV
+  ld a1, (a2)                   // a1 = TlsDtv::generation (DTV[0])
+  ld a0, 8(a0)                  // a0 = TlsDynamicResolverArg*
+  ld a3, (a0)                   // a3 = TlsDynamicResolverArg::generation
+
+  // Fall back to __tls_get_addr if TlsDtv::generation < TlsDynamicResolverArg::generation.
+  // Generation counters are unsigned, so use bltu rather than the signed blt.
+  bltu a1, a3, L(fallback)
+
+  // a0 must stay intact until no fallback is possible: tlsdesc_resolver_dynamic_slow_path
+  // expects the TlsDynamicResolverArg* in a0 and derives the TlsIndex (its second field)
+  // from it.
+  ld a1, 8(a0)                  // a1 = TlsIndex::module_id
+  slli a1, a1, 3                // a1 = module_id*8 -- scale the idx
+  add a1, a2, a1                // a1 = &TlsDtv::modules[module_id]
+  ld a1, (a1)                   // a1 = TlsDtv::modules[module_id]
+  beqz a1, L(fallback)
+  ld a3, 16(a0)                 // a3 = TlsIndex::offset
+  add a0, a1, a3                // a0 = TlsDtv::modules[module_id] + offset
+  sub a0, a0, tp                // a0 = TlsDtv::modules[module_id] + offset - tp
+
+  .cfi_remember_state           // saved here, restored at L(fallback)
+  reload a3, 3
+  reload a2, 2
+  reload a1, 1
+  addi sp, sp, 4*8
+  .cfi_adjust_cfa_offset -4*8
+  jr t0
+
+L(fallback):
+  // Branches arrive with the frame still live: undo the fast-path epilogue's CFI changes.
+  .cfi_restore_state
+  reload a3, 3
+  reload a2, 2
+  reload a1, 1
+  addi sp, sp, 4*8
+  .cfi_adjust_cfa_offset -4*8
+  j tlsdesc_resolver_dynamic_slow_path
+END(tlsdesc_resolver_dynamic)
+
+// On entry, a0 is the address of a TlsDynamicResolverArg object rather than
+// the TlsDescriptor address passed to the original resolver function.
+ENTRY_PRIVATE(tlsdesc_resolver_dynamic_slow_path)
+  // 35 registers go in slots 1-35; slot 0 is padding so the frame (36*8) stays 16-byte aligned
+  // for the vector spills. NOTE(review): spill_vector_regs moves sp with no CFI update -- confirm.
+  addi sp, sp, (-8*36)
+  .cfi_def_cfa_offset (8 * 36)
+  for_each_saved_reg spill, 36
+  spill_vector_regs                   // uses a3 as scratch; a3 was already spilled above
+
+  add a0, a0, 8                       // a0 = address of the TlsIndex inside the resolver arg
+  call __tls_get_addr
+  addi a0, a0, (-1 * TLS_DTV_OFFSET)  // Correct the address by TLS_DTV_OFFSET
+  sub a0, a0, tp                      // a0 = offset relative to the thread pointer
+
+  reload_vector_regs                  // uses a3 as scratch; a3 is restored just below
+  for_each_saved_reg reload, 36
+  addi sp, sp, 8*36
+  .cfi_def_cfa_offset 0
+  jr t0                               // return through t0 (reloaded above), not ra
+END(tlsdesc_resolver_dynamic_slow_path)
+
+// The address of an unresolved weak TLS symbol evaluates to NULL with TLSDESC.
+// The value returned by this function is added to the thread pointer, so return
+// a negated thread pointer to cancel it out.
+ENTRY_PRIVATE(tlsdesc_resolver_unresolved_weak)
+  sub a0, zero, tp  // a0 = -tp, so tp + a0 == 0
+  jr t0             // return through t0
+END(tlsdesc_resolver_unresolved_weak)
diff --git a/linker/linker_relocate.cpp b/linker/linker_relocate.cpp
index 85f7b3a..3e36114 100644
--- a/linker/linker_relocate.cpp
+++ b/linker/linker_relocate.cpp
@@ -438,9 +438,9 @@
       }
       break;
 
-#if defined(__aarch64__)
-    // Bionic currently only implements TLSDESC for arm64. This implementation should work with
-    // other architectures, as long as the resolver functions are implemented.
+#if defined(__aarch64__) || defined(__riscv)
+    // Bionic currently implements TLSDESC for arm64 and riscv64. This implementation should work
+    // with other architectures, as long as the resolver functions are implemented.
     case R_GENERIC_TLSDESC:
       count_relocation_if<IsGeneral>(kRelocRelative);
       {
@@ -482,7 +482,7 @@
         }
       }
       break;
-#endif  // defined(__aarch64__)
+#endif  // defined(__aarch64__) || defined(__riscv)
 
 #if defined(__x86_64__)
     case R_X86_64_32:
@@ -672,14 +672,14 @@
 
   // Once the tlsdesc_args_ vector's size is finalized, we can write the addresses of its elements
   // into the TLSDESC relocations.
-#if defined(__aarch64__)
-  // Bionic currently only implements TLSDESC for arm64.
+#if defined(__aarch64__) || defined(__riscv)
+  // Bionic currently only implements TLSDESC for arm64 and riscv64.
   for (const std::pair<TlsDescriptor*, size_t>& pair : relocator.deferred_tlsdesc_relocs) {
     TlsDescriptor* desc = pair.first;
     desc->func = tlsdesc_resolver_dynamic;
     desc->arg = reinterpret_cast<size_t>(&tlsdesc_args_[pair.second]);
   }
-#endif
+#endif // defined(__aarch64__) || defined(__riscv)
 
   return true;
 }
diff --git a/tests/elftls_dl_test.cpp b/tests/elftls_dl_test.cpp
index e759e15..e409b72 100644
--- a/tests/elftls_dl_test.cpp
+++ b/tests/elftls_dl_test.cpp
@@ -107,6 +107,22 @@
   void* lib = dlopen("libtest_elftls_dynamic.so", RTLD_LOCAL | RTLD_NOW);
   ASSERT_NE(nullptr, lib);
 
+  auto get_local_var2 = reinterpret_cast<int(*)()>(dlsym(lib, "get_local_var2"));
+  ASSERT_NE(nullptr, get_local_var2);
+
+  auto get_local_var1 = reinterpret_cast<int(*)()>(dlsym(lib, "get_local_var1"));
+  ASSERT_NE(nullptr, get_local_var1);
+
+  auto get_local_var1_addr = reinterpret_cast<int*(*)()>(dlsym(lib, "get_local_var1_addr"));
+  ASSERT_NE(nullptr, get_local_var1_addr);
+
+  // Make sure subsequent accesses return the same pointer.
+  ASSERT_EQ(get_local_var1_addr(), get_local_var1_addr());
+
+  // Check the initial values are correct.
+  ASSERT_EQ(25, get_local_var2());
+  ASSERT_EQ(15, get_local_var1());
+
   auto bump_local_vars = reinterpret_cast<int(*)()>(dlsym(lib, "bump_local_vars"));
   ASSERT_NE(nullptr, bump_local_vars);
 
@@ -135,7 +151,7 @@
 // TLSDESC, the result is NULL. With __tls_get_addr, the result is the
 // generation count (or maybe undefined behavior)? This test only tests TLSDESC.
 TEST(elftls_dl, tlsdesc_missing_weak) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__riscv)
   void* lib = dlopen("libtest_elftls_dynamic.so", RTLD_LOCAL | RTLD_NOW);
   ASSERT_NE(nullptr, lib);
 
diff --git a/tests/libs/elftls_dtv_resize_helper.cpp b/tests/libs/elftls_dtv_resize_helper.cpp
index 340d5df..7fb6fb5 100644
--- a/tests/libs/elftls_dtv_resize_helper.cpp
+++ b/tests/libs/elftls_dtv_resize_helper.cpp
@@ -180,8 +180,8 @@
   // Access a TLS variable from the first filler module.
   ASSERT_EQ(102, func1());
   ASSERT_EQ(5u, highest_modid_in_dtv());
-#if defined(__aarch64__)
-  // The arm64 TLSDESC resolver doesn't update the DTV if it is new enough for
+#if defined(__aarch64__) || defined(__riscv)
+  // The arm64 and riscv64 TLSDESC resolvers don't update the DTV if it is new enough for
   // the given access.
   ASSERT_EQ(initial_dtv, dtv());
   ASSERT_EQ(5u, dtv()->count);
diff --git a/tests/libs/elftls_dynamic.cpp b/tests/libs/elftls_dynamic.cpp
index 2500484..df3ad75 100644
--- a/tests/libs/elftls_dynamic.cpp
+++ b/tests/libs/elftls_dynamic.cpp
@@ -66,6 +66,18 @@
   return ++local_var_1 + ++local_var_2;
 }
 
+extern "C" int get_local_var1() {  // Looked up via dlsym() by elftls_dl_test; expected to be 15 initially.
+  return local_var_1;
+}
+
+extern "C" int* get_local_var1_addr() {  // Looked up via dlsym(); the test expects repeated calls to agree.
+  return &local_var_1;
+}
+
+extern "C" int get_local_var2() {  // Looked up via dlsym() by elftls_dl_test; expected to be 25 initially.
+  return local_var_2;
+}
+
 __attribute__((weak)) extern "C" __thread int missing_weak_dyn_tls;
 
 extern "C" int* missing_weak_dyn_tls_addr() {