riscv64 SCS support.

Bug: https://github.com/google/android-riscv64/issues/55
Test: treehugger
Change-Id: I05d48a07a302305126942d38529ffa280640c7b7
diff --git a/libc/arch-arm64/bionic/setjmp.S b/libc/arch-arm64/bionic/setjmp.S
index 8e00b56..d787a56 100644
--- a/libc/arch-arm64/bionic/setjmp.S
+++ b/libc/arch-arm64/bionic/setjmp.S
@@ -46,8 +46,6 @@
 // 0      sigflag/cookie  setjmp cookie in top 31 bits, signal mask flag in low bit
 // 1      sigmask         signal mask (not used with _setjmp / _longjmp)
 // 2      core_base       base of core registers (x18-x30, sp)
-//                        (We only store the low bits of x18 to avoid leaking the
-//                        shadow call stack address into memory.)
 // 16     float_base      base of float registers (d8-d15)
 // 24     checksum        checksum of core registers
 // 25     reserved        reserved entries (room to grow)
@@ -68,8 +66,6 @@
 #define _JB_D8_D9       (_JB_D10_D11 + 2)
 #define _JB_CHECKSUM    (_JB_D8_D9 + 2)
 
-#define SCS_MASK (SCS_SIZE - 1)
-
 .macro m_mangle_registers reg, sp_reg
   eor x3, x3, \reg
   eor x19, x19, \reg
@@ -155,6 +151,9 @@
   bic x1, x1, #1
 
   // Mask off the high bits of the shadow call stack pointer.
+  // We only store the low bits of x18 to avoid leaking the
+  // shadow call stack address into memory.
+  // See the SCS commentary in pthread_internal.h for more detail.
   and x3, x18, #SCS_MASK
 
   // Save core registers.
diff --git a/libc/arch-riscv64/bionic/setjmp.S b/libc/arch-riscv64/bionic/setjmp.S
index eea2978..ba3cacf 100644
--- a/libc/arch-riscv64/bionic/setjmp.S
+++ b/libc/arch-riscv64/bionic/setjmp.S
@@ -79,7 +79,7 @@
 .macro m_mangle_registers reg, sp_reg
   xor s0, s0, \reg
   xor s1, s1, \reg
-  xor s2, s2, \reg
+  xor a4, a4, \reg  // a4 is the masked s2 (x18) for SCS.
   xor s3, s3, \reg
   xor s4, s4, \reg
   xor s5, s5, \reg
@@ -151,13 +151,20 @@
   ld a1, _JB_SIGFLAG(a0)
   andi a1, a1, -2
 
+  // Mask off the high bits of the shadow call stack pointer.
+  // We only store the low bits of x18 to avoid leaking the
+  // shadow call stack address into memory.
+  // See the SCS commentary in pthread_internal.h for more detail.
+  li a4, SCS_MASK
+  and a4, a4, x18
+
   // Save core registers.
   mv a2, sp
   m_mangle_registers a1, sp_reg=a2
   sd ra,  _JB_RA(a0)
   sd s0,  _JB_S0(a0)
   sd s1,  _JB_S1(a0)
-  sd s2,  _JB_S2(a0)
+  sd a4,  _JB_S2(a0)  // a4 is the masked s2 (x18) for SCS.
   sd s3,  _JB_S3(a0)
   sd s4,  _JB_S4(a0)
   sd s5,  _JB_S5(a0)
@@ -231,7 +238,7 @@
   ld ra,  _JB_RA(a0)
   ld s0,  _JB_S0(a0)
   ld s1,  _JB_S1(a0)
-  ld s2,  _JB_S2(a0)
+  ld a4,  _JB_S2(a0)  // Don't clobber s2 (x18) used for SCS yet.
   ld s3,  _JB_S3(a0)
   ld s4,  _JB_S4(a0)
   ld s5,  _JB_S5(a0)
@@ -245,6 +252,11 @@
   m_unmangle_registers a2, sp_reg=a3
   mv sp, a3
 
+  // Restore the low bits of the shadow call stack pointer.
+  li a5, ~SCS_MASK
+  and x18, x18, a5
+  or x18, a4, x18
+
   addi sp, sp, -24
   sd   ra, 0(sp)
   sd   a0, 8(sp)
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 417ce76..15d6d6d 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -116,14 +116,14 @@
 }
 
 static void __init_shadow_call_stack(pthread_internal_t* thread __unused) {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__riscv)
   // Allocate the stack and the guard region.
   char* scs_guard_region = reinterpret_cast<char*>(
       mmap(nullptr, SCS_GUARD_REGION_SIZE, 0, MAP_PRIVATE | MAP_ANON, -1, 0));
   thread->shadow_call_stack_guard_region = scs_guard_region;
 
   // The address is aligned to SCS_SIZE so that we only need to store the lower log2(SCS_SIZE) bits
-  // in jmp_buf.
+  // in jmp_buf. See the SCS commentary in pthread_internal.h for more detail.
   char* scs_aligned_guard_region =
       reinterpret_cast<char*>(align_up(reinterpret_cast<uintptr_t>(scs_guard_region), SCS_SIZE));
 
@@ -133,11 +133,15 @@
   size_t scs_offset =
       (getpid() == 1) ? 0 : (arc4random_uniform(SCS_GUARD_REGION_SIZE / SCS_SIZE - 1) * SCS_SIZE);
 
-  // Make the stack readable and writable and store its address in register x18. This is
-  // deliberately the only place where the address is stored.
-  char *scs = scs_aligned_guard_region + scs_offset;
+  // Make the stack readable and writable and store its address in x18.
+  // This is deliberately the only place where the address is stored.
+  char* scs = scs_aligned_guard_region + scs_offset;
   mprotect(scs, SCS_SIZE, PROT_READ | PROT_WRITE);
+#if defined(__aarch64__)
   __asm__ __volatile__("mov x18, %0" ::"r"(scs));
+#elif defined(__riscv)
+  __asm__ __volatile__("mv x18, %0" ::"r"(scs));
+#endif
 #endif
 }
 
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index bde95ec..f584b27 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -117,7 +117,7 @@
     __rt_sigprocmask(SIG_BLOCK, &set, nullptr, sizeof(sigset64_t));
   }
 
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__riscv)
   // Free the shadow call stack and guard pages.
   munmap(thread->shadow_call_stack_guard_region, SCS_GUARD_REGION_SIZE);
 #endif
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 7222b62..083c2ed 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -105,9 +105,13 @@
 
   void* alternate_signal_stack;
 
-  // The start address of the shadow call stack's guard region (arm64 only).
+  // The start address of the shadow call stack's guard region (arm64/riscv64).
+  // This region is SCS_GUARD_REGION_SIZE bytes large, but only SCS_SIZE bytes
+  // are actually used.
+  //
   // This address is only used to deallocate the shadow call stack on thread
   // exit; the address of the stack itself is stored only in the x18 register.
+  //
   // Because the protection offered by SCS relies on the secrecy of the stack
   // address, storing the address here weakens the protection, but only
   // slightly, because it is relatively easy for an attacker to discover the
@@ -115,13 +119,22 @@
   // to other allocations), but not the stack itself, which is <0.1% of the size
   // of the guard region.
   //
+  // longjmp()/setjmp() don't store all the bits of x18, only the bottom bits
+  // covered by SCS_MASK. Since longjmp()/setjmp() between different threads is
+  // undefined behavior (and unsupported on Android), we can retrieve the high
+  // bits of x18 from the current value in x18 --- all the jmp_buf needs to store
+  // is where exactly the shadow stack pointer is in the thread's shadow stack:
+  // the bottom bits of x18.
+  //
   // There are at least two other options for discovering the start address of
   // the guard region on thread exit, but they are not as simple as storing in
   // TLS.
+  //
   // 1) Derive it from the value of the x18 register. This is only possible in
   //    processes that do not contain legacy code that might clobber x18,
   //    therefore each process must declare early during process startup whether
   //    it might load legacy code.
+  //    TODO: riscv64 has no legacy code, so we can actually go this route there!
   // 2) Mark the guard region as such using prctl(PR_SET_VMA_ANON_NAME) and
   //    discover its address by reading /proc/self/maps. One issue with this is
   //    that reading /proc/self/maps can race with allocations, so we may need
diff --git a/libc/private/bionic_constants.h b/libc/private/bionic_constants.h
index 09294b6..d7f4474 100644
--- a/libc/private/bionic_constants.h
+++ b/libc/private/bionic_constants.h
@@ -14,18 +14,18 @@
  * limitations under the License.
  */
 
-#ifndef _BIONIC_CONSTANTS_H_
-#define _BIONIC_CONSTANTS_H_
+#pragma once
 
 #define NS_PER_S 1000000000
 
-// Size of the shadow call stack. This must be a power of 2.
+// Size of the shadow call stack. This can be small because these stacks only
+// contain return addresses. This must be a power of 2 so the mask trick works.
+// See the SCS commentary in pthread_internal.h for more detail.
 #define SCS_SIZE (8 * 1024)
+#define SCS_MASK (SCS_SIZE - 1)
 
 // The shadow call stack is allocated at an aligned address within a guard region of this size. The
 // guard region must be large enough that we can allocate an SCS_SIZE-aligned SCS while ensuring
 // that there is at least one guard page after the SCS so that a stack overflow results in a SIGSEGV
 // instead of corrupting the allocation that comes after it.
 #define SCS_GUARD_REGION_SIZE (16 * 1024 * 1024)
-
-#endif // _BIONIC_CONSTANTS_H_