riscv64: switch from x18 to gp for shadow call stack.

We want to give back a useful callee-saved general purpose
register (x18) that was only "chosen" because it was what llvm
allowed for historical reasons. gp is a better choice because it's
effectively unused otherwise anyway.

Unfortunately, that means we need extra space in jmp_buf (which I've
reserved in an earlier change, e7b3b8b467bad2cd32470b5edd5cb9938b934316),
so let's rearrange the entries in jmp_buf to match their order in the
register file.

Bug: https://github.com/google/android-riscv64/issues/72
Bug: http://b/277909695
Test: treehugger
Change-Id: Ia629409a894c1a83d2052885702bbdd895c758e1
diff --git a/libc/arch-riscv64/bionic/setjmp.S b/libc/arch-riscv64/bionic/setjmp.S
index ba3cacf..26f7ec9 100644
--- a/libc/arch-riscv64/bionic/setjmp.S
+++ b/libc/arch-riscv64/bionic/setjmp.S
@@ -36,50 +36,52 @@
 // 0      sigflag/cookie  setjmp cookie in top 31 bits, signal mask flag in low bit
 // 1      sigmask         64-bit signal mask
 // 2      ra
-// 3      s0
+// 3      sp
+// 4      gp
+// 5      s0
 // ......
-// 14     s11
-// 15     sp
-// 16     fs0
+// 16     s11
+// 17     fs0
 // ......
-// 27     fs11
-// 28     checksum
+// 28     fs11
+// 29     checksum
 // _JBLEN: defined in bionic/libc/include/setjmp.h
 
 #define _JB_SIGFLAG   0
 #define _JB_SIGMASK   1 * 8
 #define _JB_RA        2 * 8
-#define _JB_S0        3 * 8
-#define _JB_S1        4 * 8
-#define _JB_S2        5 * 8
-#define _JB_S3        6 * 8
-#define _JB_S4        7 * 8
-#define _JB_S5        8 * 8
-#define _JB_S6        9 * 8
-#define _JB_S7       10 * 8
-#define _JB_S8       11 * 8
-#define _JB_S9       12 * 8
-#define _JB_S10      13 * 8
-#define _JB_S11      14 * 8
-#define _JB_SP       15 * 8
-#define _JB_FS0      16 * 8
-#define _JB_FS1      17 * 8
-#define _JB_FS2      18 * 8
-#define _JB_FS3      19 * 8
-#define _JB_FS4      20 * 8
-#define _JB_FS5      21 * 8
-#define _JB_FS6      22 * 8
-#define _JB_FS7      23 * 8
-#define _JB_FS8      24 * 8
-#define _JB_FS9      25 * 8
-#define _JB_FS10     26 * 8
-#define _JB_FS11     27 * 8
-#define _JB_CHECKSUM 28 * 8
+#define _JB_SP        3 * 8
+#define _JB_GP        4 * 8
+#define _JB_S0        5 * 8
+#define _JB_S1        6 * 8
+#define _JB_S2        7 * 8
+#define _JB_S3        8 * 8
+#define _JB_S4        9 * 8
+#define _JB_S5       10 * 8
+#define _JB_S6       11 * 8
+#define _JB_S7       12 * 8
+#define _JB_S8       13 * 8
+#define _JB_S9       14 * 8
+#define _JB_S10      15 * 8
+#define _JB_S11      16 * 8
+#define _JB_FS0      17 * 8
+#define _JB_FS1      18 * 8
+#define _JB_FS2      19 * 8
+#define _JB_FS3      20 * 8
+#define _JB_FS4      21 * 8
+#define _JB_FS5      22 * 8
+#define _JB_FS6      23 * 8
+#define _JB_FS7      24 * 8
+#define _JB_FS8      25 * 8
+#define _JB_FS9      26 * 8
+#define _JB_FS10     27 * 8
+#define _JB_FS11     28 * 8
+#define _JB_CHECKSUM 29 * 8
 
 .macro m_mangle_registers reg, sp_reg
   xor s0, s0, \reg
   xor s1, s1, \reg
-  xor a4, a4, \reg  // a4 is the masked s2 (x18) for SCS.
+  xor s2, s2, \reg
   xor s3, s3, \reg
   xor s4, s4, \reg
   xor s5, s5, \reg
@@ -89,12 +91,13 @@
   xor s9, s9, \reg
   xor s10, s10, \reg
   xor s11, s11, \reg
+  xor a4, a4, \reg  // a4 is the masked gp (x3) for SCS.
   xor \sp_reg, \sp_reg, \reg
 .endm
 
 .macro m_calculate_checksum dst, src, scratch
   li \dst, 0
-  .irp i,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
+  .irp i,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
     ld \scratch, (\i * 8)(\src)
     xor \dst, \dst, \scratch
   .endr
@@ -152,19 +155,21 @@
   andi a1, a1, -2
 
   // Mask off the high bits of the shadow call stack pointer.
-  // We only store the low bits of x18 to avoid leaking the
+  // We only store the low bits of gp to avoid leaking the
   // shadow call stack address into memory.
   // See the SCS commentary in pthread_internal.h for more detail.
   li a4, SCS_MASK
-  and a4, a4, x18
+  and a4, a4, gp
 
   // Save core registers.
   mv a2, sp
   m_mangle_registers a1, sp_reg=a2
   sd ra,  _JB_RA(a0)
+  sd a4,  _JB_GP(a0)  // a4 is the masked gp (x3) for SCS.
+  sd a2,  _JB_SP(a0)
   sd s0,  _JB_S0(a0)
   sd s1,  _JB_S1(a0)
-  sd a4,  _JB_S2(a0)  // a4 is the masked s2 (x18) for SCS.
+  sd s2,  _JB_S2(a0)
   sd s3,  _JB_S3(a0)
   sd s4,  _JB_S4(a0)
   sd s5,  _JB_S5(a0)
@@ -174,7 +179,6 @@
   sd s9,  _JB_S9(a0)
   sd s10, _JB_S10(a0)
   sd s11, _JB_S11(a0)
-  sd a2,  _JB_SP(a0)
   m_unmangle_registers a1, sp_reg=a2
 
   // Save floating point registers.
@@ -236,9 +240,10 @@
   // Restore core registers.
   andi a2, a2, -2
   ld ra,  _JB_RA(a0)
+  ld a4,  _JB_GP(a0)  // Don't clobber the upper bits of gp (x3) used for SCS yet.
   ld s0,  _JB_S0(a0)
   ld s1,  _JB_S1(a0)
-  ld a4,  _JB_S2(a0)  // Don't clobber s2 (x18) used for SCS yet.
+  ld s2,  _JB_S2(a0)
   ld s3,  _JB_S3(a0)
   ld s4,  _JB_S4(a0)
   ld s5,  _JB_S5(a0)
@@ -254,8 +259,8 @@
 
   // Restore the low bits of the shadow call stack pointer.
   li a5, ~SCS_MASK
-  and x18, x18, a5
-  or x18, a4, x18
+  and gp, gp, a5
+  or gp, gp, a4
 
   addi sp, sp, -24
   sd   ra, 0(sp)
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 15d6d6d..7bf9b40 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -133,14 +133,14 @@
   size_t scs_offset =
       (getpid() == 1) ? 0 : (arc4random_uniform(SCS_GUARD_REGION_SIZE / SCS_SIZE - 1) * SCS_SIZE);
 
-  // Make the stack readable and writable and store its address in x18.
-  // This is deliberately the only place where the address is stored.
+  // Make the stack read-write, and store its address in the register we're using as the shadow
+  // stack pointer. This is deliberately the only place where the address is stored.
   char* scs = scs_aligned_guard_region + scs_offset;
   mprotect(scs, SCS_SIZE, PROT_READ | PROT_WRITE);
 #if defined(__aarch64__)
   __asm__ __volatile__("mov x18, %0" ::"r"(scs));
 #elif defined(__riscv)
-  __asm__ __volatile__("mv x18, %0" ::"r"(scs));
+  __asm__ __volatile__("mv gp, %0" ::"r"(scs));
 #endif
 #endif
 }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 083c2ed..a3a4ccd 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -110,7 +110,8 @@
   // are actually used.
   //
   // This address is only used to deallocate the shadow call stack on thread
-  // exit; the address of the stack itself is stored only in the x18 register.
+  // exit; the address of the stack itself is stored only in the register used
+  // as the shadow stack pointer (x18 on arm64, gp on riscv64).
   //
   // Because the protection offered by SCS relies on the secrecy of the stack
   // address, storing the address here weakens the protection, but only
@@ -119,22 +120,24 @@
   // to other allocations), but not the stack itself, which is <0.1% of the size
   // of the guard region.
   //
-  // longjmp()/setjmp() don't store all the bits of x18, only the bottom bits
-  // covered by SCS_MASK. Since longjmp()/setjmp() between different threads is
-  // undefined behavior (and unsupported on Android), we can retrieve the high
-  // bits of x18 from the current value in x18 --- all the jmp_buf needs to store
-  // is where exactly the shadow stack pointer is in the thread's shadow stack:
-  // the bottom bits of x18.
+  // longjmp()/setjmp() don't store all the bits of the shadow stack pointer,
+  // only the bottom bits covered by SCS_MASK. Since longjmp()/setjmp() between
+  // different threads is undefined behavior (and unsupported on Android), we
+  // can retrieve the high bits of the shadow stack pointer from the current
+  // value in the register --- all the jmp_buf needs to store is where exactly
+  // the shadow stack pointer is *within* the thread's shadow stack: the bottom
+  // bits of the register.
   //
   // There are at least two other options for discovering the start address of
   // the guard region on thread exit, but they are not as simple as storing in
   // TLS.
   //
-  // 1) Derive it from the value of the x18 register. This is only possible in
-  //    processes that do not contain legacy code that might clobber x18,
-  //    therefore each process must declare early during process startup whether
-  //    it might load legacy code.
-  //    TODO: riscv64 has no legacy code, so we can actually go this route there!
+  // 1) Derive it from the current value of the shadow stack pointer. This is
+  //    only possible in processes that do not contain legacy code that might
+  //    clobber x18 on arm64, therefore each process must declare early during
+  //    process startup whether it might load legacy code.
+  //    TODO: riscv64 has no legacy code, so we can actually go this route
+  //    there, but hopefully we'll actually get the Zsslpcfi extension instead.
   // 2) Mark the guard region as such using prctl(PR_SET_VMA_ANON_NAME) and
   //    discover its address by reading /proc/self/maps. One issue with this is
   //    that reading /proc/self/maps can race with allocations, so we may need