riscv64: use `tail` for tail calls.

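Don't assume that the target of a `j` will always be in range: `j` is
just `jal zero, offset`, which can only reach +/-1MiB. Use the `tail`
pseudoinstruction instead and let the toolchain come up with the
shortest sequence that will actually work (the linker can still relax
it back to a single jump when the target turns out to be close enough).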

Test: llvm-objdump -d
Change-Id: I497a68ecae434aad173c2b1c8024aed3756b8440
diff --git a/libc/arch-riscv64/bionic/__bionic_clone.S b/libc/arch-riscv64/bionic/__bionic_clone.S
index d535095..2827857 100644
--- a/libc/arch-riscv64/bionic/__bionic_clone.S
+++ b/libc/arch-riscv64/bionic/__bionic_clone.S
@@ -51,7 +51,7 @@
 .L_bc_failure:
   # Set errno if something went wrong.
   neg a0, a0
-  j __set_errno_internal
+  tail __set_errno_internal
 
 .L_bc_child:
   # We're in the child now. Set the end of the frame record chain.
@@ -62,5 +62,5 @@
   ld a0, 0(sp)
   ld a1, 8(sp)
   addi sp, sp, 16
-  j __start_thread
+  tail __start_thread
 END(__bionic_clone)