Re-do static function dispatch.

Until now we had ifuncs for the string/memory routines in dynamic
executables, but for static executables we only had trivial one-line
assembler stubs that branched to the default implementation in all
cases. I did recently experiment with running the ifuncs earlier in
static executables, but that actually broke one of our tests: although
our internal ifuncs are built very carefully, an ifunc built as a test
ends up with shadow call stack code that writes and reads x18 in the
prolog and epilog. At that point the shadow call stack (and x18) isn't
set up yet, which leads to a crash (before any of the usual signal
handlers have been installed, so without any useful diagnostics).

While as far as I can tell, basically no-one outside of libc actually
uses ifuncs (in part because they're not available on non-Linux systems,
and if you're going to have to have a different implementation for other
OSes, you usually end up preferring to just use the same implementation
everywhere), *and* this only affects static executables, which are also
barely used in practice, I still don't fancy the app compat risk of
just saying "meh, anyone with ifuncs will have to make sure they're safe".

Which leads us to this change, which is basically a poor man's ifunc,
with function-scoped static function pointers as a poor man's GOT.
(Which means that, while not ideal performance-wise, this isn't actually
much worse than a dynamic executable.)

This change also factors out the typedefs, since there were a handful
of mistakes in some of the copies.

Test: run the static benchmarks on various architectures
Change-Id: Iec945b710174bebaa27cf71ed0101e7f5d769697
diff --git a/libc/arch-arm/dynamic_function_dispatch.cpp b/libc/arch-arm/dynamic_function_dispatch.cpp
index 1d2f38f..f984421 100644
--- a/libc/arch-arm/dynamic_function_dispatch.cpp
+++ b/libc/arch-arm/dynamic_function_dispatch.cpp
@@ -27,27 +27,26 @@
  */
 
 #include <fcntl.h>
-#include <sys/syscall.h>
-
 #include <private/bionic_ifuncs.h>
+#include <sys/syscall.h>
 
 extern "C" {
 
 enum CpuVariant {
-    kUnknown = 0,
-    kGeneric,
-    kCortexA7,
-    kCortexA9,
-    kCortexA53,
-    kCortexA55,
-    kKrait,
-    kKryo,
+  kUnknown = 0,
+  kGeneric,
+  kCortexA7,
+  kCortexA9,
+  kCortexA53,
+  kCortexA55,
+  kKrait,
+  kKryo,
 };
 
 static constexpr int MAX_CPU_NAME_LEN = 12;
 struct CpuVariantNames {
-    alignas(alignof(int)) char name[MAX_CPU_NAME_LEN];
-    CpuVariant variant;
+  alignas(alignof(int)) char name[MAX_CPU_NAME_LEN];
+  CpuVariant variant;
 };
 
 static constexpr CpuVariantNames cpu_variant_names[] = {
@@ -66,227 +65,237 @@
 };
 
 static long ifunc_open(const char* pathname) {
-    register long r0 __asm__("r0") = AT_FDCWD;
-    register long r1 __asm__("r1") = reinterpret_cast<long>(pathname);
-    register long r2 __asm__("r2") = O_RDONLY;
-    register long r3 __asm__("r3") = 0;
-    register long r7 __asm__("r7") = __NR_openat;
-    __asm__ volatile("swi #0" : "=r"(r0) : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r7));
-    return r0;
+  register long r0 __asm__("r0") = AT_FDCWD;
+  register long r1 __asm__("r1") = reinterpret_cast<long>(pathname);
+  register long r2 __asm__("r2") = O_RDONLY;
+  register long r3 __asm__("r3") = 0;
+  register long r7 __asm__("r7") = __NR_openat;
+  __asm__ volatile("swi #0"
+                   : "=r"(r0)
+                   : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r7));
+  return r0;
 }
 
 static ssize_t ifunc_read(int fd, void* buf, size_t count) {
-    register long r0 __asm__("r0") = fd;
-    register long r1 __asm__("r1") = reinterpret_cast<long>(buf);
-    register long r2 __asm__("r2") = count;
-    register long r7 __asm__("r7") = __NR_read;
-    __asm__ volatile("swi #0" : "=r"(r0) : "r"(r0), "r"(r1), "r"(r2), "r"(r7) : "memory");
-    return r0;
+  register long r0 __asm__("r0") = fd;
+  register long r1 __asm__("r1") = reinterpret_cast<long>(buf);
+  register long r2 __asm__("r2") = count;
+  register long r7 __asm__("r7") = __NR_read;
+  __asm__ volatile("swi #0"
+                   : "=r"(r0)
+                   : "r"(r0), "r"(r1), "r"(r2), "r"(r7)
+                   : "memory");
+  return r0;
 }
 
 static int ifunc_close(int fd) {
-    register long r0 __asm__("r0") = fd;
-    register long r7 __asm__("r7") = __NR_close;
-    __asm__ volatile("swi #0" : "=r"(r0) : "r"(r0), "r"(r7));
-    return r0;
+  register long r0 __asm__("r0") = fd;
+  register long r7 __asm__("r7") = __NR_close;
+  __asm__ volatile("swi #0" : "=r"(r0) : "r"(r0), "r"(r7));
+  return r0;
 }
 
 static bool is_same_name(const char* a, const char* b) {
-    static_assert(MAX_CPU_NAME_LEN % sizeof(int) == 0, "");
-    const int* ia = reinterpret_cast<const int*>(a);
-    const int* ib = reinterpret_cast<const int*>(b);
-    for (size_t i = 0; i < MAX_CPU_NAME_LEN / sizeof(int); ++i) {
-        if (ia[i] != ib[i]) {
-            return false;
-        }
+  static_assert(MAX_CPU_NAME_LEN % sizeof(int) == 0, "");
+  const int* ia = reinterpret_cast<const int*>(a);
+  const int* ib = reinterpret_cast<const int*>(b);
+  for (size_t i = 0; i < MAX_CPU_NAME_LEN / sizeof(int); ++i) {
+    if (ia[i] != ib[i]) {
+      return false;
     }
-    return true;
+  }
+  return true;
 }
 
 static CpuVariant init_cpu_variant() {
-    int fd = ifunc_open("/dev/cpu_variant:arm");
-    if (fd < 0) return kGeneric;
+  int fd = ifunc_open("/dev/cpu_variant:arm");
+  if (fd < 0) return kGeneric;
 
-    alignas(alignof(int)) char name[MAX_CPU_NAME_LEN] = {};
+  alignas(alignof(int)) char name[MAX_CPU_NAME_LEN] = {};
 
-    int bytes_read, total_read = 0;
-    while (total_read < MAX_CPU_NAME_LEN - 1 &&
-           (bytes_read = ifunc_read(fd, name + total_read,
-                                    MAX_CPU_NAME_LEN - 1 - total_read)) > 0) {
-        total_read += bytes_read;
-    }
-    ifunc_close(fd);
+  int bytes_read, total_read = 0;
+  while (total_read < MAX_CPU_NAME_LEN - 1 &&
+         (bytes_read = ifunc_read(fd, name + total_read,
+                                  MAX_CPU_NAME_LEN - 1 - total_read)) > 0) {
+    total_read += bytes_read;
+  }
+  ifunc_close(fd);
 
-    if (bytes_read != 0) {
-        // The file is too big. We haven't reach the end. Or maybe there is an
-        // error when reading.
-        return kGeneric;
-    }
-    name[total_read] = 0;
-
-    const CpuVariantNames* cpu_variant = cpu_variant_names;
-    while (cpu_variant->variant != kUnknown) {
-        if (is_same_name(cpu_variant->name, name)) {
-            return cpu_variant->variant;
-        }
-        cpu_variant++;
-    }
+  if (bytes_read != 0) {
+    // The file is too big. We haven't reached the end. Or maybe there is an
+    // error when reading.
     return kGeneric;
+  }
+  name[total_read] = 0;
+
+  const CpuVariantNames* cpu_variant = cpu_variant_names;
+  while (cpu_variant->variant != kUnknown) {
+    if (is_same_name(cpu_variant->name, name)) {
+      return cpu_variant->variant;
+    }
+    cpu_variant++;
+  }
+  return kGeneric;
 }
 
 static CpuVariant get_cpu_variant() {
-    static CpuVariant cpu_variant = kUnknown;
-    if (cpu_variant == kUnknown) {
-        cpu_variant = init_cpu_variant();
-    }
-    return cpu_variant;
+  static CpuVariant cpu_variant = kUnknown;
+  if (cpu_variant == kUnknown) {
+    cpu_variant = init_cpu_variant();
+  }
+  return cpu_variant;
 }
 
-typedef void* memmove_func(void* __dst, const void* __src, size_t __n);
 DEFINE_IFUNC_FOR(memmove) {
-    RETURN_FUNC(memmove_func, memmove_a15);
+  RETURN_FUNC(memmove_func_t, memmove_a15);
 }
+MEMMOVE_SHIM()
 
-typedef void* memcpy_func(void*, const void*, size_t);
 DEFINE_IFUNC_FOR(memcpy) {
-    return memmove_resolver(hwcap);
+  return memmove_resolver(hwcap);
 }
+MEMCPY_SHIM()
 
-typedef void* __memcpy_func(void*, const void*, size_t);
+// On arm32, __memcpy() is not publicly exposed, but gets called by memmove()
+// in cases where the copy is known to be overlap-safe.
+typedef void* __memcpy_func_t(void*, const void*, size_t);
 DEFINE_IFUNC_FOR(__memcpy) {
-    switch(get_cpu_variant()) {
-        case kCortexA7:
-            RETURN_FUNC(__memcpy_func, __memcpy_a7);
-        case kCortexA9:
-            RETURN_FUNC(__memcpy_func, __memcpy_a9);
-        case kKrait:
-            RETURN_FUNC(__memcpy_func, __memcpy_krait);
-        case kCortexA53:
-            RETURN_FUNC(__memcpy_func, __memcpy_a53);
-        case kCortexA55:
-            RETURN_FUNC(__memcpy_func, __memcpy_a55);
-        case kKryo:
-            RETURN_FUNC(__memcpy_func, __memcpy_kryo);
-        default:
-            RETURN_FUNC(__memcpy_func, __memcpy_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA7:
+      RETURN_FUNC(__memcpy_func_t, __memcpy_a7);
+    case kCortexA9:
+      RETURN_FUNC(__memcpy_func_t, __memcpy_a9);
+    case kKrait:
+      RETURN_FUNC(__memcpy_func_t, __memcpy_krait);
+    case kCortexA53:
+      RETURN_FUNC(__memcpy_func_t, __memcpy_a53);
+    case kCortexA55:
+      RETURN_FUNC(__memcpy_func_t, __memcpy_a55);
+    case kKryo:
+      RETURN_FUNC(__memcpy_func_t, __memcpy_kryo);
+    default:
+      RETURN_FUNC(__memcpy_func_t, __memcpy_a15);
+  }
 }
+DEFINE_STATIC_SHIM(void* __memcpy(void* dst, const void* src, size_t n) {
+  FORWARD(__memcpy)(dst, src, n);
+})
 
-typedef void* __memset_chk_func(void* s, int c, size_t n, size_t n2);
 DEFINE_IFUNC_FOR(__memset_chk) {
-    switch(get_cpu_variant()) {
-        case kCortexA7:
-        case kCortexA53:
-        case kCortexA55:
-        case kKryo:
-            RETURN_FUNC(__memset_chk_func, __memset_chk_a7);
-        case kCortexA9:
-            RETURN_FUNC(__memset_chk_func, __memset_chk_a9);
-        case kKrait:
-            RETURN_FUNC(__memset_chk_func, __memset_chk_krait);
-        default:
-            RETURN_FUNC(__memset_chk_func, __memset_chk_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA7:
+    case kCortexA53:
+    case kCortexA55:
+    case kKryo:
+      RETURN_FUNC(__memset_chk_func_t, __memset_chk_a7);
+    case kCortexA9:
+      RETURN_FUNC(__memset_chk_func_t, __memset_chk_a9);
+    case kKrait:
+      RETURN_FUNC(__memset_chk_func_t, __memset_chk_krait);
+    default:
+      RETURN_FUNC(__memset_chk_func_t, __memset_chk_a15);
+  }
 }
+__MEMSET_CHK_SHIM()
 
-typedef void* memset_func(void* __dst, int __ch, size_t __n);
 DEFINE_IFUNC_FOR(memset) {
-    switch(get_cpu_variant()) {
-        case kCortexA7:
-        case kCortexA53:
-        case kCortexA55:
-        case kKryo:
-             RETURN_FUNC(memset_func, memset_a7);
-        case kCortexA9:
-             RETURN_FUNC(memset_func, memset_a9);
-        case kKrait:
-             RETURN_FUNC(memset_func, memset_krait);
-        default:
-             RETURN_FUNC(memset_func, memset_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA7:
+    case kCortexA53:
+    case kCortexA55:
+    case kKryo:
+      RETURN_FUNC(memset_func_t, memset_a7);
+    case kCortexA9:
+      RETURN_FUNC(memset_func_t, memset_a9);
+    case kKrait:
+      RETURN_FUNC(memset_func_t, memset_krait);
+    default:
+      RETURN_FUNC(memset_func_t, memset_a15);
+  }
 }
+MEMSET_SHIM()
 
-typedef char* strcpy_func(char* __dst, const char* __src);
 DEFINE_IFUNC_FOR(strcpy) {
-    switch(get_cpu_variant()) {
-        case kCortexA9:
-            RETURN_FUNC(strcpy_func, strcpy_a9);
-        default:
-            RETURN_FUNC(strcpy_func, strcpy_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA9:
+      RETURN_FUNC(strcpy_func_t, strcpy_a9);
+    default:
+      RETURN_FUNC(strcpy_func_t, strcpy_a15);
+  }
 }
+STRCPY_SHIM()
 
-typedef char* __strcpy_chk_func(char* dst, const char* src, size_t dst_len);
 DEFINE_IFUNC_FOR(__strcpy_chk) {
-    switch(get_cpu_variant()) {
-        case kCortexA7:
-            RETURN_FUNC(__strcpy_chk_func, __strcpy_chk_a7);
-        case kCortexA9:
-            RETURN_FUNC(__strcpy_chk_func, __strcpy_chk_a9);
-        case kKrait:
-        case kKryo:
-            RETURN_FUNC(__strcpy_chk_func, __strcpy_chk_krait);
-        case kCortexA53:
-            RETURN_FUNC(__strcpy_chk_func, __strcpy_chk_a53);
-        case kCortexA55:
-            RETURN_FUNC(__strcpy_chk_func, __strcpy_chk_a55);
-        default:
-            RETURN_FUNC(__strcpy_chk_func, __strcpy_chk_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA7:
+      RETURN_FUNC(__strcpy_chk_func_t, __strcpy_chk_a7);
+    case kCortexA9:
+      RETURN_FUNC(__strcpy_chk_func_t, __strcpy_chk_a9);
+    case kKrait:
+    case kKryo:
+      RETURN_FUNC(__strcpy_chk_func_t, __strcpy_chk_krait);
+    case kCortexA53:
+      RETURN_FUNC(__strcpy_chk_func_t, __strcpy_chk_a53);
+    case kCortexA55:
+      RETURN_FUNC(__strcpy_chk_func_t, __strcpy_chk_a55);
+    default:
+      RETURN_FUNC(__strcpy_chk_func_t, __strcpy_chk_a15);
+  }
 }
+__STRCPY_CHK_SHIM()
 
-typedef char* stpcpy_func(char* __dst, const char* __src);
 DEFINE_IFUNC_FOR(stpcpy) {
-    switch(get_cpu_variant()) {
-        case kCortexA9:
-            RETURN_FUNC(stpcpy_func, stpcpy_a9);
-        default:
-            RETURN_FUNC(stpcpy_func, stpcpy_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA9:
+      RETURN_FUNC(stpcpy_func_t, stpcpy_a9);
+    default:
+      RETURN_FUNC(stpcpy_func_t, stpcpy_a15);
+  }
 }
+STPCPY_SHIM()
 
-typedef char* strcat_func(char* __dst, const char* __src);
 DEFINE_IFUNC_FOR(strcat) {
-    switch(get_cpu_variant()) {
-        case kCortexA9:
-            RETURN_FUNC(strcat_func, strcat_a9);
-        default:
-            RETURN_FUNC(strcat_func, strcat_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA9:
+      RETURN_FUNC(strcat_func_t, strcat_a9);
+    default:
+      RETURN_FUNC(strcat_func_t, strcat_a15);
+  }
 }
+STRCAT_SHIM()
 
-typedef char* __strcat_chk_func(char* dst, const char* src, size_t dst_buf_size);
 DEFINE_IFUNC_FOR(__strcat_chk) {
-    switch(get_cpu_variant()) {
-        case kCortexA7:
-            RETURN_FUNC(__strcat_chk_func, __strcat_chk_a7);
-        case kCortexA9:
-            RETURN_FUNC(__strcat_chk_func, __strcat_chk_a9);
-        case kKrait:
-        case kKryo:
-            RETURN_FUNC(__strcat_chk_func, __strcat_chk_krait);
-        case kCortexA53:
-            RETURN_FUNC(__strcat_chk_func, __strcat_chk_a53);
-        case kCortexA55:
-            RETURN_FUNC(__strcat_chk_func, __strcat_chk_a55);
-        default:
-            RETURN_FUNC(__strcat_chk_func, __strcat_chk_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA7:
+      RETURN_FUNC(__strcat_chk_func_t, __strcat_chk_a7);
+    case kCortexA9:
+      RETURN_FUNC(__strcat_chk_func_t, __strcat_chk_a9);
+    case kKrait:
+    case kKryo:
+      RETURN_FUNC(__strcat_chk_func_t, __strcat_chk_krait);
+    case kCortexA53:
+      RETURN_FUNC(__strcat_chk_func_t, __strcat_chk_a53);
+    case kCortexA55:
+      RETURN_FUNC(__strcat_chk_func_t, __strcat_chk_a55);
+    default:
+      RETURN_FUNC(__strcat_chk_func_t, __strcat_chk_a15);
+  }
 }
+__STRCAT_CHK_SHIM()
 
-typedef int strcmp_func(const char* __lhs, const char* __rhs);
 DEFINE_IFUNC_FOR(strcmp) {
-    RETURN_FUNC(strcmp_func, strcmp_a15);
+  RETURN_FUNC(strcmp_func_t, strcmp_a15);
 }
+STRCMP_SHIM()
 
-typedef size_t strlen_func(const char* __s);
 DEFINE_IFUNC_FOR(strlen) {
-    switch(get_cpu_variant()) {
-        case kCortexA9:
-            RETURN_FUNC(strlen_func, strlen_a9);
-        default:
-            RETURN_FUNC(strlen_func, strlen_a15);
-    }
+  switch (get_cpu_variant()) {
+    case kCortexA9:
+      RETURN_FUNC(strlen_func_t, strlen_a9);
+    default:
+      RETURN_FUNC(strlen_func_t, strlen_a15);
+  }
 }
+STRLEN_SHIM()
 
 }  // extern "C"