Microoptimize the strtol() family.

The main change here is to remember that we arm64/x86-64 have flags, and
__builtin_<op>_overflow() lets us look at them. There's a clear saving
for arm64, and x86-64 is better too, though interestingly there the
unsigned case doesn't work out as well as the signed case because both
unsigned __builtin_mul_overflow and unsigned __builtin_add_overflow are
less efficient than the signed multiply and subtract on that
architecture, but the new code still beats the old code even so.

There's a very tiny microoptimization of the hex path that takes
advantage of the fact that conversion to lowercase is a single
instruction on all our architectures when we already know we're dealing
with a letter.

This also merges the signed and unsigned variants of the code. Not
entirely successfully, but the vast majority of the code benefits.

Before (arm64):
```
----------------------------------------------------------------
Benchmark                      Time             CPU   Iterations
----------------------------------------------------------------
BM_inttypes_strtoimax       44.6 ns         44.3 ns     15807654
BM_inttypes_strtoumax       43.1 ns         42.8 ns     16348848
BM_stdlib_strtol            44.6 ns         44.3 ns     15805384
BM_stdlib_strtol_hex        85.5 ns         85.0 ns      8235487
BM_stdlib_strtoll           44.5 ns         44.2 ns     15833137
BM_stdlib_strtoul           43.1 ns         42.8 ns     16353963
BM_stdlib_strtoul_hex       83.1 ns         82.6 ns      8477732
BM_stdlib_strtoull          43.1 ns         42.8 ns     16353015
```

After (arm64):
```
----------------------------------------------------------------
Benchmark                      Time             CPU   Iterations
----------------------------------------------------------------
BM_inttypes_strtoimax       37.9 ns         37.6 ns     17657577
BM_inttypes_strtoumax       35.9 ns         35.7 ns     19597727
BM_stdlib_strtol            36.9 ns         36.7 ns     19093037
BM_stdlib_strtol_hex        70.7 ns         70.3 ns      9961626
BM_stdlib_strtoll           36.9 ns         36.7 ns     19093032
BM_stdlib_strtoul           35.9 ns         35.7 ns     19617784
BM_stdlib_strtoul_hex       67.7 ns         67.3 ns     10113521
BM_stdlib_strtoull          35.9 ns         35.7 ns     19621828
```

Test: treehugger
Change-Id: Ibf53b29e34d63ac31520c6d27ef80ff281899d61
diff --git a/libc/bionic/strtol.cpp b/libc/bionic/strtol.cpp
index 05b4b53..def7921 100644
--- a/libc/bionic/strtol.cpp
+++ b/libc/bionic/strtol.cpp
@@ -35,141 +35,88 @@
 #include <wchar.h>
 
 template <typename T, T Min, T Max, typename CharT>
-T StrToI(const CharT* nptr, CharT** endptr, int base) {
+__attribute__((always_inline)) T StrToI(const CharT* s, CharT** end_ptr, int base) {
   // Ensure that base is between 2 and 36 inclusive, or the special value of 0.
   if (base < 0 || base == 1 || base > 36) {
-    if (endptr != nullptr) *endptr = const_cast<CharT*>(nptr);
+    if (end_ptr != nullptr) *end_ptr = const_cast<CharT*>(s);
     errno = EINVAL;
     return 0;
   }
 
   // Skip white space and pick up leading +/- sign if any.
-  // If base is 0, allow 0x for hex and 0 for octal, else
-  // assume decimal; if base is already 16, allow 0x.
-  const CharT* s = nptr;
+  const CharT* p = s;
   int c;
-  do {
-    c = *s++;
-  } while (isspace(c));
-  int neg;
-  if (c == '-') {
-    neg = 1;
-    c = *s++;
-  } else {
-    neg = 0;
-    if (c == '+') c = *s++;
+  while (isspace(c = *p++)) {
   }
-  if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X') && isxdigit(s[1])) {
-    c = s[1];
-    s += 2;
+  bool neg = false;
+  if (c == '-') {
+    neg = true;
+    c = *p++;
+  } else if (c == '+') {
+    c = *p++;
+  }
+
+  // If base is 0 or 16, allow "0x" prefix for hex.
+  if ((base == 0 || base == 16) && c == '0' && (*p == 'x' || *p == 'X') && isxdigit(p[1])) {
+    c = p[1];
+    p += 2;
     base = 16;
   }
-  if ((base == 0 || base == 2) && c == '0' && (*s == 'b' || *s == 'B') && isdigit(s[1])) {
-    c = s[1];
-    s += 2;
+  // If base is 0 or 2, allow "0b" prefix for binary.
+  if ((base == 0 || base == 2) && c == '0' && (*p == 'b' || *p == 'B') && isdigit(p[1])) {
+    c = p[1];
+    p += 2;
     base = 2;
   }
+  // If base is 0, allow "0" prefix for octal, otherwise base is 10.
   if (base == 0) base = (c == '0') ? 8 : 10;
 
-  // We always work in the negative space because the most negative value has a
-  // larger magnitude than the most positive value.
-  T cutoff = Min / base;
-  int cutlim = -(Min % base);
+  constexpr bool is_signed = (Min != 0);
+  T acc = 0;
   // Non-zero if any digits consumed; negative to indicate overflow/underflow.
   int any = 0;
-  T acc = 0;
-  for (; ; c = *s++) {
+  for (;; c = *p++) {
     if (isdigit(c)) {
       c -= '0';
     } else if (isalpha(c)) {
-      c -= isupper(c) ? 'A' - 10 : 'a' - 10;
+      c = 10 + (_tolower(c) - 'a');
     } else {
       break;
     }
     if (c >= base) break;
     if (any < 0) continue;
-    if (acc < cutoff || (acc == cutoff && c > cutlim)) {
-      any = -1;
-      acc = Min;
-      errno = ERANGE;
+    if (is_signed) {
+      // We work in the negative space because the most negative value has a
+      // larger magnitude than the most positive value.
+      if (__builtin_mul_overflow(acc, base, &acc) || __builtin_sub_overflow(acc, c, &acc)) {
+        any = -1;
+        continue;
+      }
     } else {
-      any = 1;
-      acc *= base;
-      acc -= c;
+      if (__builtin_mul_overflow(acc, base, &acc) || __builtin_add_overflow(acc, c, &acc)) {
+        any = -1;
+        continue;
+      }
     }
-  }
-  if (endptr != nullptr) *endptr = const_cast<CharT*>(any ? s - 1 : nptr);
-  if (!neg) {
-    if (acc == Min) {
-      errno = ERANGE;
-      acc = Max;
-    } else {
-      acc = -acc;
-    }
-  }
-  return acc;
-}
-
-template <typename T, T Max, typename CharT>
-T StrToU(const CharT* nptr, CharT** endptr, int base) {
-  if (base < 0 || base == 1 || base > 36) {
-    if (endptr != nullptr) *endptr = const_cast<CharT*>(nptr);
-    errno = EINVAL;
-    return 0;
+    any = 1;
   }
 
-  const CharT* s = nptr;
-  int c;
-  do {
-    c = *s++;
-  } while (isspace(c));
-  int neg;
-  if (c == '-') {
-    neg = 1;
-    c = *s++;
-  } else {
-    neg = 0;
-    if (c == '+') c = *s++;
-  }
-  if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X') && isxdigit(s[1])) {
-    c = s[1];
-    s += 2;
-    base = 16;
-  }
-  if ((base == 0 || base == 2) && c == '0' && (*s == 'b' || *s == 'B') && isdigit(s[1])) {
-    c = s[1];
-    s += 2;
-    base = 2;
-  }
-  if (base == 0) base = (c == '0') ? 8 : 10;
+  if (end_ptr != nullptr) *end_ptr = const_cast<CharT*>(any ? p - 1 : s);
 
-  T cutoff = Max / static_cast<T>(base);
-  int cutlim = Max % static_cast<T>(base);
-  T acc = 0;
-  int any = 0;
-  for (; ; c = *s++) {
-    if (isdigit(c)) {
-      c -= '0';
-    } else if (isalpha(c)) {
-      c -= isupper(c) ? 'A' - 10 : 'a' - 10;
-    } else {
-      break;
-    }
-    if (c >= base) break;
-    if (any < 0) continue;
-    if (acc > cutoff || (acc == cutoff && c > cutlim)) {
-      any = -1;
-      acc = Max;
-      errno = ERANGE;
-    } else {
-      any = 1;
-      acc *= base;
-      acc += c;
-    }
+  // Detected overflow/underflow in the loop?
+  if (any == -1) {
+    errno = ERANGE;
+    return (is_signed && neg) ? Min : Max;
   }
-  if (neg && any > 0) acc = -acc;
-  if (endptr != nullptr) *endptr = const_cast<CharT*>(any ? s - 1 : nptr);
-  return acc;
+
+  // Will we overflow by trying to negate the most negative value?
+  if (any > 0 && is_signed && !neg && acc == Min) {
+    errno = ERANGE;
+    return Max;
+  }
+
+  if (is_signed) return neg ? acc : -acc;
+  return neg ? -acc : acc;
 }
 
 int atoi(const char* s) {
@@ -209,25 +156,25 @@
 }
 
 unsigned long strtoul(const char* s, char** end, int base) {
-  return StrToU<unsigned long, ULONG_MAX, char>(s, end, base);
+  return StrToI<unsigned long, 0, ULONG_MAX, char>(s, end, base);
 }
 
 unsigned long wcstoul(const wchar_t* s, wchar_t** end, int base) {
-  return StrToU<unsigned long, ULONG_MAX, wchar_t>(s, end, base);
+  return StrToI<unsigned long, 0, ULONG_MAX, wchar_t>(s, end, base);
 }
 
 unsigned long long strtoull(const char* s, char** end, int base) {
-  return StrToU<unsigned long long, ULLONG_MAX, char>(s, end, base);
+  return StrToI<unsigned long long, 0, ULLONG_MAX, char>(s, end, base);
 }
 
 unsigned long long wcstoull(const wchar_t* s, wchar_t** end, int base) {
-  return StrToU<unsigned long long, ULLONG_MAX, wchar_t>(s, end, base);
+  return StrToI<unsigned long long, 0, ULLONG_MAX, wchar_t>(s, end, base);
 }
 
 uintmax_t strtoumax(const char* s, char** end, int base) {
-  return StrToU<uintmax_t, UINTMAX_MAX, char>(s, end, base);
+  return StrToI<uintmax_t, 0, UINTMAX_MAX, char>(s, end, base);
 }
 
 uintmax_t wcstoumax(const wchar_t* s, wchar_t** end, int base) {
-  return StrToU<uintmax_t, UINTMAX_MAX, wchar_t>(s, end, base);
+  return StrToI<uintmax_t, 0, UINTMAX_MAX, wchar_t>(s, end, base);
 }