Merge "Define MAXHOSTNAMELEN explicitly in source files."
diff --git a/benchmarks/semaphore_benchmark.cpp b/benchmarks/semaphore_benchmark.cpp
index a11fcc1..f383da4 100644
--- a/benchmarks/semaphore_benchmark.cpp
+++ b/benchmarks/semaphore_benchmark.cpp
@@ -16,7 +16,10 @@
 
 #include "benchmark.h"
 
+#include <pthread.h>
 #include <semaphore.h>
+#include <stdatomic.h>
+#include <stdio.h>
 
 static void BM_semaphore_sem_getvalue(int iters) {
   StopBenchmarkTiming();
@@ -47,3 +50,94 @@
   StopBenchmarkTiming();
 }
 BENCHMARK(BM_semaphore_sem_wait_sem_post);
+
+/*
+ *    This test reports the overhead of the underlying futex wake syscall on
+ * the producer. It does not report the overhead from issuing the wake to the
+ * point where the posted consumer thread wakes up. It suffers from
+ * clock_gettime syscall overhead. Lock the CPU speed for consistent results
+ * as we may not reach >50% cpu utilization.
+ *
+ *    We will run a background thread that catches the sem_post wakeup and
+ * loops immediately returning back to sleep in sem_wait for the next one. This
+ * thread is run with policy SCHED_OTHER (normal policy), a middle policy.
+ *
+ *    The primary thread will run at SCHED_IDLE (lowest priority policy) when
+ * monitoring the background thread to detect when it hits sem_wait sleep. It
+ * will do so with no clock running. Once we are ready, we will switch to
+ * SCHED_FIFO (highest priority policy) to time the act of running sem_post
+ * with the benchmark clock running. This ensures nothing else in the system
+ * can preempt our timed activity, including the background thread. We are
+ * also protected with the scheduling policy of letting a process hit a
+ * resource limit rather than get hit with a context switch.
+ *
+ *    The background thread will start executing either on another CPU, or
+ * after we back down from SCHED_FIFO, but certainly not in the context of
+ * the timing of the sem_post.
+ */
+static atomic_int BM_semaphore_sem_post_running;
+
+static void *BM_semaphore_sem_post_start_thread(void *obj) {  // Background consumer; obj is the sem_t to wait on.
+    sem_t *semaphore = reinterpret_cast<sem_t *>(obj);
+
+    while ((BM_semaphore_sem_post_running > 0) && !sem_wait(semaphore)) {  // Run while the flag is positive and sem_wait succeeds.
+        ;  // Nothing to do per wakeup: go straight back to sem_wait.
+    }
+    BM_semaphore_sem_post_running = -1;  // Negative flag tells the benchmark thread that this thread has exited.
+    return NULL;
+}
+
+static void BM_semaphore_sem_post(int iters) {  // Times iters calls to sem_post against a waiting background thread.
+  StopBenchmarkTiming();  // Keep the clock stopped during setup.
+
+  sem_t semaphore;
+  sem_init(&semaphore, 0, 0);  // pshared = 0, initial value = 0.
+
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  BM_semaphore_sem_post_running = 1;  // Must be positive before the thread starts or its loop exits at once.
+  struct sched_param param = { 0, };  // sched_priority starts at 0.
+  pthread_attr_setschedparam(&attr, &param);
+  pthread_attr_setschedpolicy(&attr, SCHED_OTHER);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);  // Never joined; shutdown is coordinated via the flag below.
+  pthread_t pthread;
+  pthread_create(&pthread, &attr, BM_semaphore_sem_post_start_thread, &semaphore);  // NOTE(review): return value is not checked.
+  pthread_attr_destroy(&attr);
+
+  sched_setscheduler((pid_t)0, SCHED_IDLE, &param);  // Lowest policy while watching the background thread.
+  for (int i = 0; i < iters; ++i) {
+    int trys = 3, dummy = 0;  // trys: how many zero readings to accept; dummy: value reported by sem_getvalue.
+    do {
+      if (BM_semaphore_sem_post_running < 0) {  // The background thread has already exited.
+        sched_setscheduler((pid_t)0, SCHED_OTHER, &param);  // Restore the normal policy before bailing out.
+        fprintf(stderr, "BM_semaphore_sem_post: start_thread died unexpectedly\n");
+        return;
+      }
+      sched_yield();  // Give the background thread a chance to run.
+      sem_getvalue(&semaphore, &dummy);
+      if (dummy < 0) {  // POSIX.1-2001 possibility 1: a negative value reports blocked waiters.
+        break;
+      }
+      if (dummy == 0) { // POSIX.1-2001 possibility 2: zero is reported; count this reading.
+        --trys;
+      }
+    } while (trys);  // A positive value neither breaks nor counts, so polling continues.
+    param.sched_priority = 1;
+    sched_setscheduler((pid_t)0, SCHED_FIFO, &param);  // Raise to SCHED_FIFO for the timed region only.
+    StartBenchmarkTiming();
+    sem_post(&semaphore);  // The only call inside the timed region.
+    StopBenchmarkTiming(); // Remember to subtract clock syscall overhead
+    param.sched_priority = 0;
+    sched_setscheduler((pid_t)0, SCHED_IDLE, &param);  // Drop back down for the next polling round.
+  }
+  sched_setscheduler((pid_t)0, SCHED_OTHER, &param);  // Restore the normal policy (priority is 0 here).
+
+  if (BM_semaphore_sem_post_running > 0) {  // Thread still running: ask it to stop.
+    BM_semaphore_sem_post_running = 0;
+  }
+  do {
+    sem_post(&semaphore);  // Wake the thread so it re-checks the flag.
+    sched_yield();
+  } while (!BM_semaphore_sem_post_running);  // Wait for the thread to set the flag to -1 before the stack sem_t goes away.
+}
+BENCHMARK(BM_semaphore_sem_post);
diff --git a/libc/bionic/pthread_cond.cpp b/libc/bionic/pthread_cond.cpp
index 32ff81a..5542c59 100644
--- a/libc/bionic/pthread_cond.cpp
+++ b/libc/bionic/pthread_cond.cpp
@@ -30,13 +30,13 @@
 
 #include <errno.h>
 #include <limits.h>
+#include <stdatomic.h>
 #include <sys/mman.h>
 #include <time.h>
 #include <unistd.h>
 
 #include "pthread_internal.h"
 
-#include "private/bionic_atomic_inline.h"
 #include "private/bionic_futex.h"
 #include "private/bionic_time_conversions.h"
 #include "private/bionic_tls.h"
@@ -98,6 +98,14 @@
   return 0;
 }
 
+static inline atomic_uint* COND_TO_ATOMIC_POINTER(pthread_cond_t* cond) {  // Views cond->value as an atomic_uint.
+  static_assert(sizeof(atomic_uint) == sizeof(cond->value),
+                "cond->value should actually be atomic_uint in implementation.");
+
+  // We prefer casting to atomic_uint over declaring cond->value as atomic_uint directly,
+  // because the latter pollutes pthread.h and causes an error when compiling libcxx.
+  return reinterpret_cast<atomic_uint*>(&cond->value);
+}
 
 // XXX *technically* there is a race condition that could allow
 // XXX a signal to be missed.  If thread A is preempted in _wait()
@@ -107,53 +115,54 @@
 // XXX then the signal will be lost.
 
 int pthread_cond_init(pthread_cond_t* cond, const pthread_condattr_t* attr) {
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  unsigned int init_value = 0;
+
   if (attr != NULL) {
-    cond->value = (*attr & COND_FLAGS_MASK);
-  } else {
-    cond->value = 0;
+    init_value = (*attr & COND_FLAGS_MASK);
   }
+  atomic_init(cond_value_ptr, init_value);
 
   return 0;
 }
 
 int pthread_cond_destroy(pthread_cond_t* cond) {
-  cond->value = 0xdeadc04d;
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  atomic_store_explicit(cond_value_ptr, 0xdeadc04d, memory_order_relaxed);
   return 0;
 }
 
 // This function is used by pthread_cond_broadcast and
 // pthread_cond_signal to atomically decrement the counter
-// then wake up 'counter' threads.
-static int __pthread_cond_pulse(pthread_cond_t* cond, int counter) {
-  int flags = (cond->value & COND_FLAGS_MASK);
-  while (true) {
-    int old_value = cond->value;
-    int new_value = ((old_value - COND_COUNTER_STEP) & COND_COUNTER_MASK) | flags;
-    if (__bionic_cmpxchg(old_value, new_value, &cond->value) == 0) {
-      break;
-    }
-  }
+// then wake up thread_count threads.
+static int __pthread_cond_pulse(atomic_uint* cond_value_ptr, int thread_count) {
+  unsigned int old_value = atomic_load_explicit(cond_value_ptr, memory_order_relaxed);
+  bool shared = COND_IS_SHARED(old_value);
 
-  // Ensure that all memory accesses previously made by this thread are
-  // visible to the woken thread(s).  On the other side, the "wait"
-  // code will issue any necessary barriers when locking the mutex.
-  //
-  // This may not strictly be necessary -- if the caller follows
-  // recommended practice and holds the mutex before signaling the cond
-  // var, the mutex ops will provide correct semantics.  If they don't
-  // hold the mutex, they're subject to race conditions anyway.
-  ANDROID_MEMBAR_FULL();
+  // We don't use a release/seq_cst fence here, because pthread_cond_wait/signal can't be
+  // used as a method for memory synchronization by itself. It should always be used with
+  // pthread mutexes. Note that spurious wakeups from pthread_cond_wait/timedwait may occur,
+  // so when using condition variables there is always a boolean predicate involving shared
+  // variables associated with each condition wait that is true if the thread should proceed.
+  // If the predicate is seen true before a condition wait, pthread_cond_wait/timedwait will
+  // not be called. That's why a pthread_cond_wait/signal pair can't be used as a method for
+  // memory synchronization, and adding a fence here would not help.
 
-  __futex_wake_ex(&cond->value, COND_IS_SHARED(cond->value), counter);
+  // The increase of value should leave flags alone, even if the value can overflow.
+  atomic_fetch_add_explicit(cond_value_ptr, COND_COUNTER_STEP, memory_order_relaxed);
+
+  __futex_wake_ex(cond_value_ptr, shared, thread_count);
   return 0;
 }
 
 __LIBC_HIDDEN__
-int __pthread_cond_timedwait_relative(pthread_cond_t* cond, pthread_mutex_t* mutex, const timespec* reltime) {
-  int old_value = cond->value;
+int __pthread_cond_timedwait_relative(atomic_uint* cond_value_ptr, pthread_mutex_t* mutex,
+                                      const timespec* reltime) {
+  unsigned int old_value = atomic_load_explicit(cond_value_ptr, memory_order_relaxed);
+  bool shared = COND_IS_SHARED(old_value);
 
   pthread_mutex_unlock(mutex);
-  int status = __futex_wait_ex(&cond->value, COND_IS_SHARED(cond->value), old_value, reltime);
+  int status = __futex_wait_ex(cond_value_ptr, shared, old_value, reltime);
   pthread_mutex_lock(mutex);
 
   if (status == -ETIMEDOUT) {
@@ -163,7 +172,8 @@
 }
 
 __LIBC_HIDDEN__
-int __pthread_cond_timedwait(pthread_cond_t* cond, pthread_mutex_t* mutex, const timespec* abs_ts, clockid_t clock) {
+int __pthread_cond_timedwait(atomic_uint* cond_value_ptr, pthread_mutex_t* mutex,
+                             const timespec* abs_ts, clockid_t clock) {
   timespec ts;
   timespec* tsp;
 
@@ -176,42 +186,52 @@
     tsp = NULL;
   }
 
-  return __pthread_cond_timedwait_relative(cond, mutex, tsp);
+  return __pthread_cond_timedwait_relative(cond_value_ptr, mutex, tsp);
 }
 
 int pthread_cond_broadcast(pthread_cond_t* cond) {
-  return __pthread_cond_pulse(cond, INT_MAX);
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  return __pthread_cond_pulse(cond_value_ptr, INT_MAX);
 }
 
 int pthread_cond_signal(pthread_cond_t* cond) {
-  return __pthread_cond_pulse(cond, 1);
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  return __pthread_cond_pulse(cond_value_ptr, 1);
 }
 
 int pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex) {
-  return __pthread_cond_timedwait(cond, mutex, NULL, COND_GET_CLOCK(cond->value));
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  return __pthread_cond_timedwait(cond_value_ptr, mutex, NULL,
+           COND_GET_CLOCK(atomic_load_explicit(cond_value_ptr, memory_order_relaxed)));
 }
 
 int pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t * mutex, const timespec *abstime) {
-  return __pthread_cond_timedwait(cond, mutex, abstime, COND_GET_CLOCK(cond->value));
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  return __pthread_cond_timedwait(cond_value_ptr, mutex, abstime,
+           COND_GET_CLOCK(atomic_load_explicit(cond_value_ptr, memory_order_relaxed)));
 }
 
 #if !defined(__LP64__)
 // TODO: this exists only for backward binary compatibility on 32 bit platforms.
 extern "C" int pthread_cond_timedwait_monotonic(pthread_cond_t* cond, pthread_mutex_t* mutex, const timespec* abstime) {
-  return __pthread_cond_timedwait(cond, mutex, abstime, CLOCK_MONOTONIC);
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  return __pthread_cond_timedwait(cond_value_ptr, mutex, abstime, CLOCK_MONOTONIC);
 }
 
 extern "C" int pthread_cond_timedwait_monotonic_np(pthread_cond_t* cond, pthread_mutex_t* mutex, const timespec* abstime) {
-  return __pthread_cond_timedwait(cond, mutex, abstime, CLOCK_MONOTONIC);
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  return __pthread_cond_timedwait(cond_value_ptr, mutex, abstime, CLOCK_MONOTONIC);
 }
 
 extern "C" int pthread_cond_timedwait_relative_np(pthread_cond_t* cond, pthread_mutex_t* mutex, const timespec* reltime) {
-  return __pthread_cond_timedwait_relative(cond, mutex, reltime);
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  return __pthread_cond_timedwait_relative(cond_value_ptr, mutex, reltime);
 }
 
 extern "C" int pthread_cond_timeout_np(pthread_cond_t* cond, pthread_mutex_t* mutex, unsigned ms) {
   timespec ts;
   timespec_from_ms(ts, ms);
-  return __pthread_cond_timedwait_relative(cond, mutex, &ts);
+  atomic_uint* cond_value_ptr = COND_TO_ATOMIC_POINTER(cond);
+  return __pthread_cond_timedwait_relative(cond_value_ptr, mutex, &ts);
 }
 #endif // !defined(__LP64__)
diff --git a/libc/include/pthread.h b/libc/include/pthread.h
index 8d053ae..212551b 100644
--- a/libc/include/pthread.h
+++ b/libc/include/pthread.h
@@ -73,7 +73,7 @@
 };
 
 typedef struct {
-  int volatile value;
+  unsigned int value;
 #ifdef __LP64__
   char __reserved[44];
 #endif
diff --git a/libm/Android.mk b/libm/Android.mk
index cc2b8be..f27c62e 100644
--- a/libm/Android.mk
+++ b/libm/Android.mk
@@ -3,18 +3,19 @@
 
 bionic_coverage := false
 
-# TODO: this comes from from upstream's libc, not libm, but it's an
-# implementation detail that should have hidden visibility, so it needs
-# to be in whatever library the math code is in.
-libm_common_src_files := \
-    digittoint.c  \
+ifneq (,$(filter $(TARGET_ARCH),x86 x86_64))
+# Clang has wrong long double sizes for x86.
+libm_clang := false
+endif
 
-# TODO: this is not in the BSDs.
-libm_common_src_files += \
-    significandl.c \
-    sincos.c \
+# -----------------------------------------------------------------------------
+# libm.a
+# -----------------------------------------------------------------------------
+include $(CLEAR_VARS)
 
-libm_common_src_files += \
+LOCAL_MODULE := libm
+
+LOCAL_SRC_FILES := \
     upstream-freebsd/lib/msun/bsdsrc/b_exp.c \
     upstream-freebsd/lib/msun/bsdsrc/b_log.c \
     upstream-freebsd/lib/msun/bsdsrc/b_tgamma.c \
@@ -186,11 +187,10 @@
     upstream-freebsd/lib/msun/src/w_drem.c \
     upstream-freebsd/lib/msun/src/w_dremf.c \
 
-libm_common_src_files += \
+LOCAL_SRC_FILES_32 += \
     fake_long_double.c \
-    signbit.c \
 
-libm_ld128_src_files = \
+LOCAL_SRC_FILES_64 := \
     upstream-freebsd/lib/msun/src/e_acosl.c \
     upstream-freebsd/lib/msun/src/e_acoshl.c \
     upstream-freebsd/lib/msun/src/e_asinl.c \
@@ -234,7 +234,7 @@
     upstream-freebsd/lib/msun/src/s_tanl.c \
     upstream-freebsd/lib/msun/src/s_truncl.c \
 
-libm_ld128_src_files += \
+LOCAL_SRC_FILES_64 += \
     upstream-freebsd/lib/msun/ld128/invtrig.c \
     upstream-freebsd/lib/msun/ld128/e_lgammal_r.c \
     upstream-freebsd/lib/msun/ld128/k_cosl.c \
@@ -246,11 +246,46 @@
     upstream-freebsd/lib/msun/ld128/s_logl.c \
     upstream-freebsd/lib/msun/ld128/s_nanl.c \
 
-# TODO: re-enable i387/e_sqrtf.S for x86, and maybe others.
+# TODO: this comes from upstream's libc, not libm, but it's an
+# implementation detail that should have hidden visibility, so it needs
+# to be in whatever library the math code is in.
+LOCAL_SRC_FILES += \
+    digittoint.c  \
 
-libm_common_cflags := \
+# Functionality not in the BSDs.
+LOCAL_SRC_FILES += \
+    significandl.c \
+    sincos.c \
+
+# Modified versions of BSD code.
+LOCAL_SRC_FILES += \
+    signbit.c \
+
+LOCAL_SRC_FILES_arm += \
+    arm/fenv.c \
+
+LOCAL_SRC_FILES_arm64 += \
+    arm64/fenv.c \
+
+LOCAL_SRC_FILES_mips += \
+    mips/fenv.c \
+
+LOCAL_SRC_FILES_x86 += \
+    i387/fenv.c \
+
+LOCAL_SRC_FILES_x86_64 += \
+    amd64/fenv.c \
+
+LOCAL_C_INCLUDES_x86 += $(LOCAL_PATH)/i387
+
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/upstream-freebsd/lib/msun/src/
+LOCAL_C_INCLUDES_64 += $(LOCAL_PATH)/upstream-freebsd/lib/msun/ld128/
+
+LOCAL_CLANG := $(libm_clang)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+LOCAL_ARM_MODE := arm
+LOCAL_CFLAGS := \
     -DFLT_EVAL_METHOD=0 \
-    -std=c99 \
     -include $(LOCAL_PATH)/freebsd-compat.h \
     -Wno-missing-braces \
     -Wno-parentheses \
@@ -262,61 +297,25 @@
 # Workaround the GCC "(long)fn -> lfn" optimization bug which will result in
 # self recursions for lrint, lrintf, and lrintl.
 # BUG: 14225968
-libm_common_cflags += -fno-builtin-rint -fno-builtin-rintf -fno-builtin-rintl
+LOCAL_CFLAGS += \
+    -fno-builtin-rint \
+    -fno-builtin-rintf \
+    -fno-builtin-rintl \
 
-libm_common_includes := $(LOCAL_PATH)/upstream-freebsd/lib/msun/src/
-
-libm_ld_includes := $(LOCAL_PATH)/upstream-freebsd/lib/msun/ld128/
-
-#
-# libm.a for target.
-#
-include $(CLEAR_VARS)
-ifneq (,$(filter $(TARGET_ARCH),x86 x86_64))
-# Clang has wrong long double sizes for x86.
-LOCAL_CLANG := false
-endif
-LOCAL_MODULE:= libm
-LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
-LOCAL_ARM_MODE := arm
-LOCAL_CFLAGS := $(libm_common_cflags)
-LOCAL_C_INCLUDES += $(libm_common_includes)
-LOCAL_SRC_FILES := $(libm_common_src_files)
-LOCAL_SYSTEM_SHARED_LIBRARIES := libc
+LOCAL_CONLY_FLAGS := \
+    -std=c99 \
 
 LOCAL_NATIVE_COVERAGE := $(bionic_coverage)
 LOCAL_ADDRESS_SANITIZER := false
-
-# arch-specific settings
-LOCAL_C_INCLUDES_arm := $(LOCAL_PATH)/arm
-LOCAL_SRC_FILES_arm := arm/fenv.c
-
-LOCAL_C_INCLUDES_arm64 := $(libm_ld_includes)
-LOCAL_SRC_FILES_arm64 := arm64/fenv.c $(libm_ld128_src_files)
-
-LOCAL_C_INCLUDES_x86 := $(LOCAL_PATH)/i387
-LOCAL_SRC_FILES_x86 := i387/fenv.c
-
-LOCAL_C_INCLUDES_x86_64 := $(libm_ld_includes)
-LOCAL_SRC_FILES_x86_64 := amd64/fenv.c $(libm_ld128_src_files)
-
-LOCAL_SRC_FILES_mips := mips/fenv.c
-
-LOCAL_C_INCLUDES_mips64 := $(libm_ld_includes)
-LOCAL_SRC_FILES_mips64 := mips/fenv.c $(libm_ld128_src_files)
-
-LOCAL_CXX_STL := none
 include $(BUILD_STATIC_LIBRARY)
 
-#
-# libm.so for target.
-#
+# -----------------------------------------------------------------------------
+# libm.so
+# -----------------------------------------------------------------------------
 include $(CLEAR_VARS)
-ifneq (,$(filter $(TARGET_ARCH),x86 x86_64))
-# Clang has wrong long double sizes for x86.
-LOCAL_CLANG := false
-endif
-LOCAL_MODULE:= libm
+
+LOCAL_MODULE := libm
+LOCAL_CLANG := $(libm_clang)
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 LOCAL_SYSTEM_SHARED_LIBRARIES := libc
 LOCAL_WHOLE_STATIC_LIBRARIES := libm
@@ -329,8 +328,7 @@
 # We'd really like to do this for all architectures, but since this wasn't done
 # before, these symbols must continue to be exported on LP32 for binary
 # compatibility.
-LOCAL_LDFLAGS_arm64 := -Wl,--exclude-libs,libgcc.a
-LOCAL_LDFLAGS_mips64 := -Wl,--exclude-libs,libgcc.a
-LOCAL_LDFLAGS_x86_64 := -Wl,--exclude-libs,libgcc.a
+LOCAL_LDFLAGS_64 := -Wl,--exclude-libs,libgcc.a
+
 include $(BUILD_SHARED_LIBRARY)
 endif