Merge "Use LOCAL_C_INCLUDES instead of LOCAL_CFLAGS for include dirs"
diff --git a/libc/Android.mk b/libc/Android.mk
index dd4d4cf..75bb616 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -69,6 +69,7 @@
     bionic/__FD_chk.cpp \
     bionic/__fgets_chk.cpp \
     bionic/__memmove_chk.cpp \
+    bionic/__poll_chk.cpp \
     bionic/__read_chk.cpp \
     bionic/__recvfrom_chk.cpp \
     bionic/__stpcpy_chk.cpp \
@@ -115,6 +116,8 @@
     bionic/error.cpp \
     bionic/eventfd_read.cpp \
     bionic/eventfd_write.cpp \
+    bionic/fchmod.cpp \
+    bionic/fchmodat.cpp \
     bionic/ffs.cpp \
     bionic/flockfile.cpp \
     bionic/fork.cpp \
@@ -1223,7 +1226,8 @@
 LOCAL_SYSTEM_SHARED_LIBRARIES :=
 # Only need this for arm since libc++ uses its own unwind code that
 # doesn't mix with the other default unwind code.
-LOCAL_STATIC_LIBRARIES_arm := libunwind_llvm libc++abi
+LOCAL_STATIC_LIBRARIES_arm := libunwind_llvm
+LOCAL_STATIC_LIBRARIES += libc++abi
 LOCAL_ALLOW_UNDEFINED_SYMBOLS := true
 
 # Don't install on release build
diff --git a/libc/SYSCALLS.TXT b/libc/SYSCALLS.TXT
index 0fa2a1e..aae7de7 100644
--- a/libc/SYSCALLS.TXT
+++ b/libc/SYSCALLS.TXT
@@ -113,7 +113,7 @@
 int         __fcntl64:fcntl64(int, int, void*)  arm,mips,x86
 int         fcntl(int, int, void*)  arm64,mips64,x86_64
 int         flock(int, int)   all
-int         fchmod(int, mode_t)  all
+int         ___fchmod:fchmod(int, mode_t)  all
 int         dup(int)  all
 int         pipe2(int*, int) all
 int         dup3(int, int, int)   all
@@ -131,7 +131,7 @@
 
 int __openat:openat(int, const char*, int, mode_t) all
 int faccessat(int, const char*, int, int)  all
-int fchmodat(int, const char*, mode_t, int)  all
+int ___fchmodat:fchmodat(int, const char*, mode_t)  all
 int fchownat(int, const char*, uid_t, gid_t, int)  all
 int fstatat64|fstatat:fstatat64(int, const char*, struct stat*, int)   arm,mips,x86
 int fstatat64|fstatat:newfstatat(int, const char*, struct stat*, int)  arm64,x86_64
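Aside on the SYSCALLS.TXT syntax above: the "___fchmod:fchmod" form asks the stub generator to emit the raw kernel stub under a triple-underscore name (marked .hidden in the per-arch assembly below), freeing the public symbol for a C++ wrapper. Note also that ___fchmodat drops the flags parameter: the kernel's fchmodat syscall has never taken one, which is also why the per-arch stubs below stop marshalling a fourth argument. A minimal sketch of the resulting stub/wrapper pairing (wrapper body illustrative; the real wrappers appear later in this change):

    // Raw syscall stub generated from SYSCALLS.TXT, kept out of the
    // dynamic symbol table by the ".hidden ___fchmod" directive.
    extern "C" int ___fchmod(int, mode_t);

    // Public symbol, now free to wrap the raw syscall with extra logic.
    int fchmod(int fd, mode_t mode) {
      return ___fchmod(fd, mode);  // see libc/bionic/fchmod.cpp below
    }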
diff --git a/libc/arch-arm/syscalls/fchmod.S b/libc/arch-arm/syscalls/___fchmod.S
similarity index 84%
rename from libc/arch-arm/syscalls/fchmod.S
rename to libc/arch-arm/syscalls/___fchmod.S
index 5675f0a..c6da4f8 100644
--- a/libc/arch-arm/syscalls/fchmod.S
+++ b/libc/arch-arm/syscalls/___fchmod.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmod)
+ENTRY(___fchmod)
     mov     ip, r7
     ldr     r7, =__NR_fchmod
     swi     #0
@@ -11,4 +11,5 @@
     bxls    lr
     neg     r0, r0
     b       __set_errno_internal
-END(fchmod)
+END(___fchmod)
+.hidden ___fchmod
diff --git a/libc/arch-arm/syscalls/fchmodat.S b/libc/arch-arm/syscalls/___fchmodat.S
similarity index 82%
rename from libc/arch-arm/syscalls/fchmodat.S
rename to libc/arch-arm/syscalls/___fchmodat.S
index 3f7e0ee..91bbda5 100644
--- a/libc/arch-arm/syscalls/fchmodat.S
+++ b/libc/arch-arm/syscalls/___fchmodat.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmodat)
+ENTRY(___fchmodat)
     mov     ip, r7
     ldr     r7, =__NR_fchmodat
     swi     #0
@@ -11,4 +11,5 @@
     bxls    lr
     neg     r0, r0
     b       __set_errno_internal
-END(fchmodat)
+END(___fchmodat)
+.hidden ___fchmodat
diff --git a/libc/arch-arm64/syscalls/fchmod.S b/libc/arch-arm64/syscalls/___fchmod.S
similarity index 81%
rename from libc/arch-arm64/syscalls/fchmod.S
rename to libc/arch-arm64/syscalls/___fchmod.S
index 83a8060..a143c65 100644
--- a/libc/arch-arm64/syscalls/fchmod.S
+++ b/libc/arch-arm64/syscalls/___fchmod.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmod)
+ENTRY(___fchmod)
     mov     x8, __NR_fchmod
     svc     #0
 
@@ -11,4 +11,5 @@
     b.hi    __set_errno_internal
 
     ret
-END(fchmod)
+END(___fchmod)
+.hidden ___fchmod
diff --git a/libc/arch-arm64/syscalls/fchmodat.S b/libc/arch-arm64/syscalls/___fchmodat.S
similarity index 80%
rename from libc/arch-arm64/syscalls/fchmodat.S
rename to libc/arch-arm64/syscalls/___fchmodat.S
index 8c5bb0e..1ab3736 100644
--- a/libc/arch-arm64/syscalls/fchmodat.S
+++ b/libc/arch-arm64/syscalls/___fchmodat.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmodat)
+ENTRY(___fchmodat)
     mov     x8, __NR_fchmodat
     svc     #0
 
@@ -11,4 +11,5 @@
     b.hi    __set_errno_internal
 
     ret
-END(fchmodat)
+END(___fchmodat)
+.hidden ___fchmodat
diff --git a/libc/arch-mips/syscalls/fchmod.S b/libc/arch-mips/syscalls/___fchmod.S
similarity index 84%
rename from libc/arch-mips/syscalls/fchmod.S
rename to libc/arch-mips/syscalls/___fchmod.S
index 2a95cc3..ac102ec 100644
--- a/libc/arch-mips/syscalls/fchmod.S
+++ b/libc/arch-mips/syscalls/___fchmod.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmod)
+ENTRY(___fchmod)
     .set noreorder
     .cpload t9
     li v0, __NR_fchmod
@@ -16,4 +16,5 @@
     j t9
     nop
     .set reorder
-END(fchmod)
+END(___fchmod)
+.hidden ___fchmod
diff --git a/libc/arch-mips/syscalls/fchmodat.S b/libc/arch-mips/syscalls/___fchmodat.S
similarity index 82%
rename from libc/arch-mips/syscalls/fchmodat.S
rename to libc/arch-mips/syscalls/___fchmodat.S
index d9de036..d581efa 100644
--- a/libc/arch-mips/syscalls/fchmodat.S
+++ b/libc/arch-mips/syscalls/___fchmodat.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmodat)
+ENTRY(___fchmodat)
     .set noreorder
     .cpload t9
     li v0, __NR_fchmodat
@@ -16,4 +16,5 @@
     j t9
     nop
     .set reorder
-END(fchmodat)
+END(___fchmodat)
+.hidden ___fchmodat
diff --git a/libc/arch-mips64/syscalls/fchmod.S b/libc/arch-mips64/syscalls/___fchmod.S
similarity index 87%
rename from libc/arch-mips64/syscalls/fchmod.S
rename to libc/arch-mips64/syscalls/___fchmod.S
index a877b78..7c16c54 100644
--- a/libc/arch-mips64/syscalls/fchmod.S
+++ b/libc/arch-mips64/syscalls/___fchmod.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmod)
+ENTRY(___fchmod)
     .set push
     .set noreorder
     li v0, __NR_fchmod
@@ -22,4 +22,5 @@
     j t9
     move ra, t0
     .set pop
-END(fchmod)
+END(___fchmod)
+.hidden ___fchmod
diff --git a/libc/arch-mips64/syscalls/fchmodat.S b/libc/arch-mips64/syscalls/___fchmodat.S
similarity index 86%
rename from libc/arch-mips64/syscalls/fchmodat.S
rename to libc/arch-mips64/syscalls/___fchmodat.S
index 151492a..50f108e 100644
--- a/libc/arch-mips64/syscalls/fchmodat.S
+++ b/libc/arch-mips64/syscalls/___fchmodat.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmodat)
+ENTRY(___fchmodat)
     .set push
     .set noreorder
     li v0, __NR_fchmodat
@@ -22,4 +22,5 @@
     j t9
     move ra, t0
     .set pop
-END(fchmodat)
+END(___fchmodat)
+.hidden ___fchmodat
diff --git a/libc/arch-x86/syscalls/fchmod.S b/libc/arch-x86/syscalls/___fchmod.S
similarity index 90%
rename from libc/arch-x86/syscalls/fchmod.S
rename to libc/arch-x86/syscalls/___fchmod.S
index 37851ff..119a695 100644
--- a/libc/arch-x86/syscalls/fchmod.S
+++ b/libc/arch-x86/syscalls/___fchmod.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmod)
+ENTRY(___fchmod)
     pushl   %ebx
     .cfi_def_cfa_offset 8
     .cfi_rel_offset ebx, 0
@@ -23,4 +23,5 @@
     popl    %ecx
     popl    %ebx
     ret
-END(fchmod)
+END(___fchmod)
+.hidden ___fchmod
diff --git a/libc/arch-x86/syscalls/fchmodat.S b/libc/arch-x86/syscalls/___fchmodat.S
similarity index 70%
rename from libc/arch-x86/syscalls/fchmodat.S
rename to libc/arch-x86/syscalls/___fchmodat.S
index f515512..b15bb64 100644
--- a/libc/arch-x86/syscalls/fchmodat.S
+++ b/libc/arch-x86/syscalls/___fchmodat.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmodat)
+ENTRY(___fchmodat)
     pushl   %ebx
     .cfi_def_cfa_offset 8
     .cfi_rel_offset ebx, 0
@@ -12,13 +12,9 @@
     pushl   %edx
     .cfi_adjust_cfa_offset 4
     .cfi_rel_offset edx, 0
-    pushl   %esi
-    .cfi_adjust_cfa_offset 4
-    .cfi_rel_offset esi, 0
-    mov     20(%esp), %ebx
-    mov     24(%esp), %ecx
-    mov     28(%esp), %edx
-    mov     32(%esp), %esi
+    mov     16(%esp), %ebx
+    mov     20(%esp), %ecx
+    mov     24(%esp), %edx
     movl    $__NR_fchmodat, %eax
     int     $0x80
     cmpl    $-MAX_ERRNO, %eax
@@ -28,9 +24,9 @@
     call    __set_errno_internal
     addl    $4, %esp
 1:
-    popl    %esi
     popl    %edx
     popl    %ecx
     popl    %ebx
     ret
-END(fchmodat)
+END(___fchmodat)
+.hidden ___fchmodat
diff --git a/libc/arch-x86_64/syscalls/fchmod.S b/libc/arch-x86_64/syscalls/___fchmod.S
similarity index 83%
rename from libc/arch-x86_64/syscalls/fchmod.S
rename to libc/arch-x86_64/syscalls/___fchmod.S
index b35bd21..7bccbef 100644
--- a/libc/arch-x86_64/syscalls/fchmod.S
+++ b/libc/arch-x86_64/syscalls/___fchmod.S
@@ -2,7 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmod)
+ENTRY(___fchmod)
     movl    $__NR_fchmod, %eax
     syscall
     cmpq    $-MAX_ERRNO, %rax
@@ -12,4 +12,5 @@
     call    __set_errno_internal
 1:
     ret
-END(fchmod)
+END(___fchmod)
+.hidden ___fchmod
diff --git a/libc/arch-x86_64/syscalls/fchmodat.S b/libc/arch-x86_64/syscalls/___fchmodat.S
similarity index 82%
rename from libc/arch-x86_64/syscalls/fchmodat.S
rename to libc/arch-x86_64/syscalls/___fchmodat.S
index 2d78d8e..483ec7d 100644
--- a/libc/arch-x86_64/syscalls/fchmodat.S
+++ b/libc/arch-x86_64/syscalls/___fchmodat.S
@@ -2,8 +2,7 @@
 
 #include <private/bionic_asm.h>
 
-ENTRY(fchmodat)
-    movq    %rcx, %r10
+ENTRY(___fchmodat)
     movl    $__NR_fchmodat, %eax
     syscall
     cmpq    $-MAX_ERRNO, %rax
@@ -13,4 +12,5 @@
     call    __set_errno_internal
 1:
     ret
-END(fchmodat)
+END(___fchmodat)
+.hidden ___fchmodat
diff --git a/libc/bionic/__poll_chk.cpp b/libc/bionic/__poll_chk.cpp
new file mode 100644
index 0000000..3acac4e
--- /dev/null
+++ b/libc/bionic/__poll_chk.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#undef _FORTIFY_SOURCE
+#include <poll.h>
+#include "private/libc_logging.h"
+
+extern "C" int __poll_chk(struct pollfd* fds, nfds_t fd_count, int timeout, size_t fds_size) {
+fprintf(stderr, "__poll_chk %p %i %i %i\n", fds, (int)fd_count, timeout, (int) fds_size);
+  if (__predict_false(fds_size / sizeof(*fds) < fd_count)) {
+    __fortify_chk_fail("poll: pollfd array smaller than fd count", 0);
+  }
+  return poll(fds, fd_count, timeout);
+}
+
+extern "C" int __ppoll_chk(struct pollfd* fds, nfds_t fd_count, const struct timespec* timeout, const sigset_t* mask, size_t fds_size) {
+fprintf(stderr, "__ppoll_chk %p %i %p %p %i\n", fds, (int)fd_count, timeout, mask, (int) fds_size);
+  if (__predict_false(fds_size / sizeof(*fds) < fd_count)) {
+    __fortify_chk_fail("ppoll: pollfd array smaller than fd count", 0);
+  }
+  return ppoll(fds, fd_count, timeout, mask);
+}
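How these checkers get invoked: with _FORTIFY_SOURCE enabled, bionic's headers route poll()/ppoll() through inline wrappers that pass the compile-time-known size of the pollfd array. A sketch of that glue, assuming the usual __builtin_object_size plumbing (names here are illustrative, not bionic's exact header macros):

    #include <poll.h>

    // Assumed helper: the object's size as seen by the compiler,
    // or (size_t)-1 when unknown.
    #define bos0(p) __builtin_object_size(p, 0)

    extern "C" int __poll_chk(struct pollfd*, nfds_t, int, size_t);

    static inline int poll_fortified(struct pollfd* fds, nfds_t n, int timeout) {
      // When bos0() is (size_t)-1, fds_size / sizeof(*fds) is huge and the
      // fd_count check in __poll_chk effectively always passes.
      return __poll_chk(fds, n, timeout, bos0(fds));
    }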
diff --git a/libc/bionic/debug_stacktrace.cpp b/libc/bionic/debug_stacktrace.cpp
index c6ce714..71e876b 100644
--- a/libc/bionic/debug_stacktrace.cpp
+++ b/libc/bionic/debug_stacktrace.cpp
@@ -47,34 +47,20 @@
 
 typedef struct _Unwind_Context __unwind_context;
 
+extern "C" char* __cxa_demangle(const char*, char*, size_t*, int*);
+
 static mapinfo_t* g_map_info = NULL;
-static void* g_demangler;
-typedef char* (*DemanglerFn)(const char*, char*, size_t*, int*);
-static DemanglerFn g_demangler_fn = NULL;
 
 __LIBC_HIDDEN__ void backtrace_startup() {
   ScopedDisableDebugCalls disable;
 
   g_map_info = mapinfo_create(getpid());
-  g_demangler = dlopen("libgccdemangle.so", RTLD_NOW);
-  if (g_demangler != NULL) {
-    void* sym = dlsym(g_demangler, "__cxa_demangle");
-    g_demangler_fn = reinterpret_cast<DemanglerFn>(sym);
-  }
 }
 
 __LIBC_HIDDEN__ void backtrace_shutdown() {
   ScopedDisableDebugCalls disable;
 
   mapinfo_destroy(g_map_info);
-  dlclose(g_demangler);
-}
-
-static char* demangle(const char* symbol) {
-  if (g_demangler_fn == NULL) {
-    return NULL;
-  }
-  return (*g_demangler_fn)(symbol, NULL, NULL, NULL);
 }
 
 struct stack_crawl_state_t {
@@ -158,8 +144,7 @@
       soname = "<unknown>";
     }
     if (symbol != NULL) {
-      // TODO: we might need a flag to say whether it's safe to allocate (demangling allocates).
-      char* demangled_symbol = demangle(symbol);
+      char* demangled_symbol = __cxa_demangle(symbol, NULL, NULL, NULL);
       const char* best_name = (demangled_symbol != NULL) ? demangled_symbol : symbol;
 
       __libc_format_log(ANDROID_LOG_ERROR, "libc",
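For reference, the __cxa_demangle now linked in directly is the standard Itanium C++ ABI demangler (previously reached via dlopen() of libgccdemangle.so). It allocates the result with malloc(), which is what the removed TODO was worrying about; the caller must free it. A small usage sketch:

    #include <stdlib.h>

    extern "C" char* __cxa_demangle(const char* mangled, char* out,
                                    size_t* length, int* status);

    void log_demangled_sketch(const char* symbol) {
      // Passing NULL for out/length makes __cxa_demangle allocate the buffer.
      char* demangled = __cxa_demangle(symbol, NULL, NULL, NULL);
      const char* best_name = (demangled != NULL) ? demangled : symbol;
      // ... log best_name ...
      free(demangled);  // free(NULL) is a no-op, so this is safe either way
    }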
diff --git a/libc/bionic/fchmod.cpp b/libc/bionic/fchmod.cpp
new file mode 100644
index 0000000..ace8c6b
--- /dev/null
+++ b/libc/bionic/fchmod.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdio.h>
+
+extern "C" int ___fchmod(int, mode_t);
+
+int fchmod(int fd, mode_t mode) {
+  int saved_errno = errno;
+  int result = ___fchmod(fd, mode);
+
+  if ((result == 0) || (errno != EBADF)) {
+    return result;
+  }
+
+  // fd could be an O_PATH file descriptor, and the kernel
+  // may not directly support fchmod() on such a file descriptor.
+  // Use /proc/self/fd instead to emulate this support.
+  // https://sourceware.org/bugzilla/show_bug.cgi?id=14578
+  //
+  // As of February 2015, there are no kernels which support fchmod
+  // on an O_PATH file descriptor, and "man open" documents fchmod
+  // on O_PATH file descriptors as returning EBADF.
+  int fd_flag = fcntl(fd, F_GETFL);
+  if ((fd_flag == -1) || ((fd_flag & O_PATH) == 0)) {
+    errno = EBADF;
+    return -1;
+  }
+
+  char buf[40];
+  snprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd);
+  errno = saved_errno;
+  result = chmod(buf, mode);
+  if ((result == -1) && (errno == ELOOP)) {
+    // Linux does not support changing the mode of a symlink.
+    // For fchmodat(AT_SYMLINK_NOFOLLOW), POSIX requires a return
+    // value of ENOTSUP. Assume that's true here too.
+    errno = ENOTSUP;
+  }
+
+  return result;
+}
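Usage sketch for the fallback above: an O_PATH descriptor previously drew an unconditional EBADF from the kernel, whereas the wrapper now emulates the mode change through the magic /proc/self/fd symlink. The path and mode are illustrative:

    #include <fcntl.h>
    #include <sys/stat.h>
    #include <unistd.h>

    void chmod_via_o_path_sketch() {
      int fd = open("/data/local/tmp/f", O_PATH | O_CLOEXEC);  // illustrative
      if (fd != -1) {
        // The raw fchmod syscall would fail with EBADF; the wrapper
        // retries via chmod("/proc/self/fd/<fd>", mode).
        fchmod(fd, 0600);
        close(fd);
      }
    }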
diff --git a/libc/bionic/fchmodat.cpp b/libc/bionic/fchmodat.cpp
new file mode 100644
index 0000000..1f83c4b
--- /dev/null
+++ b/libc/bionic/fchmodat.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "private/ErrnoRestorer.h"
+
+extern "C" int ___fchmodat(int, const char*, mode_t);
+
+int fchmodat(int dirfd, const char* pathname, mode_t mode, int flags) {
+  if ((flags & ~AT_SYMLINK_NOFOLLOW) != 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & AT_SYMLINK_NOFOLLOW) {
+    // Emulate AT_SYMLINK_NOFOLLOW using the mechanism described
+    // at https://sourceware.org/bugzilla/show_bug.cgi?id=14578
+    // comment #10
+
+    int fd = openat(dirfd, pathname, O_PATH | O_NOFOLLOW | O_CLOEXEC);
+    if (fd == -1) {
+      return -1; // returns errno from openat
+    }
+
+    // POSIX requires that ENOTSUP be returned when the system
+    // doesn't support setting the mode of a symbolic link.
+    // This is true for all Linux kernels.
+    // We rely on the O_PATH compatibility layer added in the
+    // fchmod() function to get errno correct.
+    int result = fchmod(fd, mode);
+    ErrnoRestorer errno_restorer; // don't let close() clobber errno
+    close(fd);
+    return result;
+  }
+
+  return ___fchmodat(dirfd, pathname, mode);
+}
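What the emulation means for callers, as a sketch: the old generated stub passed the flags word to a kernel syscall that has no flags parameter, so AT_SYMLINK_NOFOLLOW was effectively ignored; with this wrapper a symlink target now reports ENOTSUP, since Linux cannot change a symlink's mode:

    #include <errno.h>
    #include <fcntl.h>
    #include <sys/stat.h>

    void fchmodat_nofollow_sketch() {
      // "some_symlink" is illustrative.
      if (fchmodat(AT_FDCWD, "some_symlink", 0644, AT_SYMLINK_NOFOLLOW) == -1 &&
          errno == ENOTSUP) {
        // Expected when the target is a symbolic link on Linux.
      }
    }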
diff --git a/libc/bionic/jemalloc.h b/libc/bionic/jemalloc.h
index feb1f43..98ea0ee 100644
--- a/libc/bionic/jemalloc.h
+++ b/libc/bionic/jemalloc.h
@@ -18,6 +18,7 @@
 #define LIBC_BIONIC_JEMALLOC_H_
 
 #include <jemalloc/jemalloc.h>
+#include <malloc.h>  // For struct mallinfo.
 
 // Need to wrap memalign since je_memalign fails on non-power of 2 alignments.
 #define je_memalign je_memalign_round_up_boundary
diff --git a/libc/bionic/libc_logging.cpp b/libc/bionic/libc_logging.cpp
index 49a3762..2eb9d68 100644
--- a/libc/bionic/libc_logging.cpp
+++ b/libc/bionic/libc_logging.cpp
@@ -438,7 +438,7 @@
   vec[1].iov_base = const_cast<char*>(": ");
   vec[1].iov_len = 2;
   vec[2].iov_base = const_cast<char*>(msg);
-  vec[2].iov_len = strlen(msg) + 1;
+  vec[2].iov_len = strlen(msg);
   vec[3].iov_base = const_cast<char*>("\n");
   vec[3].iov_len = 1;
 
@@ -448,8 +448,7 @@
 }
 
 #ifdef TARGET_USES_LOGD
-static int __libc_open_log_socket()
-{
+static int __libc_open_log_socket() {
   // ToDo: Ideally we want this to fail if the gid of the current
   // process is AID_LOGD, but will have to wait until we have
   // registered this in private/android_filesystem_config.h. We have
@@ -491,7 +490,6 @@
 static int __libc_write_log(int priority, const char* tag, const char* msg) {
 #ifdef TARGET_USES_LOGD
   int main_log_fd = __libc_open_log_socket();
-
   if (main_log_fd == -1) {
     // Try stderr instead.
     return __libc_write_stderr(tag, msg);
@@ -614,7 +612,7 @@
   if (tag != 0) {
     __libc_android_log_event_uid(tag);
   }
-  __libc_fatal("FORTIFY_SOURCE: %s. Calling abort().", msg);
+  __libc_fatal("FORTIFY: %s", msg);
 }
 
 static void __libc_fatal(const char* format, va_list args) {
@@ -622,12 +620,12 @@
   BufferOutputStream os(msg, sizeof(msg));
   out_vformat(os, format, args);
 
-  // log to stderr for the benefit of "adb shell" users.
+  // Log to stderr for the benefit of "adb shell" users.
   struct iovec iov[2] = {
-    {msg, strlen(msg)},
-    {const_cast<void*>(static_cast<const void*>("\n")), 1},
+    { msg, os.total },
+    { const_cast<char*>("\n"), 1 },
   };
-  writev(2, iov, 2);
+  TEMP_FAILURE_RETRY(writev(2, iov, 2));
 
   // Log to the log for the benefit of regular app developers (whose stdout and stderr are closed).
   __libc_write_log(ANDROID_LOG_FATAL, "libc", msg);
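The iov_len change in this file is a real fix, not a cleanup: strlen(msg) + 1 included the message's NUL terminator in the writev() payload, so a stray '\0' preceded every newline in the log. A sketch of the corrected iovec construction:

    #include <string.h>
    #include <sys/uio.h>

    void fill_message_iov(struct iovec* vec, const char* msg) {
      vec->iov_base = const_cast<char*>(msg);
      vec->iov_len = strlen(msg);  // was strlen(msg) + 1: sent the '\0' too
    }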
diff --git a/libc/bionic/poll.cpp b/libc/bionic/poll.cpp
index d267229..23ef90a 100644
--- a/libc/bionic/poll.cpp
+++ b/libc/bionic/poll.cpp
@@ -26,6 +26,7 @@
  * SUCH DAMAGE.
  */
 
+#undef _FORTIFY_SOURCE
 #include <errno.h>
 #include <sys/poll.h>
 #include <sys/select.h>
diff --git a/libc/bionic/pthread_mutex.cpp b/libc/bionic/pthread_mutex.cpp
index 40f1ed2..83d6b54 100644
--- a/libc/bionic/pthread_mutex.cpp
+++ b/libc/bionic/pthread_mutex.cpp
@@ -30,22 +30,19 @@
 
 #include <errno.h>
 #include <limits.h>
+#include <stdatomic.h>
+#include <sys/cdefs.h>
 #include <sys/mman.h>
 #include <unistd.h>
 
 #include "pthread_internal.h"
 
-#include "private/bionic_atomic_inline.h"
 #include "private/bionic_constants.h"
 #include "private/bionic_futex.h"
+#include "private/bionic_systrace.h"
 #include "private/bionic_time_conversions.h"
 #include "private/bionic_tls.h"
 
-#include "private/bionic_systrace.h"
-
-extern void pthread_debug_mutex_lock_check(pthread_mutex_t *mutex);
-extern void pthread_debug_mutex_unlock_check(pthread_mutex_t *mutex);
-
 /* a mutex is implemented as a 32-bit integer holding the following fields
  *
  * bits:     name     description
@@ -87,9 +84,6 @@
 #define  MUTEX_STATE_LOCKED_UNCONTENDED  1   /* must be 1 due to atomic dec in unlock operation */
 #define  MUTEX_STATE_LOCKED_CONTENDED    2   /* must be 1 + LOCKED_UNCONTENDED due to atomic dec */
 
-#define  MUTEX_STATE_FROM_BITS(v)    FIELD_FROM_BITS(v, MUTEX_STATE_SHIFT, MUTEX_STATE_LEN)
-#define  MUTEX_STATE_TO_BITS(v)      FIELD_TO_BITS(v, MUTEX_STATE_SHIFT, MUTEX_STATE_LEN)
-
 #define  MUTEX_STATE_BITS_UNLOCKED            MUTEX_STATE_TO_BITS(MUTEX_STATE_UNLOCKED)
 #define  MUTEX_STATE_BITS_LOCKED_UNCONTENDED  MUTEX_STATE_TO_BITS(MUTEX_STATE_LOCKED_UNCONTENDED)
 #define  MUTEX_STATE_BITS_LOCKED_CONTENDED    MUTEX_STATE_TO_BITS(MUTEX_STATE_LOCKED_CONTENDED)
@@ -116,10 +110,7 @@
 #define  MUTEX_COUNTER_BITS_IS_ZERO(v)          (((v) & MUTEX_COUNTER_MASK) == 0)
 
 /* Used to increment the counter directly after overflow has been checked */
-#define  MUTEX_COUNTER_BITS_ONE      FIELD_TO_BITS(1,MUTEX_COUNTER_SHIFT,MUTEX_COUNTER_LEN)
-
-/* Returns true iff the counter is 0 */
-#define  MUTEX_COUNTER_BITS_ARE_ZERO(v)  (((v) & MUTEX_COUNTER_MASK) == 0)
+#define  MUTEX_COUNTER_BITS_ONE      FIELD_TO_BITS(1, MUTEX_COUNTER_SHIFT, MUTEX_COUNTER_LEN)
 
 /* Mutex shared bit flag
  *
@@ -159,30 +150,9 @@
 /* Mutex owner field:
  *
  * This is only used for recursive and errorcheck mutexes. It holds the
- * tid of the owning thread. Note that this works because the Linux
- * kernel _only_ uses 16-bit values for tids.
- *
- * More specifically, it will wrap to 10000 when it reaches over 32768 for
- * application processes. You can check this by running the following inside
- * an adb shell session:
- *
-    OLDPID=$$;
-    while true; do
-    NEWPID=$(sh -c 'echo $$')
-    if [ "$NEWPID" -gt 32768 ]; then
-        echo "AARGH: new PID $NEWPID is too high!"
-        exit 1
-    fi
-    if [ "$NEWPID" -lt "$OLDPID" ]; then
-        echo "****** Wrapping from PID $OLDPID to $NEWPID. *******"
-    else
-        echo -n "$NEWPID!"
-    fi
-    OLDPID=$NEWPID
-    done
-
- * Note that you can run the same example on a desktop Linux system,
- * the wrapping will also happen at 32768, but will go back to 300 instead.
+ * tid of the owning thread. We use 16 bits to represent the tid here,
+ * so the highest tid we can handle is 65535. There is a test that checks
+ * /proc/sys/kernel/pid_max to make sure it will not exceed our limit.
  */
 #define  MUTEX_OWNER_SHIFT     16
 #define  MUTEX_OWNER_LEN       16
@@ -267,9 +237,20 @@
     return 0;
 }
 
+static inline atomic_int* MUTEX_TO_ATOMIC_POINTER(pthread_mutex_t* mutex) {
+    static_assert(sizeof(atomic_int) == sizeof(mutex->value),
+                  "mutex->value should actually be atomic_int in implementation.");
+
+    // We prefer casting to atomic_int instead of declaring mutex->value to be atomic_int directly,
+    // because the latter would pollute pthread.h and cause an error when compiling libc++.
+    return reinterpret_cast<atomic_int*>(&mutex->value);
+}
+
 int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* attr) {
+    atomic_int* mutex_value_ptr = MUTEX_TO_ATOMIC_POINTER(mutex);
+
     if (__predict_true(attr == NULL)) {
-        mutex->value = MUTEX_TYPE_BITS_NORMAL;
+        atomic_init(mutex_value_ptr, MUTEX_TYPE_BITS_NORMAL);
         return 0;
     }
 
@@ -292,13 +273,13 @@
         return EINVAL;
     }
 
-    mutex->value = value;
+    atomic_init(mutex_value_ptr, value);
     return 0;
 }
 
 
 /*
- * Lock a non-recursive mutex.
+ * Lock a mutex of type NORMAL.
  *
  * As noted above, there are three states:
  *   0 (unlocked, no contention)
@@ -309,96 +290,75 @@
  * "type" value is zero, so the only bits that will be set are the ones in
  * the lock state field.
  */
-static inline void _normal_lock(pthread_mutex_t* mutex, int shared) {
+static inline void _normal_mutex_lock(atomic_int* mutex_value_ptr, int shared) {
     /* convenience shortcuts */
     const int unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
     const int locked_uncontended = shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
-    /*
-     * The common case is an unlocked mutex, so we begin by trying to
-     * change the lock's state from 0 (UNLOCKED) to 1 (LOCKED).
-     * __bionic_cmpxchg() returns 0 if it made the swap successfully.
-     * If the result is nonzero, this lock is already held by another thread.
-     */
-    if (__bionic_cmpxchg(unlocked, locked_uncontended, &mutex->value) != 0) {
-        const int locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
-        /*
-         * We want to go to sleep until the mutex is available, which
-         * requires promoting it to state 2 (CONTENDED). We need to
-         * swap in the new state value and then wait until somebody wakes us up.
-         *
-         * __bionic_swap() returns the previous value.  We swap 2 in and
-         * see if we got zero back; if so, we have acquired the lock.  If
-         * not, another thread still holds the lock and we wait again.
-         *
-         * The second argument to the __futex_wait() call is compared
-         * against the current value.  If it doesn't match, __futex_wait()
-         * returns immediately (otherwise, it sleeps for a time specified
-         * by the third argument; 0 means sleep forever).  This ensures
-         * that the mutex is in state 2 when we go to sleep on it, which
-         * guarantees a wake-up call.
-         */
 
-         ScopedTrace trace("Contending for pthread mutex");
-
-
-        while (__bionic_swap(locked_contended, &mutex->value) != unlocked) {
-            __futex_wait_ex(&mutex->value, shared, locked_contended, NULL);
-        }
+    // The common case is an unlocked mutex, so we begin by trying to
+    // change the lock's state from unlocked to locked_uncontended.
+    // If the exchange succeeds, an acquire fence is required to make
+    // all memory accesses made by other threads visible to the current CPU.
+    int mvalue = unlocked;
+    if (__predict_true(atomic_compare_exchange_strong_explicit(mutex_value_ptr, &mvalue,
+                                                locked_uncontended,
+                                                memory_order_acquire,
+                                                memory_order_relaxed))) {
+        return;
     }
-    ANDROID_MEMBAR_FULL();
+
+    ScopedTrace trace("Contending for pthread mutex");
+
+    // We want to go to sleep until the mutex is available, which requires
+    // promoting it to locked_contended. We need to swap in the new state
+    // value and then wait until somebody wakes us up.
+    // An atomic_exchange is used to compete with other threads for the lock.
+    // If it returns unlocked, we have acquired the lock, otherwise another
+    // thread still holds the lock and we should wait again.
+    // If the lock is acquired, an acquire fence is needed to make all memory
+    // accesses made by other threads visible to the current CPU.
+    const int locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
+    while (atomic_exchange_explicit(mutex_value_ptr, locked_contended,
+                                    memory_order_acquire) != unlocked) {
+
+        __futex_wait_ex(mutex_value_ptr, shared, locked_contended, NULL);
+    }
 }
 
 /*
- * Release a non-recursive mutex.  The caller is responsible for determining
+ * Release a mutex of type NORMAL.  The caller is responsible for determining
  * that we are in fact the owner of this lock.
  */
-static inline void _normal_unlock(pthread_mutex_t* mutex, int shared) {
-    ANDROID_MEMBAR_FULL();
+static inline void _normal_mutex_unlock(atomic_int* mutex_value_ptr, int shared) {
+    const int unlocked         = shared | MUTEX_STATE_BITS_UNLOCKED;
+    const int locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
 
-    /*
-     * The mutex state will be 1 or (rarely) 2.  We use an atomic decrement
-     * to release the lock.  __bionic_atomic_dec() returns the previous value;
-     * if it wasn't 1 we have to do some additional work.
-     */
-    if (__bionic_atomic_dec(&mutex->value) != (shared|MUTEX_STATE_BITS_LOCKED_UNCONTENDED)) {
-        /*
-         * Start by releasing the lock.  The decrement changed it from
-         * "contended lock" to "uncontended lock", which means we still
-         * hold it, and anybody who tries to sneak in will push it back
-         * to state 2.
-         *
-         * Once we set it to zero the lock is up for grabs.  We follow
-         * this with a __futex_wake() to ensure that one of the waiting
-         * threads has a chance to grab it.
-         *
-         * This doesn't cause a race with the swap/wait pair in
-         * _normal_lock(), because the __futex_wait() call there will
-         * return immediately if the mutex value isn't 2.
-         */
-        mutex->value = shared;
-
-        /*
-         * Wake up one waiting thread.  We don't know which thread will be
-         * woken or when it'll start executing -- futexes make no guarantees
-         * here.  There may not even be a thread waiting.
-         *
-         * The newly-woken thread will replace the 0 we just set above
-         * with 2, which means that when it eventually releases the mutex
-         * it will also call FUTEX_WAKE.  This results in one extra wake
-         * call whenever a lock is contended, but lets us avoid forgetting
-         * anyone without requiring us to track the number of sleepers.
-         *
-         * It's possible for another thread to sneak in and grab the lock
-         * between the zero assignment above and the wake call below.  If
-         * the new thread is "slow" and holds the lock for a while, we'll
-         * wake up a sleeper, which will swap in a 2 and then go back to
-         * sleep since the lock is still held.  If the new thread is "fast",
-         * running to completion before we call wake, the thread we
-         * eventually wake will find an unlocked mutex and will execute.
-         * Either way we have correct behavior and nobody is orphaned on
-         * the wait queue.
-         */
-        __futex_wake_ex(&mutex->value, shared, 1);
+    // We use an atomic_exchange to release the lock. If the locked_contended
+    // state is returned, some thread is waiting for the lock, and we need to
+    // wake one of them up.
+    // A release fence is required to make our previous stores visible to the
+    // next lock owner.
+    if (atomic_exchange_explicit(mutex_value_ptr, unlocked,
+                                 memory_order_release) == locked_contended) {
+        // Wake up one waiting thread. We don't know which thread will be
+        // woken or when it'll start executing -- futexes make no guarantees
+        // here. There may not even be a thread waiting.
+        //
+        // The newly-woken thread will replace the unlocked state we just set above
+        // with locked_contended state, which means that when it eventually releases
+        // the mutex it will also call FUTEX_WAKE. This results in one extra wake
+        // call whenever a lock is contended, but lets us avoid forgetting anyone
+        // without requiring us to track the number of sleepers.
+        //
+        // It's possible for another thread to sneak in and grab the lock between
+        // the exchange above and the wake call below. If the new thread is "slow"
+        // and holds the lock for a while, we'll wake up a sleeper, which will swap
+        // in locked_uncontended state and then go back to sleep since the lock is
+        // still held. If the new thread is "fast", running to completion before
+        // we call wake, the thread we eventually wake will find an unlocked mutex
+        // and will execute. Either way we have correct behavior and nobody is
+        // orphaned on the wait queue.
+        __futex_wake_ex(mutex_value_ptr, shared, 1);
     }
 }
 
@@ -414,183 +374,175 @@
  * mvalue is the current mutex value (already loaded)
  * mutex pointers to the mutex.
  */
-static inline __always_inline int _recursive_increment(pthread_mutex_t* mutex, int mvalue, int mtype) {
+static inline __always_inline
+int _recursive_increment(atomic_int* mutex_value_ptr, int mvalue, int mtype) {
     if (mtype == MUTEX_TYPE_BITS_ERRORCHECK) {
-        /* trying to re-lock a mutex we already acquired */
+        // Trying to re-lock a mutex we already acquired.
         return EDEADLK;
     }
 
-    /* Detect recursive lock overflow and return EAGAIN.
-     * This is safe because only the owner thread can modify the
-     * counter bits in the mutex value.
-     */
+    // Detect recursive lock overflow and return EAGAIN.
+    // This is safe because only the owner thread can modify the
+    // counter bits in the mutex value.
     if (MUTEX_COUNTER_BITS_WILL_OVERFLOW(mvalue)) {
         return EAGAIN;
     }
 
-    /* We own the mutex, but other threads are able to change
-     * the lower bits (e.g. promoting it to "contended"), so we
-     * need to use an atomic cmpxchg loop to update the counter.
-     */
-    for (;;) {
-        /* increment counter, overflow was already checked */
-        int newval = mvalue + MUTEX_COUNTER_BITS_ONE;
-        if (__predict_true(__bionic_cmpxchg(mvalue, newval, &mutex->value) == 0)) {
-            /* mutex is still locked, not need for a memory barrier */
-            return 0;
-        }
-        /* the value was changed, this happens when another thread changes
-         * the lower state bits from 1 to 2 to indicate contention. This
-         * cannot change the counter, so simply reload and try again.
-         */
-        mvalue = mutex->value;
-    }
+    // We own the mutex, but other threads are able to change the lower bits
+    // (e.g. promoting it to "contended"), so we need to use an atomic exchange
+    // loop to update the counter. The counter will not overflow in the loop,
+    // as only the owner thread can change it.
+    // The mutex is still locked, so we don't need a release fence.
+    while (!atomic_compare_exchange_weak_explicit(mutex_value_ptr, &mvalue,
+                                                  mvalue + MUTEX_COUNTER_BITS_ONE,
+                                                  memory_order_relaxed,
+                                                  memory_order_relaxed)) { }
+    return 0;
 }
 
 int pthread_mutex_lock(pthread_mutex_t* mutex) {
+    atomic_int* mutex_value_ptr = MUTEX_TO_ATOMIC_POINTER(mutex);
+
     int mvalue, mtype, tid, shared;
 
-    mvalue = mutex->value;
+    mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
     mtype = (mvalue & MUTEX_TYPE_MASK);
     shared = (mvalue & MUTEX_SHARED_MASK);
 
-    /* Handle non-recursive case first */
+    // Handle common case first.
     if ( __predict_true(mtype == MUTEX_TYPE_BITS_NORMAL) ) {
-        _normal_lock(mutex, shared);
+        _normal_mutex_lock(mutex_value_ptr, shared);
         return 0;
     }
 
-    /* Do we already own this recursive or error-check mutex ? */
+    // Do we already own this recursive or error-check mutex?
     tid = __get_thread()->tid;
     if ( tid == MUTEX_OWNER_FROM_BITS(mvalue) )
-        return _recursive_increment(mutex, mvalue, mtype);
+        return _recursive_increment(mutex_value_ptr, mvalue, mtype);
 
-    /* Add in shared state to avoid extra 'or' operations below */
+    // Add in shared state to avoid extra 'or' operations below.
     mtype |= shared;
 
-    /* First, if the mutex is unlocked, try to quickly acquire it.
-     * In the optimistic case where this works, set the state to 1 to
-     * indicate locked with no contention */
+    // First, if the mutex is unlocked, try to quickly acquire it.
+    // In the optimistic case where this works, set the state to locked_uncontended.
     if (mvalue == mtype) {
         int newval = MUTEX_OWNER_TO_BITS(tid) | mtype | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
-        if (__bionic_cmpxchg(mvalue, newval, &mutex->value) == 0) {
-            ANDROID_MEMBAR_FULL();
+        // If the exchange succeeds, an acquire fence is required to make
+        // all memory accesses made by other threads visible to the current CPU.
+        if (__predict_true(atomic_compare_exchange_strong_explicit(mutex_value_ptr, &mvalue,
+                           newval, memory_order_acquire, memory_order_relaxed))) {
             return 0;
         }
-        /* argh, the value changed, reload before entering the loop */
-        mvalue = mutex->value;
     }
 
     ScopedTrace trace("Contending for pthread mutex");
 
-    for (;;) {
-        int newval;
-
-        /* if the mutex is unlocked, its value should be 'mtype' and
-         * we try to acquire it by setting its owner and state atomically.
-         * NOTE: We put the state to 2 since we _know_ there is contention
-         * when we are in this loop. This ensures all waiters will be
-         * unlocked.
-         */
+    while (true) {
         if (mvalue == mtype) {
-            newval = MUTEX_OWNER_TO_BITS(tid) | mtype | MUTEX_STATE_BITS_LOCKED_CONTENDED;
-            /* TODO: Change this to __bionic_cmpxchg_acquire when we
-             *        implement it to get rid of the explicit memory
-             *        barrier below.
-             */
-            if (__predict_false(__bionic_cmpxchg(mvalue, newval, &mutex->value) != 0)) {
-                mvalue = mutex->value;
-                continue;
-            }
-            ANDROID_MEMBAR_FULL();
-            return 0;
-        }
+            // If the mutex is unlocked, its value should be 'mtype' and
+            // we try to acquire it by setting its owner and state atomically.
+            // NOTE: We set the state to locked_contended since we _know_ there
+            // is contention when we are in this loop. This ensures all waiters
+            // will be unlocked.
 
-        /* the mutex is already locked by another thread, if its state is 1
-         * we will change it to 2 to indicate contention. */
-        if (MUTEX_STATE_BITS_IS_LOCKED_UNCONTENDED(mvalue)) {
-            newval = MUTEX_STATE_BITS_FLIP_CONTENTION(mvalue); /* locked state 1 => state 2 */
-            if (__predict_false(__bionic_cmpxchg(mvalue, newval, &mutex->value) != 0)) {
-                mvalue = mutex->value;
+            int newval = MUTEX_OWNER_TO_BITS(tid) | mtype | MUTEX_STATE_BITS_LOCKED_CONTENDED;
+            // If the exchange succeeds, an acquire fence is required to make
+            // all memory accesses made by other threads visible to the current CPU.
+            if (__predict_true(atomic_compare_exchange_weak_explicit(mutex_value_ptr,
+                                                                     &mvalue, newval,
+                                                                     memory_order_acquire,
+                                                                     memory_order_relaxed))) {
+                return 0;
+            }
+            continue;
+        } else if (MUTEX_STATE_BITS_IS_LOCKED_UNCONTENDED(mvalue)) {
+            // The mutex is already locked by another thread. If the state is
+            // locked_uncontended, we should set it to locked_contended before going
+            // to sleep. This makes sure waiters will be woken up eventually.
+
+            int newval = MUTEX_STATE_BITS_FLIP_CONTENTION(mvalue);
+            if (__predict_false(!atomic_compare_exchange_weak_explicit(mutex_value_ptr,
+                                                                       &mvalue, newval,
+                                                                       memory_order_relaxed,
+                                                                       memory_order_relaxed))) {
                 continue;
             }
             mvalue = newval;
         }
 
-        /* wait until the mutex is unlocked */
-        __futex_wait_ex(&mutex->value, shared, mvalue, NULL);
-
-        mvalue = mutex->value;
+        // We are in the locked_contended state; sleep until someone wakes us up.
+        __futex_wait_ex(mutex_value_ptr, shared, mvalue, NULL);
+        mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
     }
-    /* NOTREACHED */
 }
 
 int pthread_mutex_unlock(pthread_mutex_t* mutex) {
+    atomic_int* mutex_value_ptr = MUTEX_TO_ATOMIC_POINTER(mutex);
+
     int mvalue, mtype, tid, shared;
 
-    mvalue = mutex->value;
+    mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
     mtype  = (mvalue & MUTEX_TYPE_MASK);
     shared = (mvalue & MUTEX_SHARED_MASK);
 
-    /* Handle common case first */
+    // Handle common case first.
     if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
-        _normal_unlock(mutex, shared);
+        _normal_mutex_unlock(mutex_value_ptr, shared);
         return 0;
     }
 
-    /* Do we already own this recursive or error-check mutex ? */
+    // Do we already own this recursive or error-check mutex?
     tid = __get_thread()->tid;
     if ( tid != MUTEX_OWNER_FROM_BITS(mvalue) )
         return EPERM;
 
-    /* If the counter is > 0, we can simply decrement it atomically.
-     * Since other threads can mutate the lower state bits (and only the
-     * lower state bits), use a cmpxchg to do it.
-     */
+    // If the counter is > 0, we can simply decrement it atomically.
+    // Since other threads can mutate the lower state bits (and only the
+    // lower state bits), use a compare_exchange loop to do it.
     if (!MUTEX_COUNTER_BITS_IS_ZERO(mvalue)) {
-        for (;;) {
-            int newval = mvalue - MUTEX_COUNTER_BITS_ONE;
-            if (__predict_true(__bionic_cmpxchg(mvalue, newval, &mutex->value) == 0)) {
-                /* success: we still own the mutex, so no memory barrier */
-                return 0;
-            }
-            /* the value changed, so reload and loop */
-            mvalue = mutex->value;
-        }
+        // We still own the mutex, so a release fence is not needed.
+        while (!atomic_compare_exchange_weak_explicit(mutex_value_ptr, &mvalue,
+                                                      mvalue - MUTEX_COUNTER_BITS_ONE,
+                                                      memory_order_relaxed,
+                                                      memory_order_relaxed)) { }
+        return 0;
     }
 
-    /* the counter is 0, so we're going to unlock the mutex by resetting
-     * its value to 'unlocked'. We need to perform a swap in order
-     * to read the current state, which will be 2 if there are waiters
-     * to awake.
-     *
-     * TODO: Change this to __bionic_swap_release when we implement it
-     *        to get rid of the explicit memory barrier below.
-     */
-    ANDROID_MEMBAR_FULL();  /* RELEASE BARRIER */
-    mvalue = __bionic_swap(mtype | shared | MUTEX_STATE_BITS_UNLOCKED, &mutex->value);
-
-    /* Wake one waiting thread, if any */
+    // The counter is 0, so we're going to unlock the mutex by resetting its
+    // state to unlocked. We need to perform an atomic_exchange in order to read
+    // the current state, which will be locked_contended if there are waiters
+    // to wake.
+    // A release fence is required to make our previous stores visible to the
+    // next lock owner.
+    mvalue = atomic_exchange_explicit(mutex_value_ptr,
+                                      mtype | shared | MUTEX_STATE_BITS_UNLOCKED,
+                                      memory_order_release);
     if (MUTEX_STATE_BITS_IS_LOCKED_CONTENDED(mvalue)) {
-        __futex_wake_ex(&mutex->value, shared, 1);
+        __futex_wake_ex(mutex_value_ptr, shared, 1);
     }
+
     return 0;
 }
 
 int pthread_mutex_trylock(pthread_mutex_t* mutex) {
-    int mvalue = mutex->value;
+    atomic_int* mutex_value_ptr = MUTEX_TO_ATOMIC_POINTER(mutex);
+
+    int mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
     int mtype  = (mvalue & MUTEX_TYPE_MASK);
     int shared = (mvalue & MUTEX_SHARED_MASK);
 
     // Handle common case first.
     if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
-        if (__bionic_cmpxchg(shared|MUTEX_STATE_BITS_UNLOCKED,
-                             shared|MUTEX_STATE_BITS_LOCKED_UNCONTENDED,
-                             &mutex->value) == 0) {
-            ANDROID_MEMBAR_FULL();
+        mvalue = shared | MUTEX_STATE_BITS_UNLOCKED;
+        // If the exchange succeeds, an acquire fence is required to make
+        // all memory accesses made by other threads visible to the current CPU.
+        if (atomic_compare_exchange_strong_explicit(mutex_value_ptr,
+                                                    &mvalue,
+                                                    shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED,
+                                                    memory_order_acquire,
+                                                    memory_order_relaxed)) {
             return 0;
         }
-
         return EBUSY;
     }
 
@@ -600,158 +552,163 @@
         if (mtype == MUTEX_TYPE_BITS_ERRORCHECK) {
             return EBUSY;
         }
-        return _recursive_increment(mutex, mvalue, mtype);
+        return _recursive_increment(mutex_value_ptr, mvalue, mtype);
     }
 
-    /* Same as pthread_mutex_lock, except that we don't want to wait, and
-     * the only operation that can succeed is a single cmpxchg to acquire the
-     * lock if it is released / not owned by anyone. No need for a complex loop.
-     */
+    // Same as pthread_mutex_lock, except that we don't want to wait, and
+    // the only operation that can succeed is a single compare_exchange to acquire the
+    // lock if it is released / not owned by anyone. No need for a complex loop.
+    // If the exchange succeeds, an acquire fence is required to make
+    // all memory accesses made by other threads visible to the current CPU.
     mtype |= shared | MUTEX_STATE_BITS_UNLOCKED;
     mvalue = MUTEX_OWNER_TO_BITS(tid) | mtype | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
 
-    if (__predict_true(__bionic_cmpxchg(mtype, mvalue, &mutex->value) == 0)) {
-        ANDROID_MEMBAR_FULL();
+    if (__predict_true(atomic_compare_exchange_strong_explicit(mutex_value_ptr,
+                                                               &mtype, mvalue,
+                                                               memory_order_acquire,
+                                                               memory_order_relaxed))) {
         return 0;
     }
-
     return EBUSY;
 }
 
 static int __pthread_mutex_timedlock(pthread_mutex_t* mutex, const timespec* abs_ts, clockid_t clock) {
-  timespec ts;
+    atomic_int* mutex_value_ptr = MUTEX_TO_ATOMIC_POINTER(mutex);
 
-  int mvalue = mutex->value;
-  int mtype  = (mvalue & MUTEX_TYPE_MASK);
-  int shared = (mvalue & MUTEX_SHARED_MASK);
+    timespec ts;
 
-  // Handle common case first.
-  if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
-    const int unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
-    const int locked_uncontended = shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
-    const int locked_contended   = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
+    int mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
+    int mtype  = (mvalue & MUTEX_TYPE_MASK);
+    int shared = (mvalue & MUTEX_SHARED_MASK);
 
-    // Fast path for uncontended lock. Note: MUTEX_TYPE_BITS_NORMAL is 0.
-    if (__bionic_cmpxchg(unlocked, locked_uncontended, &mutex->value) == 0) {
-      ANDROID_MEMBAR_FULL();
-      return 0;
+    // Handle common case first.
+    if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
+        const int unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
+        const int locked_uncontended = shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
+        const int locked_contended   = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
+
+        // If the exchange succeeds, an acquire fence is required to make
+        // all memory accesses made by other threads visible to the current CPU.
+        mvalue = unlocked;
+        if (atomic_compare_exchange_strong_explicit(mutex_value_ptr, &mvalue, locked_uncontended,
+                                                    memory_order_acquire, memory_order_relaxed)) {
+            return 0;
+        }
+
+        ScopedTrace trace("Contending for timed pthread mutex");
+
+        // Same as pthread_mutex_lock, except that we can only wait for a specified
+        // time interval. If the lock is acquired, an acquire fence is needed to make
+        // all memory accesses made by other threads visible to the current CPU.
+        while (atomic_exchange_explicit(mutex_value_ptr, locked_contended,
+                                        memory_order_acquire) != unlocked) {
+            if (!timespec_from_absolute_timespec(ts, *abs_ts, clock)) {
+                return ETIMEDOUT;
+            }
+            __futex_wait_ex(mutex_value_ptr, shared, locked_contended, &ts);
+        }
+
+        return 0;
+    }
+
+    // Do we already own this recursive or error-check mutex?
+    pid_t tid = __get_thread()->tid;
+    if (tid == MUTEX_OWNER_FROM_BITS(mvalue)) {
+        return _recursive_increment(mutex_value_ptr, mvalue, mtype);
+    }
+
+    mtype |= shared;
+
+    // First try a quick lock.
+    if (mvalue == mtype) {
+        int newval = MUTEX_OWNER_TO_BITS(tid) | mtype | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
+        // If the exchange succeeds, an acquire fence is required to make
+        // all memory accesses made by other threads visible to the current CPU.
+        if (__predict_true(atomic_compare_exchange_strong_explicit(mutex_value_ptr,
+                                                                   &mvalue, newval,
+                                                                   memory_order_acquire,
+                                                                   memory_order_relaxed))) {
+            return 0;
+        }
     }
 
     ScopedTrace trace("Contending for timed pthread mutex");
 
-    // Loop while needed.
-    while (__bionic_swap(locked_contended, &mutex->value) != unlocked) {
-      if (!timespec_from_absolute_timespec(ts, *abs_ts, clock)) {
-        return ETIMEDOUT;
-      }
-      __futex_wait_ex(&mutex->value, shared, locked_contended, &ts);
+    // The following implements the same loop as pthread_mutex_lock,
+    // but adds checks to ensure that the operation never exceeds the
+    // absolute expiration time.
+    while (true) {
+        if (mvalue == mtype) { // Unlocked.
+            int newval = MUTEX_OWNER_TO_BITS(tid) | mtype | MUTEX_STATE_BITS_LOCKED_CONTENDED;
+            // An acquire fence is needed for a successful exchange.
+            if (!atomic_compare_exchange_strong_explicit(mutex_value_ptr, &mvalue, newval,
+                                                         memory_order_acquire,
+                                                         memory_order_relaxed)) {
+                goto check_time;
+            }
+
+            return 0;
+        } else if (MUTEX_STATE_BITS_IS_LOCKED_UNCONTENDED(mvalue)) {
+            // The value is locked. If the state is locked_uncontended, we need to switch
+            // it to locked_contended before sleeping, so we can get woken up later.
+            int newval = MUTEX_STATE_BITS_FLIP_CONTENTION(mvalue);
+            if (!atomic_compare_exchange_strong_explicit(mutex_value_ptr, &mvalue, newval,
+                                                         memory_order_relaxed,
+                                                         memory_order_relaxed)) {
+                goto check_time;
+            }
+            mvalue = newval;
+        }
+
+        if (!timespec_from_absolute_timespec(ts, *abs_ts, clock)) {
+            return ETIMEDOUT;
+        }
+
+        if (__futex_wait_ex(mutex_value_ptr, shared, mvalue, &ts) == -ETIMEDOUT) {
+            return ETIMEDOUT;
+        }
+
+check_time:
+        if (!timespec_from_absolute_timespec(ts, *abs_ts, clock)) {
+            return ETIMEDOUT;
+        }
+        // After futex_wait or the potentially costly timespec_from_absolute_timespec,
+        // we'd better read mvalue again in case it has changed.
+        mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
     }
-    ANDROID_MEMBAR_FULL();
-    return 0;
-  }
-
-  // Do we already own this recursive or error-check mutex?
-  pid_t tid = __get_thread()->tid;
-  if (tid == MUTEX_OWNER_FROM_BITS(mvalue)) {
-    return _recursive_increment(mutex, mvalue, mtype);
-  }
-
-  // The following implements the same loop as pthread_mutex_lock_impl
-  // but adds checks to ensure that the operation never exceeds the
-  // absolute expiration time.
-  mtype |= shared;
-
-  // First try a quick lock.
-  if (mvalue == mtype) {
-    mvalue = MUTEX_OWNER_TO_BITS(tid) | mtype | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
-    if (__predict_true(__bionic_cmpxchg(mtype, mvalue, &mutex->value) == 0)) {
-      ANDROID_MEMBAR_FULL();
-      return 0;
-    }
-    mvalue = mutex->value;
-  }
-
-  ScopedTrace trace("Contending for timed pthread mutex");
-
-  while (true) {
-    // If the value is 'unlocked', try to acquire it directly.
-    // NOTE: put state to 2 since we know there is contention.
-    if (mvalue == mtype) { // Unlocked.
-      mvalue = MUTEX_OWNER_TO_BITS(tid) | mtype | MUTEX_STATE_BITS_LOCKED_CONTENDED;
-      if (__bionic_cmpxchg(mtype, mvalue, &mutex->value) == 0) {
-        ANDROID_MEMBAR_FULL();
-        return 0;
-      }
-      // The value changed before we could lock it. We need to check
-      // the time to avoid livelocks, reload the value, then loop again.
-      if (!timespec_from_absolute_timespec(ts, *abs_ts, clock)) {
-        return ETIMEDOUT;
-      }
-
-      mvalue = mutex->value;
-      continue;
-    }
-
-    // The value is locked. If 'uncontended', try to switch its state
-    // to 'contented' to ensure we get woken up later.
-    if (MUTEX_STATE_BITS_IS_LOCKED_UNCONTENDED(mvalue)) {
-      int newval = MUTEX_STATE_BITS_FLIP_CONTENTION(mvalue);
-      if (__bionic_cmpxchg(mvalue, newval, &mutex->value) != 0) {
-        // This failed because the value changed, reload it.
-        mvalue = mutex->value;
-      } else {
-        // This succeeded, update mvalue.
-        mvalue = newval;
-      }
-    }
-
-    // Check time and update 'ts'.
-    if (!timespec_from_absolute_timespec(ts, *abs_ts, clock)) {
-      return ETIMEDOUT;
-    }
-
-    // Only wait to be woken up if the state is '2', otherwise we'll
-    // simply loop right now. This can happen when the second cmpxchg
-    // in our loop failed because the mutex was unlocked by another thread.
-    if (MUTEX_STATE_BITS_IS_LOCKED_CONTENDED(mvalue)) {
-      if (__futex_wait_ex(&mutex->value, shared, mvalue, &ts) == -ETIMEDOUT) {
-        return ETIMEDOUT;
-      }
-      mvalue = mutex->value;
-    }
-  }
-  /* NOTREACHED */
 }
 
 #if !defined(__LP64__)
 extern "C" int pthread_mutex_lock_timeout_np(pthread_mutex_t* mutex, unsigned ms) {
-  timespec abs_timeout;
-  clock_gettime(CLOCK_MONOTONIC, &abs_timeout);
-  abs_timeout.tv_sec  += ms / 1000;
-  abs_timeout.tv_nsec += (ms % 1000) * 1000000;
-  if (abs_timeout.tv_nsec >= NS_PER_S) {
-    abs_timeout.tv_sec++;
-    abs_timeout.tv_nsec -= NS_PER_S;
-  }
+    timespec abs_timeout;
+    clock_gettime(CLOCK_MONOTONIC, &abs_timeout);
+    abs_timeout.tv_sec  += ms / 1000;
+    abs_timeout.tv_nsec += (ms % 1000) * 1000000;
+    if (abs_timeout.tv_nsec >= NS_PER_S) {
+        abs_timeout.tv_sec++;
+        abs_timeout.tv_nsec -= NS_PER_S;
+    }
 
-  int error = __pthread_mutex_timedlock(mutex, &abs_timeout, CLOCK_MONOTONIC);
-  if (error == ETIMEDOUT) {
-    error = EBUSY;
-  }
-  return error;
+    int error = __pthread_mutex_timedlock(mutex, &abs_timeout, CLOCK_MONOTONIC);
+    if (error == ETIMEDOUT) {
+        error = EBUSY;
+    }
+    return error;
 }
 #endif
 
 int pthread_mutex_timedlock(pthread_mutex_t* mutex, const timespec* abs_timeout) {
-  return __pthread_mutex_timedlock(mutex, abs_timeout, CLOCK_REALTIME);
+    return __pthread_mutex_timedlock(mutex, abs_timeout, CLOCK_REALTIME);
 }
 
 int pthread_mutex_destroy(pthread_mutex_t* mutex) {
-  // Use trylock to ensure that the mutex is valid and not already locked.
-  int error = pthread_mutex_trylock(mutex);
-  if (error != 0) {
-    return error;
-  }
-  mutex->value = 0xdead10cc;
-  return 0;
+    // Use trylock to ensure that the mutex is valid and not already locked.
+    int error = pthread_mutex_trylock(mutex);
+    if (error != 0) {
+        return error;
+    }
+
+    atomic_int* mutex_value_ptr = MUTEX_TO_ATOMIC_POINTER(mutex);
+    atomic_store_explicit(mutex_value_ptr, 0xdead10cc, memory_order_relaxed);
+    return 0;
 }
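
For callers, the public entry point above keeps its POSIX contract: pthread_mutex_timedlock takes an absolute CLOCK_REALTIME deadline. A minimal usage sketch (the helper name is ours, not part of this change):

    #include <errno.h>
    #include <pthread.h>
    #include <time.h>

    // Returns 0 on success, ETIMEDOUT if the lock was still held at the deadline.
    int lock_with_2s_deadline(pthread_mutex_t* mutex) {
      timespec abs_timeout;
      clock_gettime(CLOCK_REALTIME, &abs_timeout);  // POSIX mandates CLOCK_REALTIME here.
      abs_timeout.tv_sec += 2;
      return pthread_mutex_timedlock(mutex, &abs_timeout);
    }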
diff --git a/libc/bionic/semaphore.cpp b/libc/bionic/semaphore.cpp
index dabfea0..0b04650 100644
--- a/libc/bionic/semaphore.cpp
+++ b/libc/bionic/semaphore.cpp
@@ -26,13 +26,19 @@
  * SUCH DAMAGE.
  */
 
+// Memory order requirements for POSIX semaphores appear unclear and are
+// currently interpreted inconsistently.
+// We conservatively prefer sequentially consistent operations for now.
+// CAUTION: This is more conservative than some other major implementations,
+// and may change if and when the issue is resolved.
+
 #include <semaphore.h>
 #include <errno.h>
 #include <limits.h>
+#include <stdatomic.h>
 #include <sys/time.h>
 #include <time.h>
 
-#include "private/bionic_atomic_inline.h"
 #include "private/bionic_constants.h"
 #include "private/bionic_futex.h"
 #include "private/bionic_time_conversions.h"
@@ -66,7 +72,7 @@
 #define SEMCOUNT_FROM_VALUE(val)    (((val) << SEMCOUNT_VALUE_SHIFT) & SEMCOUNT_VALUE_MASK)
 
 // Convert a sem->count bit pattern into the corresponding signed value.
-static inline int SEMCOUNT_TO_VALUE(uint32_t sval) {
+static inline int SEMCOUNT_TO_VALUE(unsigned int sval) {
   return (static_cast<int>(sval) >> SEMCOUNT_VALUE_SHIFT);
 }
 
@@ -79,11 +85,20 @@
 #define SEMCOUNT_DECREMENT(sval)    (((sval) - (1U << SEMCOUNT_VALUE_SHIFT)) & SEMCOUNT_VALUE_MASK)
 #define SEMCOUNT_INCREMENT(sval)    (((sval) + (1U << SEMCOUNT_VALUE_SHIFT)) & SEMCOUNT_VALUE_MASK)
 
-// Return the shared bitflag from a semaphore.
-static inline uint32_t SEM_GET_SHARED(sem_t* sem) {
-  return (sem->count & SEMCOUNT_SHARED_MASK);
+static inline atomic_uint* SEM_TO_ATOMIC_POINTER(sem_t* sem) {
+  static_assert(sizeof(atomic_uint) == sizeof(sem->count),
+                "sem->count should actually be atomic_uint in implementation.");
+
+  // We prefer casting to atomic_uint instead of declaring sem->count to be atomic_uint directly,
+  // because the latter would pollute semaphore.h.
+  return reinterpret_cast<atomic_uint*>(&sem->count);
 }
 
+// Return the shared bitflag from a semaphore counter.
+static inline unsigned int SEM_GET_SHARED(atomic_uint* sem_count_ptr) {
+  // memory_order_relaxed is used because the SHARED flag will not be changed after init.
+  return (atomic_load_explicit(sem_count_ptr, memory_order_relaxed) & SEMCOUNT_SHARED_MASK);
+}
 
 int sem_init(sem_t* sem, int pshared, unsigned int value) {
   // Ensure that 'value' can be stored in the semaphore.
@@ -92,10 +107,13 @@
     return -1;
   }
 
-  sem->count = SEMCOUNT_FROM_VALUE(value);
+  unsigned int count = SEMCOUNT_FROM_VALUE(value);
   if (pshared != 0) {
-    sem->count |= SEMCOUNT_SHARED_MASK;
+    count |= SEMCOUNT_SHARED_MASK;
   }
+
+  atomic_uint* sem_count_ptr = SEM_TO_ATOMIC_POINTER(sem);
+  atomic_init(sem_count_ptr, count);
   return 0;
 }
 
@@ -122,98 +140,97 @@
 // and return the old one. As a special case,
 // this returns immediately if the value is
 // negative (i.e. -1)
-static int __sem_dec(volatile uint32_t* sem) {
-  volatile int32_t* ptr = reinterpret_cast<volatile int32_t*>(sem);
-  uint32_t shared = (*sem & SEMCOUNT_SHARED_MASK);
-  uint32_t old_value, new_value;
-  int ret;
+static int __sem_dec(atomic_uint* sem_count_ptr) {
+  unsigned int old_value = atomic_load_explicit(sem_count_ptr, memory_order_relaxed);
+  unsigned int shared = old_value & SEMCOUNT_SHARED_MASK;
 
+  // Use memory_order_seq_cst in the atomic_compare_exchange operation to ensure that all
+  // memory accesses made by other threads can be seen by the current thread.
+  // An acquire fence may be sufficient, but it is still under discussion whether
+  // POSIX semaphores should provide sequential consistency.
   do {
-    old_value = (*sem & SEMCOUNT_VALUE_MASK);
-    ret = SEMCOUNT_TO_VALUE(old_value);
-    if (ret < 0) {
+    if (SEMCOUNT_TO_VALUE(old_value) < 0) {
       break;
     }
+  } while (!atomic_compare_exchange_weak(sem_count_ptr, &old_value,
+           SEMCOUNT_DECREMENT(old_value) | shared));
 
-    new_value = SEMCOUNT_DECREMENT(old_value);
-  } while (__bionic_cmpxchg((old_value|shared), (new_value|shared), ptr) != 0);
-
-  return ret;
+  return SEMCOUNT_TO_VALUE(old_value);
 }
 
 // Same as __sem_dec, but will not touch anything if the
 // value is already negative *or* 0. Returns the old value.
-static int __sem_trydec(volatile uint32_t* sem) {
-  volatile int32_t* ptr = reinterpret_cast<volatile int32_t*>(sem);
-  uint32_t shared = (*sem & SEMCOUNT_SHARED_MASK);
-  uint32_t old_value, new_value;
-  int          ret;
+static int __sem_trydec(atomic_uint* sem_count_ptr) {
+  unsigned int old_value = atomic_load_explicit(sem_count_ptr, memory_order_relaxed);
+  unsigned int shared = old_value & SEMCOUNT_SHARED_MASK;
 
+  // Use memory_order_seq_cst in the atomic_compare_exchange operation to ensure that all
+  // memory accesses made by other threads can be seen by the current thread.
+  // An acquire fence may be sufficient, but it is still under discussion whether
+  // POSIX semaphores should provide sequential consistency.
   do {
-    old_value = (*sem & SEMCOUNT_VALUE_MASK);
-    ret = SEMCOUNT_TO_VALUE(old_value);
-    if (ret <= 0) {
+    if (SEMCOUNT_TO_VALUE(old_value) <= 0) {
       break;
     }
+  } while (!atomic_compare_exchange_weak(sem_count_ptr, &old_value,
+           SEMCOUNT_DECREMENT(old_value) | shared));
 
-    new_value = SEMCOUNT_DECREMENT(old_value);
-  } while (__bionic_cmpxchg((old_value|shared), (new_value|shared), ptr) != 0);
-
-  return ret;
+  return SEMCOUNT_TO_VALUE(old_value);
 }
 
-
 // "Increment" the value of a semaphore atomically and
 // return its old value. Note that this implements
 // the special case of "incrementing" any negative
 // value to +1 directly.
 //
 // NOTE: The value will _not_ wrap above SEM_VALUE_MAX
-static int __sem_inc(volatile uint32_t* sem) {
-  volatile int32_t* ptr = reinterpret_cast<volatile int32_t*>(sem);
-  uint32_t shared = (*sem & SEMCOUNT_SHARED_MASK);
-  uint32_t old_value, new_value;
-  int ret;
+static int __sem_inc(atomic_uint* sem_count_ptr) {
+  unsigned int old_value = atomic_load_explicit(sem_count_ptr, memory_order_relaxed);
+  unsigned int shared = old_value & SEMCOUNT_SHARED_MASK;
+  unsigned int new_value;
 
+  // Use memory_order_seq_cst in the atomic_compare_exchange operation to ensure that all
+  // memory accesses made before it can be seen by other threads.
+  // A release fence may be sufficient, but it is still under discussion whether
+  // POSIX semaphores should provide sequential consistency.
   do {
-    old_value = (*sem & SEMCOUNT_VALUE_MASK);
-    ret = SEMCOUNT_TO_VALUE(old_value);
-
     // Can't go higher than SEM_VALUE_MAX.
-    if (ret == SEM_VALUE_MAX) {
+    if (SEMCOUNT_TO_VALUE(old_value) == SEM_VALUE_MAX) {
       break;
     }
 
-    // If the counter is negative, go directly to +1, otherwise just increment.
-    if (ret < 0) {
-        new_value = SEMCOUNT_ONE;
+    // If the counter is negative, go directly to one, otherwise just increment.
+    if (SEMCOUNT_TO_VALUE(old_value) < 0) {
+      new_value = SEMCOUNT_ONE | shared;
     } else {
-      new_value = SEMCOUNT_INCREMENT(old_value);
+      new_value = SEMCOUNT_INCREMENT(old_value) | shared;
     }
-  } while (__bionic_cmpxchg((old_value|shared), (new_value|shared), ptr) != 0);
+  } while (!atomic_compare_exchange_weak(sem_count_ptr, &old_value,
+           new_value));
 
-  return ret;
+  return SEMCOUNT_TO_VALUE(old_value);
 }
 
 int sem_wait(sem_t* sem) {
-  uint32_t shared = SEM_GET_SHARED(sem);
+  atomic_uint* sem_count_ptr = SEM_TO_ATOMIC_POINTER(sem);
+  unsigned int shared = SEM_GET_SHARED(sem_count_ptr);
 
   while (true) {
-    if (__sem_dec(&sem->count) > 0) {
-      ANDROID_MEMBAR_FULL();
+    if (__sem_dec(sem_count_ptr) > 0) {
       return 0;
     }
 
-    __futex_wait_ex(&sem->count, shared, shared|SEMCOUNT_MINUS_ONE, NULL);
+    __futex_wait_ex(sem_count_ptr, shared, shared | SEMCOUNT_MINUS_ONE, NULL);
   }
 }
 
 int sem_timedwait(sem_t* sem, const timespec* abs_timeout) {
+  atomic_uint* sem_count_ptr = SEM_TO_ATOMIC_POINTER(sem);
+
   // POSIX says we need to try to decrement the semaphore
   // before checking the timeout value. Note that if the
   // value is currently 0, __sem_trydec() does nothing.
-  if (__sem_trydec(&sem->count) > 0) {
-    ANDROID_MEMBAR_FULL();
+  if (__sem_trydec(sem_count_ptr) > 0) {
     return 0;
   }
 
@@ -223,7 +240,7 @@
     return -1;
   }
 
-  uint32_t shared = SEM_GET_SHARED(sem);
+  unsigned int shared = SEM_GET_SHARED(sem_count_ptr);
 
   while (true) {
     // POSIX mandates CLOCK_REALTIME here.
@@ -234,13 +251,12 @@
     }
 
     // Try to grab the semaphore. If the value was 0, this will also change it to -1.
-    if (__sem_dec(&sem->count) > 0) {
-      ANDROID_MEMBAR_FULL();
+    if (__sem_dec(sem_count_ptr) > 0) {
       break;
     }
 
     // Contention detected. Wait for a wakeup event.
-    int ret = __futex_wait_ex(&sem->count, shared, shared|SEMCOUNT_MINUS_ONE, &ts);
+    int ret = __futex_wait_ex(sem_count_ptr, shared, shared | SEMCOUNT_MINUS_ONE, &ts);
 
     // Return in case of timeout or interrupt.
     if (ret == -ETIMEDOUT || ret == -EINTR) {
@@ -252,13 +268,13 @@
 }
 
 int sem_post(sem_t* sem) {
-  uint32_t shared = SEM_GET_SHARED(sem);
+  atomic_uint* sem_count_ptr = SEM_TO_ATOMIC_POINTER(sem);
+  unsigned int shared = SEM_GET_SHARED(sem_count_ptr);
 
-  ANDROID_MEMBAR_FULL();
-  int old_value = __sem_inc(&sem->count);
+  int old_value = __sem_inc(sem_count_ptr);
   if (old_value < 0) {
     // Contention on the semaphore. Wake up all waiters.
-    __futex_wake_ex(&sem->count, shared, INT_MAX);
+    __futex_wake_ex(sem_count_ptr, shared, INT_MAX);
   } else if (old_value == SEM_VALUE_MAX) {
     // Overflow detected.
     errno = EOVERFLOW;
@@ -269,8 +285,8 @@
 }
 
 int sem_trywait(sem_t* sem) {
-  if (__sem_trydec(&sem->count) > 0) {
-    ANDROID_MEMBAR_FULL();
+  atomic_uint* sem_count_ptr = SEM_TO_ATOMIC_POINTER(sem);
+  if (__sem_trydec(sem_count_ptr) > 0) {
     return 0;
   } else {
     errno = EAGAIN;
@@ -279,7 +295,12 @@
 }
 
 int sem_getvalue(sem_t* sem, int* sval) {
-  int val = SEMCOUNT_TO_VALUE(sem->count);
+  atomic_uint* sem_count_ptr = SEM_TO_ATOMIC_POINTER(sem);
+
+  // Use memory_order_seq_cst in the atomic_load operation.
+  // memory_order_relaxed may be fine here, but it is still under discussion
+  // whether POSIX semaphores should provide sequential consistency.
+  int val = SEMCOUNT_TO_VALUE(atomic_load(sem_count_ptr));
   if (val < 0) {
     val = 0;
   }
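
As context for the seq_cst choice above, a minimal sketch of the guarantee the semaphore must provide: everything the posting thread wrote before sem_post must be visible to the thread that returns from sem_wait. (This example is ours; only the public API is used.)

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    static sem_t ready;
    static int payload;

    static void* producer(void*) {
      payload = 42;       // Must be visible to the consumer...
      sem_post(&ready);   // ...after this post (the release-like side).
      return NULL;
    }

    int main() {
      sem_init(&ready, 0 /* not shared between processes */, 0);
      pthread_t t;
      pthread_create(&t, NULL, producer, NULL);
      sem_wait(&ready);   // Acquire-like side; blocks via __futex_wait_ex if needed.
      printf("%d\n", payload);  // Guaranteed to print 42.
      pthread_join(t, NULL);
      sem_destroy(&ready);
      return 0;
    }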
diff --git a/libc/include/poll.h b/libc/include/poll.h
index 0199cab..7c16d81 100644
--- a/libc/include/poll.h
+++ b/libc/include/poll.h
@@ -38,8 +38,52 @@
 
 typedef unsigned int nfds_t;
 
-extern int poll(struct pollfd*, nfds_t, int);
-extern int ppoll(struct pollfd*, nfds_t, const struct timespec*, const sigset_t*);
+int poll(struct pollfd*, nfds_t, int);
+int ppoll(struct pollfd*, nfds_t, const struct timespec*, const sigset_t*);
+
+int __poll_chk(struct pollfd*, nfds_t, int, size_t);
+int __poll_real(struct pollfd*, nfds_t, int) __RENAME(poll);
+__errordecl(__poll_too_small_error, "poll: pollfd array smaller than fd count");
+
+int __ppoll_chk(struct pollfd*, nfds_t, const struct timespec*, const sigset_t*, size_t);
+int __ppoll_real(struct pollfd*, nfds_t, const struct timespec*, const sigset_t*) __RENAME(ppoll);
+__errordecl(__ppoll_too_small_error, "ppoll: pollfd array smaller than fd count");
+
+#if defined(__BIONIC_FORTIFY)
+
+__BIONIC_FORTIFY_INLINE
+int poll(struct pollfd* fds, nfds_t fd_count, int timeout) {
+#if defined(__clang__)
+  return __poll_chk(fds, fd_count, timeout, __bos(fds));
+#else
+  if (__bos(fds) != __BIONIC_FORTIFY_UNKNOWN_SIZE) {
+    if (!__builtin_constant_p(fd_count)) {
+      return __poll_chk(fds, fd_count, timeout, __bos(fds));
+    } else if (__bos(fds) / sizeof(*fds) < fd_count) {
+      __poll_too_small_error();
+    }
+  }
+  return __poll_real(fds, fd_count, timeout);
+#endif
+}
+
+__BIONIC_FORTIFY_INLINE
+int ppoll(struct pollfd* fds, nfds_t fd_count, const struct timespec* timeout, const sigset_t* mask) {
+#if defined(__clang__)
+  return __ppoll_chk(fds, fd_count, timeout, mask, __bos(fds));
+#else
+  if (__bos(fds) != __BIONIC_FORTIFY_UNKNOWN_SIZE) {
+    if (!__builtin_constant_p(fd_count)) {
+      return __ppoll_chk(fds, fd_count, timeout, mask, __bos(fds));
+    } else if (__bos(fds) / sizeof(*fds) < fd_count) {
+      __ppoll_too_small_error();
+    }
+  }
+  return __ppoll_real(fds, fd_count, timeout, mask);
+#endif
+}
+
+#endif
 
 __END_DECLS
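
A quick illustration of what these wrappers catch (function and variable names are ours): with gcc, a constant fd count that provably exceeds the buffer becomes the compile-time error declared above; in the remaining fortified cases the check moves to run time in __poll_chk.

    #include <poll.h>

    void undersized(void) {
      struct pollfd fds[1];
      // fd_count (2) is a compile-time constant larger than the array, so a
      // fortified gcc build fails with __poll_too_small_error; a clang build
      // calls __poll_chk, which aborts at run time instead.
      poll(fds, 2, 0);
    }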
 
diff --git a/libc/include/pthread.h b/libc/include/pthread.h
index 4281132..8d053ae 100644
--- a/libc/include/pthread.h
+++ b/libc/include/pthread.h
@@ -43,7 +43,7 @@
 #endif
 
 typedef struct {
-  int volatile value;
+  int value;
 #ifdef __LP64__
   char __reserved[36];
 #endif
diff --git a/libc/include/semaphore.h b/libc/include/semaphore.h
index 5827870..4ef13af 100644
--- a/libc/include/semaphore.h
+++ b/libc/include/semaphore.h
@@ -36,7 +36,7 @@
 struct timespec;
 
 typedef struct {
-  volatile unsigned int count;
+  unsigned int count;
 #ifdef __LP64__
   int __reserved[3];
 #endif
diff --git a/libc/include/stdio.h b/libc/include/stdio.h
index c0dac1a..b04aa24 100644
--- a/libc/include/stdio.h
+++ b/libc/include/stdio.h
@@ -207,16 +207,9 @@
 #define	L_tmpnam	1024	/* XXX must be == PATH_MAX */
 #define	TMP_MAX		308915776
 
-/* Always ensure that these are consistent with <fcntl.h> and <unistd.h>! */
-#ifndef SEEK_SET
-#define	SEEK_SET	0	/* set file offset to offset */
-#endif
-#ifndef SEEK_CUR
-#define	SEEK_CUR	1	/* set file offset to current plus offset */
-#endif
-#ifndef SEEK_END
-#define	SEEK_END	2	/* set file offset to EOF plus offset */
-#endif
+#define SEEK_SET 0
+#define SEEK_CUR 1
+#define SEEK_END 2
 
 /*
  * Functions defined in ANSI C standard.
diff --git a/libc/include/string.h b/libc/include/string.h
index 4ca77ae..d67928c 100644
--- a/libc/include/string.h
+++ b/libc/include/string.h
@@ -31,7 +31,6 @@
 
 #include <sys/cdefs.h>
 #include <stddef.h>
-#include <malloc.h>
 #include <xlocale.h>
 
 __BEGIN_DECLS
diff --git a/libc/include/sys/select.h b/libc/include/sys/select.h
index 553050b..0c4a823 100644
--- a/libc/include/sys/select.h
+++ b/libc/include/sys/select.h
@@ -31,7 +31,6 @@
 
 #include <linux/time.h>
 #include <signal.h>
-#include <string.h>
 #include <sys/cdefs.h>
 #include <sys/types.h>
 
@@ -49,7 +48,14 @@
 #define __FDMASK(fd) (1UL << ((fd) % NFDBITS))
 #define __FDS_BITS(set) (((fd_set*)(set))->fds_bits)
 
-#define FD_ZERO(set) (memset(set, 0, sizeof(*(fd_set*)(set))))
+/* Inline loop so we don't have to declare memset. */
+#define FD_ZERO(set) \
+  do { \
+    size_t __i; \
+    for (__i = 0; __i < __FDSET_LONGS; ++__i) { \
+      (set)->fds_bits[__i] = 0; \
+    } \
+  } while (0)
 
 extern void __FD_CLR_chk(int, fd_set*, size_t);
 extern void __FD_SET_chk(int, fd_set*, size_t);
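
The loop version is behaviorally identical to the old memset form; assuming __FDSET_LONGS longs per fd_set, a use site now expands roughly as follows:

    #include <sys/select.h>
    #include <unistd.h>

    void watch_stdin(void) {
      fd_set readfds;
      FD_ZERO(&readfds);  // now expands to the inline loop:
      // do {
      //   size_t __i;
      //   for (__i = 0; __i < __FDSET_LONGS; ++__i) {
      //     (&readfds)->fds_bits[__i] = 0;
      //   }
      // } while (0)
      FD_SET(STDIN_FILENO, &readfds);
    }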
diff --git a/libc/include/unistd.h b/libc/include/unistd.h
index c755715..6403d4a 100644
--- a/libc/include/unistd.h
+++ b/libc/include/unistd.h
@@ -35,14 +35,19 @@
 #include <sys/select.h>
 #include <sys/sysconf.h>
 
+#include <machine/posix_limits.h>
+
 __BEGIN_DECLS
 
-/* Standard file descriptor numbers. */
 #define STDIN_FILENO	0
 #define STDOUT_FILENO	1
 #define STDERR_FILENO	2
 
-/* Values for whence in fseek and lseek */
+#define F_OK 0
+#define X_OK 1
+#define W_OK 2
+#define R_OK 4
+
 #define SEEK_SET 0
 #define SEEK_CUR 1
 #define SEEK_END 2
@@ -68,8 +73,6 @@
 #define _PC_PRIO_IO 18
 #define _PC_SYNC_IO 19
 
-#include <machine/posix_limits.h>
-
 extern char** environ;
 
 extern __noreturn void _exit(int);
@@ -121,13 +124,6 @@
 extern long fpathconf(int, int);
 extern long pathconf(const char*, int);
 
-
-/* Macros for access() */
-#define R_OK  4  /* Read */
-#define W_OK  2  /* Write */
-#define X_OK  1  /* Execute */
-#define F_OK  0  /* Existence */
-
 extern int access(const char*, int);
 extern int faccessat(int, const char*, int, int);
 extern int link(const char*, const char*);
diff --git a/libc/tools/gensyscalls.py b/libc/tools/gensyscalls.py
index 4e24077..7e11418 100755
--- a/libc/tools/gensyscalls.py
+++ b/libc/tools/gensyscalls.py
@@ -286,8 +286,9 @@
     for alias in aliases:
         stub += function_alias % { "func" : syscall["func"], "alias" : alias }
 
-    # Use hidden visibility for any functions beginning with underscores.
-    if pointer_length == 64 and syscall["func"].startswith("__"):
+    # Use hidden visibility on LP64 for any functions beginning with underscores.
+    # Force hidden visibility for any functions which begin with three underscores.
+    if (pointer_length == 64 and syscall["func"].startswith("__")) or syscall["func"].startswith("___"):
         stub += '.hidden ' + syscall["func"] + '\n'
 
     return stub
diff --git a/libc/tools/zoneinfo/update-tzdata.py b/libc/tools/zoneinfo/update-tzdata.py
index 330f166..d5788af 100755
--- a/libc/tools/zoneinfo/update-tzdata.py
+++ b/libc/tools/zoneinfo/update-tzdata.py
@@ -117,13 +117,20 @@
   # Build the ICU tools.
   print 'Configuring ICU tools...'
   subprocess.check_call(['%s/runConfigureICU' % icu_dir, 'Linux'])
-  print 'Making ICU tools...'
-  subprocess.check_call(['make', '-j32'])
 
   # Run the ICU tools.
   os.chdir('tools/tzcode')
+
+  # The tz2icu tool only picks up icuregions and icuzones if they are in the CWD.
+  for icu_data_file in ['icuregions', 'icuzones']:
+    icu_data_file_source = '%s/tools/tzcode/%s' % (icu_dir, icu_data_file)
+    icu_data_file_symlink = './%s' % icu_data_file
+    os.symlink(icu_data_file_source, icu_data_file_symlink)
+
   shutil.copyfile('%s/%s' % (original_working_dir, data_filename), data_filename)
   print 'Making ICU data...'
+  # The Makefile assumes the existence of the bin directory.
+  os.mkdir('%s/bin' % icu_working_dir)
   subprocess.check_call(['make'])
 
   # Copy the source file to its ultimate destination.
@@ -133,7 +140,7 @@
 
   # Regenerate the .dat file.
   os.chdir(icu_working_dir)
-  subprocess.check_call(['make', '-j32'])
+  subprocess.check_call(['make', 'INCLUDE_UNI_CORE_DATA=1', '-j32'])
 
   # Copy the .dat file to its ultimate destination.
   icu_dat_data_dir = '%s/stubdata' % icu_dir
diff --git a/linker/linker.cpp b/linker/linker.cpp
index f7bcd27..3934484 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -798,7 +798,7 @@
 }
 
 ElfW(Sym)* soinfo::gnu_addr_lookup(const void* addr) {
-  ElfW(Addr) soaddr = reinterpret_cast<ElfW(Addr)>(addr) - base;
+  ElfW(Addr) soaddr = reinterpret_cast<ElfW(Addr)>(addr) - load_bias;
 
   for (size_t i = 0; i < nbucket_; ++i) {
     uint32_t n = bucket_[i];
@@ -819,7 +819,7 @@
 }
 
 ElfW(Sym)* soinfo::elf_addr_lookup(const void* addr) {
-  ElfW(Addr) soaddr = reinterpret_cast<ElfW(Addr)>(addr) - base;
+  ElfW(Addr) soaddr = reinterpret_cast<ElfW(Addr)>(addr) - load_bias;
 
   // Search the library's symbol table for any defined symbol which
   // contains this address.
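
The base/load_bias distinction matters because symbol st_value fields are link-time (p_vaddr-relative) addresses: load_bias is the delta between run-time and link-time addresses, while base is merely where the mapping starts, and the two differ whenever the first PT_LOAD segment has a non-zero p_vaddr. A sketch of the relationship (the segment names are illustrative):

    // If the first PT_LOAD segment has link-time address first_vaddr and the
    // image was mapped at run-time address map_start:
    //   base      = map_start
    //   load_bias = map_start - first_vaddr
    // A symbol with st_value v lives at run-time address load_bias + v, so the
    // reverse lookup must subtract load_bias, not base:
    ElfW(Addr) soaddr = reinterpret_cast<ElfW(Addr)>(addr) - load_bias;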
diff --git a/tests/Android.mk b/tests/Android.mk
index 82a92f0..bd4695f 100644
--- a/tests/Android.mk
+++ b/tests/Android.mk
@@ -329,6 +329,7 @@
 bionic-unit-tests-glibc_whole_static_libraries := \
     libBionicStandardTests \
     libBionicGtestMain \
+    $(fortify_libs) \
 
 bionic-unit-tests-glibc_ldlibs := \
     -lrt -ldl -lutil \
diff --git a/tests/fortify_test.cpp b/tests/fortify_test.cpp
index 48764aa..5cc728f 100644
--- a/tests/fortify_test.cpp
+++ b/tests/fortify_test.cpp
@@ -19,12 +19,20 @@
 
 #include <fcntl.h>
 #include <malloc.h>
+#include <poll.h>
 #include <signal.h>
 #include <stdarg.h>
 #include <string.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <time.h>
+
+#if defined(__BIONIC__)
+#define ASSERT_FORTIFY(expr) ASSERT_EXIT(expr, testing::KilledBySignal(SIGABRT), "FORTIFY")
+#else
+#define ASSERT_FORTIFY(expr) ASSERT_EXIT(expr, testing::KilledBySignal(SIGABRT), "")
+#endif
 
 // Fortify test code needs to run multiple times, so TEST_NAME macro is used to
 // distinguish different tests. TEST_NAME is defined in compilation command.
@@ -48,8 +56,7 @@
 TEST_F(DEATHTEST, stpncpy_fortified2) {
   foo myfoo;
   int copy_amt = atoi("11");
-  ASSERT_EXIT(stpncpy(myfoo.a, "01234567890", copy_amt),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(stpncpy(myfoo.a, "01234567890", copy_amt));
 }
 #endif
 
@@ -60,8 +67,7 @@
   foo myfoo;
   memset(&myfoo, 0, sizeof(myfoo));
   myfoo.one[0] = 'A'; // not null terminated string
-  ASSERT_EXIT(stpncpy(myfoo.b, myfoo.one, sizeof(myfoo.b)),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(stpncpy(myfoo.b, myfoo.one, sizeof(myfoo.b)));
 }
 #endif
 
@@ -71,8 +77,7 @@
 TEST_F(DEATHTEST, strncpy_fortified2) {
   foo myfoo;
   int copy_amt = atoi("11");
-  ASSERT_EXIT(strncpy(myfoo.a, "01234567890", copy_amt),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncpy(myfoo.a, "01234567890", copy_amt));
 }
 #endif
 
@@ -83,8 +88,7 @@
   foo myfoo;
   memset(&myfoo, 0, sizeof(myfoo));
   myfoo.one[0] = 'A'; // not null terminated string
-  ASSERT_EXIT(strncpy(myfoo.b, myfoo.one, sizeof(myfoo.b)),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncpy(myfoo.b, myfoo.one, sizeof(myfoo.b)));
 }
 #endif
 
@@ -95,8 +99,7 @@
   foo myfoo;
   char source_buf[15];
   memcpy(source_buf, "12345678901234", 15);
-  ASSERT_EXIT(sprintf(myfoo.a, "%s", source_buf),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(sprintf(myfoo.a, "%s", source_buf));
 }
 #endif
 
@@ -105,8 +108,7 @@
 // this buffer overflow. TODO: Fix clang.
 TEST_F(DEATHTEST, sprintf2_fortified2) {
   foo myfoo;
-  ASSERT_EXIT(sprintf(myfoo.a, "0123456789"),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(sprintf(myfoo.a, "0123456789"));
 }
 #endif
 
@@ -125,11 +127,11 @@
 }
 
 TEST_F(DEATHTEST, vsprintf_fortified2) {
-  ASSERT_EXIT(vsprintf_helper2("%s", "0123456789"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(vsprintf_helper2("%s", "0123456789"));
 }
 
 TEST_F(DEATHTEST, vsprintf2_fortified2) {
-  ASSERT_EXIT(vsprintf_helper2("0123456789"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(vsprintf_helper2("0123456789"));
 }
 #endif
 
@@ -149,11 +151,11 @@
 }
 
 TEST_F(DEATHTEST, vsnprintf_fortified2) {
-  ASSERT_EXIT(vsnprintf_helper2("%s", "0123456789"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(vsnprintf_helper2("%s", "0123456789"));
 }
 
 TEST_F(DEATHTEST, vsnprintf2_fortified2) {
-  ASSERT_EXIT(vsnprintf_helper2("0123456789"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(vsnprintf_helper2("0123456789"));
 }
 #endif
 
@@ -165,8 +167,7 @@
 #if defined(__BIONIC__)
   foo myfoo;
   char* src = strdup("");
-  ASSERT_EXIT(stpcpy(myfoo.empty, src),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(stpcpy(myfoo.empty, src));
   free(src);
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
@@ -182,8 +183,7 @@
 #if defined(__BIONIC__)
   foo myfoo;
   char* src = strdup("");
-  ASSERT_EXIT(strcpy(myfoo.empty, src),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcpy(myfoo.empty, src));
   free(src);
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
@@ -199,8 +199,7 @@
 #if defined(__BIONIC__)
   foo myfoo;
   char* src = strdup("1");
-  ASSERT_EXIT(strcpy(myfoo.empty, src),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcpy(myfoo.empty, src));
   free(src);
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
@@ -216,8 +215,7 @@
 #if defined(__BIONIC__)
   foo myfoo;
   char* src = strdup("12");
-  ASSERT_EXIT(strcpy(myfoo.one, src),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcpy(myfoo.one, src));
   free(src);
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
@@ -233,8 +231,7 @@
   foo myfoo;
   memcpy(myfoo.a, "0123456789", sizeof(myfoo.a));
   myfoo.b[0] = '\0';
-  ASSERT_EXIT(printf("%s", strchr(myfoo.a, 'a')),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(printf("%s", strchr(myfoo.a, 'a')));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -249,8 +246,7 @@
   foo myfoo;
   memcpy(myfoo.a, "0123456789", 10);
   memcpy(myfoo.b, "01234", 6);
-  ASSERT_EXIT(printf("%s", strrchr(myfoo.a, 'a')),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(printf("%s", strrchr(myfoo.a, 'a')));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -265,8 +261,7 @@
   foo myfoo;
   strcpy(myfoo.a, "01");
   size_t n = strlen(myfoo.a);
-  ASSERT_EXIT(strlcpy(myfoo.one, myfoo.a, n),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strlcpy(myfoo.one, myfoo.a, n));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -282,8 +277,7 @@
   strcpy(myfoo.a, "01");
   myfoo.one[0] = '\0';
   size_t n = strlen(myfoo.a);
-  ASSERT_EXIT(strlcat(myfoo.one, myfoo.a, n),
-              testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strlcat(myfoo.one, myfoo.a, n));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -297,7 +291,7 @@
   foo myfoo;
   size_t n = atoi("10"); // avoid compiler optimizations
   strncpy(myfoo.a, "012345678", n);
-  ASSERT_EXIT(strncat(myfoo.a, "9", n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncat(myfoo.a, "9", n));
 }
 #endif
 
@@ -308,7 +302,7 @@
   foo myfoo;
   myfoo.a[0] = '\0';
   size_t n = atoi("10"); // avoid compiler optimizations
-  ASSERT_EXIT(strncat(myfoo.a, "0123456789", n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncat(myfoo.a, "0123456789", n));
 }
 #endif
 
@@ -317,7 +311,7 @@
   memcpy(myfoo.a, "0123456789", sizeof(myfoo.a)); // unterminated string
   myfoo.b[0] = '\0';
   size_t n = atoi("10"); // avoid compiler optimizations
-  ASSERT_EXIT(strncat(myfoo.b, myfoo.a, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncat(myfoo.b, myfoo.a, n));
 }
 
 #ifndef __clang__
@@ -328,7 +322,7 @@
   strcpy(src, "0123456789");
   foo myfoo;
   myfoo.a[0] = '\0';
-  ASSERT_EXIT(strcat(myfoo.a, src), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcat(myfoo.a, src));
 }
 #endif
 
@@ -336,21 +330,21 @@
   foo myfoo;
   memcpy(myfoo.a, "0123456789", sizeof(myfoo.a)); // unterminated string
   myfoo.b[0] = '\0';
-  ASSERT_EXIT(strcat(myfoo.b, myfoo.a), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcat(myfoo.b, myfoo.a));
 }
 
 TEST_F(DEATHTEST, snprintf_fortified2) {
   foo myfoo;
   strcpy(myfoo.a, "012345678");
   size_t n = strlen(myfoo.a) + 2;
-  ASSERT_EXIT(snprintf(myfoo.b, n, "a%s", myfoo.a), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(snprintf(myfoo.b, n, "a%s", myfoo.a));
 }
 
 TEST_F(DEATHTEST, bzero_fortified2) {
   foo myfoo;
   memcpy(myfoo.b, "0123456789", sizeof(myfoo.b));
   size_t n = atoi("11");
-  ASSERT_EXIT(bzero(myfoo.b, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(bzero(myfoo.b, n));
 }
 
 #endif /* defined(_FORTIFY_SOURCE) && _FORTIFY_SOURCE=2 */
@@ -360,7 +354,7 @@
 #if defined(__BIONIC__)
   char buf[10];
   char *orig = strdup("0123456789");
-  ASSERT_EXIT(strcpy(buf, orig), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcpy(buf, orig));
   free(orig);
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
@@ -372,7 +366,7 @@
 #if defined(__BIONIC__)
   char buf[0];
   char *orig = strdup("");
-  ASSERT_EXIT(strcpy(buf, orig), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcpy(buf, orig));
   free(orig);
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
@@ -384,7 +378,7 @@
 #if defined(__BIONIC__)
   char buf[0];
   char *orig = strdup("1");
-  ASSERT_EXIT(strcpy(buf, orig), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcpy(buf, orig));
   free(orig);
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
@@ -396,7 +390,7 @@
 #if defined(__BIONIC__)
   char buf[1];
   char *orig = strdup("12");
-  ASSERT_EXIT(strcpy(buf, orig), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcpy(buf, orig));
   free(orig);
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
@@ -407,7 +401,7 @@
 #if defined(__BIONIC__)
   char buf[10];
   memcpy(buf, "0123456789", sizeof(buf));
-  ASSERT_EXIT(printf("%zd", strlen(buf)), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(printf("%zd", strlen(buf)));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -417,7 +411,7 @@
 #if defined(__BIONIC__)
   char buf[10];
   memcpy(buf, "0123456789", sizeof(buf));
-  ASSERT_EXIT(printf("%s", strchr(buf, 'a')), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(printf("%s", strchr(buf, 'a')));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -427,7 +421,7 @@
 #if defined(__BIONIC__)
   char buf[10];
   memcpy(buf, "0123456789", sizeof(buf));
-  ASSERT_EXIT(printf("%s", strrchr(buf, 'a')), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(printf("%s", strrchr(buf, 'a')));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -439,7 +433,7 @@
   char bufb[10];
   strcpy(bufa, "01234567890123");
   size_t n = strlen(bufa);
-  ASSERT_EXIT(strlcpy(bufb, bufa, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strlcpy(bufb, bufa, n));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -452,7 +446,7 @@
   bufb[0] = '\0';
   strcpy(bufa, "01234567890123");
   size_t n = strlen(bufa);
-  ASSERT_EXIT(strlcat(bufb, bufa, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strlcat(bufb, bufa, n));
 #else // __BIONIC__
   GTEST_LOG_(INFO) << "This test does nothing.\n";
 #endif // __BIONIC__
@@ -462,7 +456,7 @@
   char buf[10];
   char source_buf[15];
   memcpy(source_buf, "12345678901234", 15);
-  ASSERT_EXIT(sprintf(buf, "%s", source_buf), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(sprintf(buf, "%s", source_buf));
 }
 
 #ifndef __clang__
@@ -472,14 +466,14 @@
   char* buf = (char *) malloc(10);
   char source_buf[11];
   memcpy(source_buf, "1234567890", 11);
-  ASSERT_EXIT(sprintf(buf, "%s", source_buf), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(sprintf(buf, "%s", source_buf));
   free(buf);
 }
 #endif
 
 TEST_F(DEATHTEST, sprintf2_fortified) {
   char buf[5];
-  ASSERT_EXIT(sprintf(buf, "aaaaa"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(sprintf(buf, "aaaaa"));
 }
 
 static int vsprintf_helper(const char *fmt, ...) {
@@ -494,11 +488,11 @@
 }
 
 TEST_F(DEATHTEST, vsprintf_fortified) {
-  ASSERT_EXIT(vsprintf_helper("%s", "0123456789"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(vsprintf_helper("%s", "0123456789"));
 }
 
 TEST_F(DEATHTEST, vsprintf2_fortified) {
-  ASSERT_EXIT(vsprintf_helper("0123456789"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(vsprintf_helper("0123456789"));
 }
 
 static int vsnprintf_helper(const char *fmt, ...) {
@@ -514,25 +508,25 @@
 }
 
 TEST_F(DEATHTEST, vsnprintf_fortified) {
-  ASSERT_EXIT(vsnprintf_helper("%s", "0123456789"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(vsnprintf_helper("%s", "0123456789"));
 }
 
 TEST_F(DEATHTEST, vsnprintf2_fortified) {
-  ASSERT_EXIT(vsnprintf_helper("0123456789"), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(vsnprintf_helper("0123456789"));
 }
 
 TEST_F(DEATHTEST, strncat_fortified) {
   char buf[10];
   size_t n = atoi("10"); // avoid compiler optimizations
   strncpy(buf, "012345678", n);
-  ASSERT_EXIT(strncat(buf, "9", n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncat(buf, "9", n));
 }
 
 TEST_F(DEATHTEST, strncat2_fortified) {
   char buf[10];
   buf[0] = '\0';
   size_t n = atoi("10"); // avoid compiler optimizations
-  ASSERT_EXIT(strncat(buf, "0123456789", n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncat(buf, "0123456789", n));
 }
 
 TEST_F(DEATHTEST, strcat_fortified) {
@@ -540,14 +534,14 @@
   strcpy(src, "0123456789");
   char buf[10];
   buf[0] = '\0';
-  ASSERT_EXIT(strcat(buf, src), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strcat(buf, src));
 }
 
 TEST_F(DEATHTEST, memmove_fortified) {
   char buf[20];
   strcpy(buf, "0123456789");
   size_t n = atoi("10");
-  ASSERT_EXIT(memmove(buf + 11, buf, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(memmove(buf + 11, buf, n));
 }
 
 TEST_F(DEATHTEST, memcpy_fortified) {
@@ -555,7 +549,7 @@
   char bufb[10];
   strcpy(bufa, "012345678");
   size_t n = atoi("11");
-  ASSERT_EXIT(memcpy(bufb, bufa, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(memcpy(bufb, bufa, n));
 }
 
 TEST_F(DEATHTEST, stpncpy_fortified) {
@@ -563,14 +557,14 @@
   char bufb[10];
   strcpy(bufa, "01234567890123");
   size_t n = strlen(bufa);
-  ASSERT_EXIT(stpncpy(bufb, bufa, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(stpncpy(bufb, bufa, n));
 }
 
 TEST_F(DEATHTEST, stpncpy2_fortified) {
   char dest[11];
   char src[10];
   memcpy(src, "0123456789", sizeof(src)); // src is not null terminated
-  ASSERT_EXIT(stpncpy(dest, src, sizeof(dest)), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(stpncpy(dest, src, sizeof(dest)));
 }
 
 TEST_F(DEATHTEST, strncpy_fortified) {
@@ -578,7 +572,7 @@
   char bufb[10];
   strcpy(bufa, "01234567890123");
   size_t n = strlen(bufa);
-  ASSERT_EXIT(strncpy(bufb, bufa, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncpy(bufb, bufa, n));
 }
 
 
@@ -586,7 +580,7 @@
   char dest[11];
   char src[10];
   memcpy(src, "0123456789", sizeof(src)); // src is not null terminated
-  ASSERT_EXIT(strncpy(dest, src, sizeof(dest)), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(strncpy(dest, src, sizeof(dest)));
 }
 
 TEST_F(DEATHTEST, snprintf_fortified) {
@@ -594,55 +588,46 @@
   char bufb[10];
   strcpy(bufa, "0123456789");
   size_t n = strlen(bufa) + 1;
-  ASSERT_EXIT(snprintf(bufb, n, "%s", bufa), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(snprintf(bufb, n, "%s", bufa));
 }
 
 TEST_F(DEATHTEST, bzero_fortified) {
   char buf[10];
   memcpy(buf, "0123456789", sizeof(buf));
   size_t n = atoi("11");
-  ASSERT_EXIT(bzero(buf, n), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(bzero(buf, n));
 }
 
 TEST_F(DEATHTEST, umask_fortified) {
   mode_t mask = atoi("1023");  // 01777 in octal
-  ASSERT_EXIT(umask(mask), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(umask(mask));
 }
 
 TEST_F(DEATHTEST, recv_fortified) {
   size_t data_len = atoi("11"); // suppress compiler optimizations
   char buf[10];
-  ASSERT_EXIT(recv(0, buf, data_len, 0), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(recv(0, buf, data_len, 0));
 }
 
 TEST_F(DEATHTEST, FD_ISSET_fortified) {
 #if defined(__BIONIC__) // glibc catches this at compile-time.
   fd_set set;
   memset(&set, 0, sizeof(set));
-  ASSERT_EXIT(FD_ISSET(-1, &set), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(FD_ISSET(-1, &set));
 #endif
 }
 
 TEST_F(DEATHTEST, FD_ISSET_2_fortified) {
   char buf[1];
   fd_set* set = (fd_set*) buf;
-  ASSERT_EXIT(FD_ISSET(0, set), testing::KilledBySignal(SIGABRT), "");
-}
-
-// gtest's ASSERT_EXIT needs a valid expression, but glibc has a do-while macro.
-static void FD_ZERO_function(fd_set* s) { FD_ZERO(s); }
-
-TEST_F(DEATHTEST, FD_ZERO_fortified) {
-  char buf[1];
-  fd_set* set = (fd_set*) buf;
-  ASSERT_EXIT(FD_ZERO_function(set), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(FD_ISSET(0, set));
 }
 
 TEST_F(DEATHTEST, read_fortified) {
   char buf[1];
   size_t ct = atoi("2"); // prevent optimizations
   int fd = open("/dev/null", O_RDONLY);
-  ASSERT_EXIT(read(fd, buf, ct), testing::KilledBySignal(SIGABRT), "");
+  ASSERT_FORTIFY(read(fd, buf, ct));
   close(fd);
 }
 
@@ -950,3 +935,19 @@
   sprintf(BUF_AND_CONTENTS(buf));
   EXPECT_STREQ(CONTENTS, buf);
 }
+
+TEST_F(DEATHTEST, poll_fortified) {
+  nfds_t fd_count = atoi("2"); // suppress compiler optimizations
+  pollfd buf[1] = {{0, POLLIN, 0}};
+  // Set timeout to zero to prevent waiting in poll when fortify test fails.
+  ASSERT_FORTIFY(poll(buf, fd_count, 0));
+}
+
+TEST_F(DEATHTEST, ppoll_fortified) {
+  nfds_t fd_count = atoi("2"); // suppress compiler optimizations
+  pollfd buf[1] = {{0, POLLIN, 0}};
+  // Set timeout to zero to prevent waiting in ppoll when fortify test fails.
+  timespec timeout;
+  timeout.tv_sec = timeout.tv_nsec = 0;
+  ASSERT_FORTIFY(ppoll(buf, fd_count, &timeout, NULL));
+}
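
For reference, a plausible sketch of the bionic/__poll_chk.cpp these tests exercise (the file itself is not shown in this diff, and the exact failure hook is an assumption):

    #include <poll.h>

    #include "private/libc_logging.h"  // assumed location of __fortify_chk_fail

    extern "C" int __poll_chk(pollfd* fds, nfds_t fd_count, int timeout, size_t fds_size) {
      // Abort with a FORTIFY message if the buffer can't hold fd_count entries.
      if (__predict_false(fds_size / sizeof(*fds) < fd_count)) {
        __fortify_chk_fail("poll: pollfd array smaller than fd count", 0);
      }
      return poll(fds, fd_count, timeout);
    }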
diff --git a/tests/gtest_main.cpp b/tests/gtest_main.cpp
index e199449..86d6466 100644
--- a/tests/gtest_main.cpp
+++ b/tests/gtest_main.cpp
@@ -16,9 +16,12 @@
 
 #include <gtest/gtest.h>
 
+#include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <inttypes.h>
+#include <limits.h>
+#include <signal.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
@@ -93,7 +96,10 @@
          "      It takes effect only in isolation mode. Default warnline is 2000 ms.\n"
          "  --gtest-filter=POSITIVE_PATTERNS[-NEGATIVE_PATTERNS]\n"
          "      Used as a synonym for --gtest_filter option in gtest.\n"
-         "\nDefault bionic unit test option is -j.\n"
+         "Default bionic unit test option is -j.\n"
+         "In isolation mode, you can send SIGQUIT to the parent process to show current\n"
+         "running tests, or send SIGINT to the parent process to stop testing and\n"
+         "clean up current running tests.\n"
          "\n");
 }
 
@@ -223,13 +229,13 @@
   int towrite = strlen(buf);
   char* p = buf;
   while (towrite > 0) {
-    ssize_t write_count = TEMP_FAILURE_RETRY(write(child_output_fd, p, towrite));
-    if (write_count == -1) {
+    ssize_t bytes_written = TEMP_FAILURE_RETRY(write(child_output_fd, p, towrite));
+    if (bytes_written == -1) {
       fprintf(stderr, "failed to write child_output_fd: %s\n", strerror(errno));
       exit(1);
     } else {
-      towrite -= write_count;
-      p += write_count;
+      towrite -= bytes_written;
+      p += bytes_written;
     }
   }
 }
@@ -488,6 +494,18 @@
   fclose(fp);
 }
 
+struct ChildProcInfo {
+  pid_t pid;
+  int64_t start_time_ns;
+  int64_t end_time_ns;
+  int64_t deadline_end_time_ns; // The time at which the test is considered to have timed out.
+  size_t testcase_id, test_id;
+  bool finished;
+  bool timed_out;
+  int exit_status;
+  int child_read_fd; // File descriptor to read child test failure info.
+};
+
 // Forked Child process, run the single test.
 static void ChildProcessFn(int argc, char** argv, const std::string& test_name) {
   char** new_argv = new char*[argc + 2];
@@ -505,57 +523,142 @@
   exit(result);
 }
 
-struct ChildProcInfo {
-  pid_t pid;
-  int64_t start_time_ns;
-  int64_t deadline_time_ns;
-  size_t testcase_id, test_id;
-  bool done_flag;
-  bool timeout_flag;
-  int exit_status;
-  int child_read_fd;
-  ChildProcInfo() : pid(0) {}
-};
+static ChildProcInfo RunChildProcess(const std::string& test_name, int testcase_id, int test_id,
+                                     sigset_t sigmask, int argc, char** argv) {
+  int pipefd[2];
+  int ret = pipe2(pipefd, O_NONBLOCK);
+  if (ret == -1) {
+    perror("pipe2 in RunTestInSeparateProc");
+    exit(1);
+  }
+  pid_t pid = fork();
+  if (pid == -1) {
+    perror("fork in RunTestInSeparateProc");
+    exit(1);
+  } else if (pid == 0) {
+    // In child process, run a single test.
+    close(pipefd[0]);
+    child_output_fd = pipefd[1];
 
-static void WaitChildProcs(std::vector<ChildProcInfo>& child_proc_list) {
-  pid_t result;
-  int status;
-  bool loop_flag = true;
+    if (sigprocmask(SIG_SETMASK, &sigmask, NULL) == -1) {
+      perror("sigprocmask SIG_SETMASK");
+      exit(1);
+    }
+    ChildProcessFn(argc, argv, test_name);
+    // Unreachable.
+  }
+  // In parent process, initialize child process info.
+  close(pipefd[1]);
+  ChildProcInfo child_proc;
+  child_proc.child_read_fd = pipefd[0];
+  child_proc.pid = pid;
+  child_proc.start_time_ns = NanoTime();
+  child_proc.deadline_end_time_ns = child_proc.start_time_ns + GetDeadlineInfo(test_name) * 1000000LL;
+  child_proc.testcase_id = testcase_id;
+  child_proc.test_id = test_id;
+  child_proc.finished = false;
+  return child_proc;
+}
 
+static void HandleSignals(std::vector<TestCase>& testcase_list,
+                            std::vector<ChildProcInfo>& child_proc_list) {
+  sigset_t waiting_mask;
+  sigemptyset(&waiting_mask);
+  sigaddset(&waiting_mask, SIGINT);
+  sigaddset(&waiting_mask, SIGQUIT);
+  timespec timeout;
+  timeout.tv_sec = timeout.tv_nsec = 0;
   while (true) {
-    while ((result = waitpid(-1, &status, WNOHANG)) == -1) {
-      if (errno != EINTR) {
-        break;
+    int signo = TEMP_FAILURE_RETRY(sigtimedwait(&waiting_mask, NULL, &timeout));
+    if (signo == -1) {
+      if (errno == EAGAIN) {
+        return; // Timeout, no pending signals.
+      }
+      perror("sigtimedwait");
+      exit(1);
+    } else if (signo == SIGQUIT) {
+      // Print current running tests.
+      printf("List of current running tests:\n");
+      for (auto& child_proc : child_proc_list) {
+        if (child_proc.pid != 0) {
+          std::string test_name = testcase_list[child_proc.testcase_id].GetTestName(child_proc.test_id);
+          int64_t current_time_ns = NanoTime();
+          int64_t run_time_ms = (current_time_ns - child_proc.start_time_ns) / 1000000;
+          printf("  %s (%" PRId64 " ms)\n", test_name.c_str(), run_time_ms);
+        }
+      }
+    } else if (signo == SIGINT) {
+      // Kill current running tests.
+      for (auto& child_proc : child_proc_list) {
+        if (child_proc.pid != 0) {
+          // Send SIGKILL to ensure the child process can be killed unconditionally.
+          kill(child_proc.pid, SIGKILL);
+        }
+      }
+      // SIGINT kills the parent process as well.
+      exit(1);
+    }
+  }
+}
+
+static bool CheckChildProcExit(pid_t exit_pid, int exit_status,
+                               std::vector<ChildProcInfo>& child_proc_list) {
+  for (size_t i = 0; i < child_proc_list.size(); ++i) {
+    if (child_proc_list[i].pid == exit_pid) {
+      child_proc_list[i].finished = true;
+      child_proc_list[i].timed_out = false;
+      child_proc_list[i].exit_status = exit_status;
+      child_proc_list[i].end_time_ns = NanoTime();
+      return true;
+    }
+  }
+  return false;
+}
+
+static size_t CheckChildProcTimeout(std::vector<ChildProcInfo>& child_proc_list) {
+  int64_t current_time_ns = NanoTime();
+  size_t timeout_child_count = 0;
+  for (size_t i = 0; i < child_proc_list.size(); ++i) {
+    if (child_proc_list[i].deadline_end_time_ns <= current_time_ns) {
+      child_proc_list[i].finished = true;
+      child_proc_list[i].timed_out = true;
+      child_proc_list[i].end_time_ns = current_time_ns;
+      ++timeout_child_count;
+    }
+  }
+  return timeout_child_count;
+}
+
+static void WaitChildProcs(std::vector<TestCase>& testcase_list,
+                           std::vector<ChildProcInfo>& child_proc_list) {
+  size_t finished_child_count = 0;
+  while (true) {
+    int status;
+    pid_t result;
+    while ((result = TEMP_FAILURE_RETRY(waitpid(-1, &status, WNOHANG))) > 0) {
+      if (CheckChildProcExit(result, status, child_proc_list)) {
+        ++finished_child_count;
       }
     }
 
     if (result == -1) {
-      perror("waitpid");
-      exit(1);
+      if (errno == ECHILD) {
+        // This happens when we have no running child processes.
+        return;
+      } else {
+        perror("waitpid");
+        exit(1);
+      }
     } else if (result == 0) {
-      // Check child timeout.
-      int64_t current_time_ns = NanoTime();
-      for (size_t i = 0; i < child_proc_list.size(); ++i) {
-        if (child_proc_list[i].deadline_time_ns <= current_time_ns) {
-          child_proc_list[i].done_flag = true;
-          child_proc_list[i].timeout_flag = true;
-          loop_flag = false;
-        }
-      }
-    } else {
-      // Check child finish.
-      for (size_t i = 0; i < child_proc_list.size(); ++i) {
-        if (child_proc_list[i].pid == result) {
-          child_proc_list[i].done_flag = true;
-          child_proc_list[i].timeout_flag = false;
-          child_proc_list[i].exit_status = status;
-          loop_flag = false;
-          break;
-        }
-      }
+      finished_child_count += CheckChildProcTimeout(child_proc_list);
     }
 
-    if (!loop_flag) break;
+    if (finished_child_count > 0) {
+      return;
+    }
+
+    HandleSignals(testcase_list, child_proc_list);
+
     // sleep 1 ms to avoid busy looping.
     timespec sleep_time;
     sleep_time.tv_sec = 0;
@@ -564,15 +667,9 @@
   }
 }
 
-static TestResult WaitChildProc(pid_t pid) {
-  pid_t result;
+static TestResult WaitForOneChild(pid_t pid) {
   int exit_status;
-
-  while ((result = waitpid(pid, &exit_status, 0)) == -1) {
-    if (errno != EINTR) {
-      break;
-    }
-  }
+  pid_t result = TEMP_FAILURE_RETRY(waitpid(pid, &exit_status, 0));
 
   TestResult test_result = TEST_SUCCESS;
   if (result != pid || WEXITSTATUS(exit_status) != 0) {
@@ -581,6 +678,57 @@
   return test_result;
 }
 
+static void CollectChildTestResult(const ChildProcInfo& child_proc, TestCase& testcase) {
+  int test_id = child_proc.test_id;
+  testcase.SetTestTime(test_id, child_proc.end_time_ns - child_proc.start_time_ns);
+  if (child_proc.timed_out) {
+    // The child process marked as timed_out has not exited, and we should kill it manually.
+    kill(child_proc.pid, SIGKILL);
+    WaitForOneChild(child_proc.pid);
+  }
+
+  while (true) {
+    char buf[1024];
+    ssize_t bytes_read = TEMP_FAILURE_RETRY(read(child_proc.child_read_fd, buf, sizeof(buf) - 1));
+    if (bytes_read > 0) {
+      buf[bytes_read] = '\0';
+      testcase.GetTest(test_id).AppendFailureMessage(buf);
+    } else if (bytes_read == 0) {
+      break; // Read end.
+    } else {
+      if (errno == EAGAIN) {
+        // No data is available. This rarely happens, and only when the child process spawned
+        // other processes which have not yet exited. The child itself has already exited or
+        // been killed, so the test has finished and we shouldn't wait any further.
+        break;
+      }
+      perror("read child_read_fd in RunTestInSeparateProc");
+      exit(1);
+    }
+  }
+  close(child_proc.child_read_fd);
+
+  if (child_proc.timed_out) {
+    testcase.SetTestResult(test_id, TEST_TIMEOUT);
+    char buf[1024];
+    snprintf(buf, sizeof(buf), "%s killed because of timeout at %" PRId64 " ms.\n",
+             testcase.GetTestName(test_id).c_str(), testcase.GetTestTime(test_id) / 1000000);
+    testcase.GetTest(test_id).AppendFailureMessage(buf);
+
+  } else if (WIFSIGNALED(child_proc.exit_status)) {
+    // Record signal terminated test as failed.
+    testcase.SetTestResult(test_id, TEST_FAILED);
+    char buf[1024];
+    snprintf(buf, sizeof(buf), "%s terminated by signal: %s.\n",
+             testcase.GetTestName(test_id).c_str(), strsignal(WTERMSIG(child_proc.exit_status)));
+    testcase.GetTest(test_id).AppendFailureMessage(buf);
+
+  } else {
+    testcase.SetTestResult(test_id, WEXITSTATUS(child_proc.exit_status) == 0 ?
+                           TEST_SUCCESS : TEST_FAILED);
+  }
+}
+
 // We choose to use multi-fork and multi-wait here instead of multi-thread, because calling
 // fork in a multi-threaded program can easily lead to deadlock.
 static void RunTestInSeparateProc(int argc, char** argv, std::vector<TestCase>& testcase_list,
@@ -591,13 +739,23 @@
                         testing::UnitTest::GetInstance()->listeners().default_result_printer());
   testing::UnitTest::GetInstance()->listeners().Append(new TestResultPrinter);
 
+  // Signals are blocked here as we want to handle them in HandleSignals() later.
+  sigset_t block_mask, orig_mask;
+  sigemptyset(&block_mask);
+  sigaddset(&block_mask, SIGINT);
+  sigaddset(&block_mask, SIGQUIT);
+  if (sigprocmask(SIG_BLOCK, &block_mask, &orig_mask) == -1) {
+    perror("sigprocmask SIG_BLOCK");
+    exit(1);
+  }
+
   for (size_t iteration = 1; iteration <= iteration_count; ++iteration) {
     OnTestIterationStartPrint(testcase_list, iteration, iteration_count);
     int64_t iteration_start_time_ns = NanoTime();
     time_t epoch_iteration_start_time = time(NULL);
 
     // Run up to job_count tests in parallel, each test in a child process.
-    std::vector<ChildProcInfo> child_proc_list(job_count);
+    std::vector<ChildProcInfo> child_proc_list;
 
     // Next test to run is [next_testcase_id:next_test_id].
     size_t next_testcase_id = 0;
@@ -608,103 +766,40 @@
     size_t finished_testcase_count = 0;
 
     while (finished_testcase_count < testcase_list.size()) {
-      // Fork up to job_count child processes.
-      for (auto& child_proc : child_proc_list) {
-        if (child_proc.pid == 0 && next_testcase_id < testcase_list.size()) {
-          std::string test_name = testcase_list[next_testcase_id].GetTestName(next_test_id);
-          int pipefd[2];
-          int ret = pipe(pipefd);
-          if (ret == -1) {
-            perror("pipe2 in RunTestInSeparateProc");
-            exit(1);
-          }
-          pid_t pid = fork();
-          if (pid == -1) {
-            perror("fork in RunTestInSeparateProc");
-            exit(1);
-          } else if (pid == 0) {
-            close(pipefd[0]);
-            child_output_fd = pipefd[1];
-            // Run child process test, never return.
-            ChildProcessFn(argc, argv, test_name);
-          }
-          // Parent process
-          close(pipefd[1]);
-          child_proc.child_read_fd = pipefd[0];
-          child_proc.pid = pid;
-          child_proc.start_time_ns = NanoTime();
-          child_proc.deadline_time_ns = child_proc.start_time_ns +
-                                        GetDeadlineInfo(test_name) * 1000000LL;
-          child_proc.testcase_id = next_testcase_id;
-          child_proc.test_id = next_test_id;
-          child_proc.done_flag = false;
-          if (++next_test_id == testcase_list[next_testcase_id].TestCount()) {
-            next_test_id = 0;
-            ++next_testcase_id;
-          }
+      // Run up to job_count child processes.
+      while (child_proc_list.size() < job_count && next_testcase_id < testcase_list.size()) {
+        std::string test_name = testcase_list[next_testcase_id].GetTestName(next_test_id);
+        ChildProcInfo child_proc = RunChildProcess(test_name, next_testcase_id, next_test_id,
+                                                   orig_mask, argc, argv);
+        child_proc_list.push_back(child_proc);
+        if (++next_test_id == testcase_list[next_testcase_id].TestCount()) {
+          next_test_id = 0;
+          ++next_testcase_id;
         }
       }
 
       // Wait for any child proc finish or timeout.
-      WaitChildProcs(child_proc_list);
+      WaitChildProcs(testcase_list, child_proc_list);
 
       // Collect result.
-      for (auto& child_proc : child_proc_list) {
-        if (child_proc.pid != 0 && child_proc.done_flag == true) {
+      auto it = child_proc_list.begin();
+      while (it != child_proc_list.end()) {
+        auto& child_proc = *it;
+        if (child_proc.finished == true) {
           size_t testcase_id = child_proc.testcase_id;
           size_t test_id = child_proc.test_id;
           TestCase& testcase = testcase_list[testcase_id];
-          testcase.SetTestTime(test_id, NanoTime() - child_proc.start_time_ns);
 
-          // Kill and wait the timeout child process before we read failure message.
-          if (child_proc.timeout_flag) {
-            kill(child_proc.pid, SIGKILL);
-            WaitChildProc(child_proc.pid);
-          }
-
-          while (true) {
-            char buf[1024];
-            int ret = TEMP_FAILURE_RETRY(read(child_proc.child_read_fd, buf, sizeof(buf) - 1));
-            if (ret > 0) {
-              buf[ret] = '\0';
-              testcase.GetTest(test_id).AppendFailureMessage(buf);
-            } else if (ret == 0) {
-              break; // Read end.
-            } else {
-              perror("read child_read_fd in RunTestInSeparateProc");
-              exit(1);
-            }
-          }
-          close(child_proc.child_read_fd);
-
-          if (child_proc.timeout_flag) {
-            testcase.SetTestResult(test_id, TEST_TIMEOUT);
-            char buf[1024];
-            snprintf(buf, sizeof(buf), "%s killed because of timeout at %" PRId64 " ms.\n",
-                     testcase.GetTestName(test_id).c_str(),
-                     testcase.GetTestTime(test_id) / 1000000);
-            testcase.GetTest(test_id).AppendFailureMessage(buf);
-
-          } else if (WIFSIGNALED(child_proc.exit_status)) {
-            // Record signal terminated test as failed.
-            testcase.SetTestResult(test_id, TEST_FAILED);
-            char buf[1024];
-            snprintf(buf, sizeof(buf), "%s terminated by signal: %s.\n",
-                     testcase.GetTestName(test_id).c_str(),
-                     strsignal(WTERMSIG(child_proc.exit_status)));
-            testcase.GetTest(test_id).AppendFailureMessage(buf);
-
-          } else {
-            testcase.SetTestResult(test_id, WEXITSTATUS(child_proc.exit_status) == 0 ?
-                                   TEST_SUCCESS : TEST_FAILED);
-          }
+          CollectChildTestResult(child_proc, testcase);
           OnTestEndPrint(testcase, test_id);
 
           if (++finished_test_count_list[testcase_id] == testcase.TestCount()) {
             ++finished_testcase_count;
           }
-          child_proc.pid = 0;
-          child_proc.done_flag = false;
+
+          it = child_proc_list.erase(it);
+        } else {
+          ++it;
         }
       }
     }
@@ -716,12 +811,36 @@
                                  elapsed_time_ns);
     }
   }
+
+  // Restore signal mask.
+  if (sigprocmask(SIG_SETMASK, &orig_mask, NULL) == -1) {
+    perror("sigprocmask SIG_SETMASK");
+    exit(1);
+  }
 }
 
 static size_t GetProcessorCount() {
   return static_cast<size_t>(sysconf(_SC_NPROCESSORS_ONLN));
 }
 
+static void AddPathSeparatorInTestProgramPath(std::vector<char*>& args) {
+  // To run a DeathTest in threadsafe mode, gtest requires the test program to be
+  // invoked via a path that contains at least one path separator. The reason is
+  // that gtest uses clone() + execve() to run the DeathTest, and execve() doesn't
+  // search the PATH environment variable, so it will not succeed unless we pass
+  // an absolute or relative path to the test program directly.
+  if (strchr(args[0], '/') == NULL) {
+    char path[PATH_MAX];
+    ssize_t path_len = readlink("/proc/self/exe", path, sizeof(path));
+    if (path_len <= 0 || path_len >= static_cast<ssize_t>(sizeof(path))) {
+      perror("readlink");
+      exit(1);
+    }
+    path[path_len] = '\0';
+    args[0] = strdup(path);
+  }
+}
+
 static void AddGtestFilterSynonym(std::vector<char*>& args) {
   // Support --gtest-filter as a synonym for --gtest_filter.
   for (size_t i = 1; i < args.size(); ++i) {
@@ -759,6 +878,7 @@
     }
   }
 
+  AddPathSeparatorInTestProgramPath(args);
   AddGtestFilterSynonym(args);
 
   // if --bionic-selftest argument is used, only enable self tests, otherwise remove self tests.
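
The execve() behavior that AddPathSeparatorInTestProgramPath works around is the difference between the PATH-searching and non-PATH-searching exec variants. A minimal standalone sketch of that difference ("ls" is just an example program name):

#include <unistd.h>

int main() {
  char* const args[] = { const_cast<char*>("ls"), nullptr };
  // execv(), like the execve() gtest uses, does not search $PATH: a bare
  // name like "ls" is treated as a path relative to the current
  // directory, so this usually fails.
  execv("ls", args);
  // execvp() does search $PATH, so a bare program name is found.
  execvp("ls", args);
  return 1;  // Reached only if both exec calls failed.
}
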
diff --git a/tests/pthread_test.cpp b/tests/pthread_test.cpp
index cb32079..5dc60ee 100644
--- a/tests/pthread_test.cpp
+++ b/tests/pthread_test.cpp
@@ -27,6 +27,7 @@
 #include <malloc.h>
 #include <pthread.h>
 #include <signal.h>
+#include <stdio.h>
 #include <sys/mman.h>
 #include <sys/syscall.h>
 #include <time.h>
@@ -1092,3 +1093,14 @@
   ASSERT_EQ(EPERM, pthread_mutex_unlock(&lock));
   ASSERT_EQ(0, pthread_mutex_destroy(&lock));
 }
+
+TEST(pthread, pthread_mutex_owner_tid_limit) {
+  FILE* fp = fopen("/proc/sys/kernel/pid_max", "r");
+  ASSERT_TRUE(fp != NULL);
+  long pid_max;
+  ASSERT_EQ(1, fscanf(fp, "%ld", &pid_max));
+  fclose(fp);
+  // The current pthread_mutex implementation uses 16 bits to represent the owner tid.
+  // Change the implementation if we need to support tids higher than 65535.
+  ASSERT_LE(pid_max, 65536);
+}
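
For context on the new test: /proc/sys/kernel/pid_max is an exclusive upper bound (the kernel wraps before reaching it), which is why the assertion allows 65536 while the comment talks about 65535. The sketch below shows the underlying constraint with a hypothetical bit layout; the actual layout of bionic's mutex word is not part of this diff:

#include <cassert>
#include <cstdint>

// Hypothetical layout: owner tid stored in the upper 16 bits of a
// 32-bit mutex word (illustrative only).
constexpr uint32_t kOwnerShift = 16;
constexpr uint32_t kOwnerMask = 0xffff0000;

uint32_t SetOwner(uint32_t word, uint32_t tid) {
  assert(tid <= 0xffff);  // tids above 65535 don't fit in 16 bits
  return (word & ~kOwnerMask) | (tid << kOwnerShift);
}

uint32_t GetOwner(uint32_t word) {
  return (word & kOwnerMask) >> kOwnerShift;
}
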
diff --git a/tests/sys_stat_test.cpp b/tests/sys_stat_test.cpp
index e465774..7bbb7c6 100644
--- a/tests/sys_stat_test.cpp
+++ b/tests/sys_stat_test.cpp
@@ -95,3 +95,127 @@
   ASSERT_EQ(0, fstat64(fd, &sb));
   close(fd);
 }
+
+TEST(sys_stat, fchmodat_EFAULT_file) {
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, (char *) 0x1, 0751, 0));
+  ASSERT_EQ(EFAULT, errno);
+}
+
+TEST(sys_stat, fchmodat_AT_SYMLINK_NOFOLLOW_EFAULT_file) {
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, (char *) 0x1, 0751, AT_SYMLINK_NOFOLLOW));
+#if defined(__BIONIC__)
+  ASSERT_EQ(EFAULT, errno);
+#else
+  // glibc 2.19 does not implement AT_SYMLINK_NOFOLLOW and always
+  // returns ENOTSUP
+  ASSERT_EQ(ENOTSUP, errno);
+#endif
+}
+
+TEST(sys_stat, fchmodat_bad_flags) {
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, "/blah", 0751, ~AT_SYMLINK_NOFOLLOW));
+  ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(sys_stat, fchmodat_bad_flags_ALL) {
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, "/blah", 0751, ~0));
+  ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(sys_stat, fchmodat_nonexistent_file) {
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, "/blah", 0751, 0));
+  ASSERT_EQ(ENOENT, errno);
+}
+
+TEST(sys_stat, fchmodat_AT_SYMLINK_NOFOLLOW_nonexistent_file) {
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, "/blah", 0751, AT_SYMLINK_NOFOLLOW));
+#if defined(__BIONIC__)
+  ASSERT_EQ(ENOENT, errno);
+#else
+  // glibc 2.19 does not implement AT_SYMLINK_NOFOLLOW and always
+  // returns ENOTSUP
+  ASSERT_EQ(ENOTSUP, errno);
+#endif
+}
+
+TEST(sys_stat, fchmodat_file) {
+  TemporaryFile tf;
+  struct stat sb;
+
+  ASSERT_EQ(0, fchmodat(AT_FDCWD, tf.filename, 0751, 0));
+  ASSERT_EQ(0, fstat(tf.fd, &sb));
+  ASSERT_TRUE(0751 == (sb.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO)));
+}
+
+TEST(sys_stat, fchmodat_AT_SYMLINK_NOFOLLOW_file) {
+  TemporaryFile tf;
+  errno = 0;
+  int result = fchmodat(AT_FDCWD, tf.filename, 0751, AT_SYMLINK_NOFOLLOW);
+
+#if defined(__BIONIC__)
+  struct stat sb;
+  ASSERT_EQ(0, result);
+  ASSERT_EQ(0, errno);
+  ASSERT_EQ(0, fstat(tf.fd, &sb));
+  ASSERT_TRUE(0751 == (sb.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO)));
+#else
+  // glibc 2.19 does not implement AT_SYMLINK_NOFOLLOW and always
+  // returns ENOTSUP
+  ASSERT_EQ(-1, result);
+  ASSERT_EQ(ENOTSUP, errno);
+#endif
+}
+
+TEST(sys_stat, fchmodat_symlink) {
+  TemporaryFile tf;
+  char linkname[255];
+  struct stat sb;
+
+  snprintf(linkname, sizeof(linkname), "%s.link", tf.filename);
+
+  ASSERT_EQ(0, symlink(tf.filename, linkname));
+  ASSERT_EQ(0, fchmodat(AT_FDCWD, linkname, 0751, 0));
+  ASSERT_EQ(0, fstat(tf.fd, &sb));
+  ASSERT_TRUE(0751 == (sb.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO)));
+  unlink(linkname);
+}
+
+TEST(sys_stat, fchmodat_dangling_symlink) {
+  TemporaryFile tf;
+  char linkname[255];
+  char target[255];
+
+  snprintf(linkname, sizeof(linkname), "%s.link", tf.filename);
+  snprintf(target, sizeof(target), "%s.doesnotexist", tf.filename);
+
+  ASSERT_EQ(0, symlink(target, linkname));
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, linkname, 0751, 0));
+  ASSERT_EQ(ENOENT, errno);
+  unlink(linkname);
+}
+
+TEST(sys_stat, fchmodat_AT_SYMLINK_NOFOLLOW_with_symlink) {
+  TemporaryFile tf;
+  char linkname[255];
+
+  snprintf(linkname, sizeof(linkname), "%s.link", tf.filename);
+
+  ASSERT_EQ(0, symlink(tf.filename, linkname));
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, linkname, 0751, AT_SYMLINK_NOFOLLOW));
+  ASSERT_EQ(ENOTSUP, errno);
+  unlink(linkname);
+}
+
+TEST(sys_stat, fchmodat_AT_SYMLINK_NOFOLLOW_with_dangling_symlink) {
+  TemporaryFile tf;
+  char linkname[255];
+  char target[255];
+
+  snprintf(linkname, sizeof(linkname), "%s.link", tf.filename);
+  snprintf(target, sizeof(target), "%s.doesnotexist", tf.filename);
+
+  ASSERT_EQ(0, symlink(target, linkname));
+  ASSERT_EQ(-1, fchmodat(AT_FDCWD, linkname, 0751, AT_SYMLINK_NOFOLLOW));
+  ASSERT_EQ(ENOTSUP, errno);
+  unlink(linkname);
+}
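
The ENOTSUP results that the *_with_symlink tests expect hint at how the AT_SYMLINK_NOFOLLOW emulation can work: open the symlink itself with O_PATH|O_NOFOLLOW, then chmod it through /proc/self/fd. A minimal sketch of that technique follows; whether it matches the new bionic/fchmodat.cpp line for line is not shown in this diff:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int fchmodat_nofollow(int dirfd, const char* path, mode_t mode) {
  // O_PATH + O_NOFOLLOW yields a file descriptor for the symlink
  // itself rather than for its target.
  int fd = openat(dirfd, path, O_PATH | O_NOFOLLOW | O_CLOEXEC);
  if (fd == -1) return -1;

  char proc_path[40];
  snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", fd);
  // chmod() through the proc path reaches the opened inode. For a real
  // symlink the kernel refuses to change the mode and fails with
  // ENOTSUP, matching the *_with_symlink tests above.
  int result = chmod(proc_path, mode);

  int saved_errno = errno;
  close(fd);
  errno = saved_errno;  // don't let close() clobber chmod()'s errno
  return result;
}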