diff --git a/benchmarks/string_benchmark.cpp b/benchmarks/string_benchmark.cpp
index 38122f2..d176675 100644
--- a/benchmarks/string_benchmark.cpp
+++ b/benchmarks/string_benchmark.cpp
@@ -248,6 +248,25 @@
 }
 BIONIC_BENCHMARK_WITH_ARG(BM_string_strcmp, "AT_ALIGNED_TWOBUF");
 
+static void BM_string_strncmp(benchmark::State& state) {
+  const size_t nbytes = state.range(0);  // buffer length, also the strncmp() limit
+  const size_t s1_alignment = state.range(1);
+  const size_t s2_alignment = state.range(2);
+  // Both buffers are requested with the same length and fill byte ('x').
+  std::vector<char> s1;
+  std::vector<char> s2;
+  char* s1_aligned = GetAlignedPtrFilled(&s1, s1_alignment, nbytes, 'x');
+  char* s2_aligned = GetAlignedPtrFilled(&s2, s2_alignment, nbytes, 'x');
+  // Volatile sink: every strncmp() result is stored to an observable location.
+  volatile int c __attribute__((unused));
+  for (auto _ : state) {
+    c = strncmp(s1_aligned, s2_aligned, nbytes);
+  }
+  // Report throughput as nbytes per benchmark iteration.
+  state.SetBytesProcessed(uint64_t(state.iterations()) * uint64_t(nbytes));
+}
+BIONIC_BENCHMARK_WITH_ARG(BM_string_strncmp, "AT_ALIGNED_TWOBUF");
+
 static void BM_string_strstr(benchmark::State& state) {
   const size_t nbytes = state.range(0);
   const size_t haystack_alignment = state.range(1);
diff --git a/libc/Android.bp b/libc/Android.bp
index 7356c64..61d00cd 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -71,7 +71,8 @@
     stl: "none",
     system_shared_libs: [],
     sanitize: {
-        never: true,
+        address: false,
+        integer_overflow: false,
     },
     native_coverage: false,
     recovery_available: true,
diff --git a/libc/arch-arm64/generic/bionic/memcmp.S b/libc/arch-arm64/generic/bionic/memcmp.S
index 3a138bf..bff54ae 100644
--- a/libc/arch-arm64/generic/bionic/memcmp.S
+++ b/libc/arch-arm64/generic/bionic/memcmp.S
@@ -33,6 +33,8 @@
 
 #include <private/bionic_asm.h>
 
+#define L(l) .L ## l
+
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
@@ -42,88 +44,124 @@
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define tmp1		x5
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
 
 /* Small inputs of less than 8 bytes are handled separately.  This allows the
-   main code to be sped up using unaligned loads since there are now at least
+   main code to be sped up using unaligned loads since there are now at least
    8 bytes to be compared.  If the first 8 bytes are equal, align src1.
    This ensures each iteration does at most one unaligned access even if both
    src1 and src2 are unaligned, and mutually aligned inputs behave as if
-   aligned.  After the main loop, process the last 8 bytes using unaligned
+   aligned.  After the main loop, process the last 16 bytes using unaligned
    accesses.  */
 
-.p2align 6
 ENTRY(memcmp)
+.p2align 6
 	subs	limit, limit, 8
-	b.lo	.Lless8
+	b.lo	L(less8)
 
 	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
 	ldr	data1, [src1], 8
 	ldr	data2, [src2], 8
-	and	tmp1, src1, 7
-	add	limit, limit, tmp1
 	cmp	data1, data2
-	bne	.Lreturn
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)
 
 	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
 	sub	src1, src1, tmp1
 	sub	src2, src2, tmp1
 
-	subs	limit, limit, 8
-	b.ls	.Llast_bytes
-
-	/* Loop performing 8 bytes per iteration using aligned src1.
-	   Limit is pre-decremented by 8 and must be larger than zero.
-	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
 	.p2align 4
-.Lloop8:
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	subs	limit, limit, 8
-	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
-	b.eq	.Lloop8
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)
 
 	cmp	data1, data2
-	bne	.Lreturn
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)
 
-	/* Compare last 1-8 bytes using unaligned access.  */
-.Llast_bytes:
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp     data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
-.Lreturn:
+L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
 	cmp     data1, data2
-.Lret_eq:
+L(ret_eq):
 	cset	result, ne
 	cneg	result, result, lo
-        ret
+	ret
 
 	.p2align 4
 	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
-.Lless8:
+L(less8):
 	adds	limit, limit, 4
-	b.lo	.Lless4
+	b.lo	L(less4)
 	ldr	data1w, [src1], 4
 	ldr	data2w, [src2], 4
 	cmp	data1w, data2w
-	b.ne	.Lreturn
+	b.ne	L(return)
 	sub	limit, limit, 4
-.Lless4:
+L(less4):
 	adds	limit, limit, 4
-	beq	.Lret_eq
-.Lbyte_loop:
+	beq	L(ret_eq)
+L(byte_loop):
 	ldrb	data1w, [src1], 1
 	ldrb	data2w, [src2], 1
 	subs	limit, limit, 1
 	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	.Lbyte_loop
+	b.eq	L(byte_loop)
 	sub	result, data1w, data2w
 	ret
+
 END(memcmp)
diff --git a/libc/arch-arm64/generic/bionic/strcmp.S b/libc/arch-arm64/generic/bionic/strcmp.S
index 271452d..fbc215e 100644
--- a/libc/arch-arm64/generic/bionic/strcmp.S
+++ b/libc/arch-arm64/generic/bionic/strcmp.S
@@ -32,6 +32,8 @@
 
 #include <private/bionic_asm.h>
 
+#define L(label) .L ## label
+
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
@@ -61,24 +63,25 @@
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
-	b.ne	.Lmisaligned8
+	b.ne	L(misaligned8)
 	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
+	b.ne	L(mutual_align)
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 	   can be done in parallel across the entire word.  */
-.Lloop_aligned:
+L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
 	eor	diff, data1, data2	/* Non-zero if differences found.  */
 	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	orr	syndrome, diff, has_nul
-	cbz	syndrome, .Lloop_aligned
+	cbz	syndrome, L(loop_aligned)
 	/* End of performance-critical section  -- one 64B cache line.  */
 
+L(end):
 #ifndef	__AARCH64EB__
 	rev	syndrome, syndrome
 	rev	data1, data1
@@ -129,7 +132,7 @@
 	ret
 #endif
 
-.Lmutual_align:
+L(mutual_align):
 	/* Sources are mutually aligned, but are not currently at an
 	   alignment boundary.  Round down the addresses and then mask off
 	   the bytes that preceed the start point.  */
@@ -149,15 +152,41 @@
 #endif
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
-	b	.Lstart_realigned
+	b	L(start_realigned)
 
-.Lmisaligned8:
-	/* We can do better than this.  */
+L(misaligned8):
+	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+	   checking to make sure that we don't access beyond page boundary in
+	   SRC2.  */
+	tst	src1, #7
+	b.eq	L(loop_misaligned)
+L(do_misaligned):
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	cmp	data1w, #1
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	.Lmisaligned8
+	b.ne	L(done)
+	tst	src1, #7
+	b.ne	L(do_misaligned)
+
+L(loop_misaligned):
+	/* Test if we are within the last dword of the end of a 4K page.  If so,
+	   jump back to the misaligned loop to compare a byte at a time.  */
+	and	tmp1, src2, #0xff8
+	eor	tmp1, tmp1, #0xff8
+	cbz	tmp1, L(do_misaligned)
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, L(loop_misaligned)
+	b	L(end)
+
+L(done):
 	sub	result, data1, data2
 	ret
 END(strcmp)
diff --git a/libc/arch-arm64/generic/bionic/strncmp.S b/libc/arch-arm64/generic/bionic/strncmp.S
index 267f663..b81f43a 100644
--- a/libc/arch-arm64/generic/bionic/strncmp.S
+++ b/libc/arch-arm64/generic/bionic/strncmp.S
@@ -58,6 +58,7 @@
 #define limit_wd	x13
 #define mask		x14
 #define endloop		x15
+#define count		mask
 
 	.text
 	.p2align 6
@@ -69,9 +70,9 @@
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
+	and	count, src1, #7
 	b.ne	.Lmisaligned8
-	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
+	cbnz	count, .Lmutual_align
 	/* Calculate the number of full and partial words -1.  */
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
@@ -176,42 +177,104 @@
 	bic	src1, src1, #7
 	bic	src2, src2, #7
 	ldr	data1, [src1], #8
-	neg	tmp3, tmp1, lsl #3	/* 64 - bits(bytes beyond align). */
+	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 #ifdef __AARCH64EB__
 	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #endif
 	and	tmp3, limit_wd, #7
 	lsr	limit_wd, limit_wd, #3
 	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, tmp1
-	add	tmp3, tmp3, tmp1
+	add	limit, limit, count
+	add	tmp3, tmp3, count
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	.Lstart_realigned
 
-.Lret0:
-	mov	result, #0
-	ret
-
 	.p2align 6
+	/* Don't bother with dwords for up to 16 bytes.  */
 .Lmisaligned8:
-	sub	limit, limit, #1
-1:
+	cmp	limit, #16
+	b.hs	.Ltry_misaligned_words
+
+.Lbyte_loop:
 	/* Perhaps we can do better than this.  */
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	subs	limit, limit, #1
-	ccmp	data1w, #1, #0, cs	/* NZCV = 0b0000.  */
+	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
+	b.eq	.Lbyte_loop
+.Ldone:
 	sub	result, data1, data2
 	ret
+	/* Align the SRC1 to a dword by doing a bytewise compare and then do
+	   the dword loop.  */
+.Ltry_misaligned_words:
+	lsr	limit_wd, limit, #3
+	cbz	count, .Ldo_misaligned
+
+	neg	count, count
+	and	count, count, #7
+	sub	limit, limit, count
+	lsr	limit_wd, limit, #3
+
+.Lpage_end_loop:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	.Ldone
+	subs	count, count, #1
+	b.hi	.Lpage_end_loop
+
+.Ldo_misaligned:
+	/* Prepare ourselves for the next page crossing.  Unlike the aligned
+	   loop, we fetch 1 less dword because we risk crossing bounds on
+	   SRC2.  */
+	mov	count, #8
+	subs	limit_wd, limit_wd, #1
+	b.lo	.Ldone_loop
+.Lloop_misaligned:
+	and	tmp2, src2, #0xff8
+	eor	tmp2, tmp2, #0xff8
+	cbz	tmp2, .Lpage_end_loop
+
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	.Lnot_limit
+	subs	limit_wd, limit_wd, #1
+	b.pl	.Lloop_misaligned
+
+.Ldone_loop:
+	/* Dword count exhausted, nothing found yet; check any tail bytes.  */
+	and	limit, limit, #7
+	cbz	limit, .Lnot_limit
+	/* Read the last word.  */
+	sub	src1, src1, 8
+	sub	src2, src2, 8
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	.Lnot_limit
+
+.Lret0:
+	mov	result, #0
+	ret
 END(strncmp)
diff --git a/libc/bionic/libc_init_static.cpp b/libc/bionic/libc_init_static.cpp
index 9eb574a..55506a3 100644
--- a/libc/bionic/libc_init_static.cpp
+++ b/libc/bionic/libc_init_static.cpp
@@ -45,6 +45,10 @@
 #include "private/bionic_tls.h"
 #include "private/KernelArgumentBlock.h"
 
+#if __has_feature(hwaddress_sanitizer)
+#include <sanitizer/hwasan_interface.h>
+#endif
+
 // Leave the variable uninitialized for the sake of the dynamic loader, which
 // links in this file. The loader will initialize this variable before
 // relocating itself.
@@ -85,11 +89,10 @@
 //
 // The 'structors' parameter contains pointers to various initializer
 // arrays that must be run before the program's 'main' routine is launched.
-
-__noreturn void __libc_init(void* raw_args,
-                            void (*onexit)(void) __unused,
-                            int (*slingshot)(int, char**, char**),
-                            structors_array_t const * const structors) {
+__noreturn static void __real_libc_init(void *raw_args,
+                                        void (*onexit)(void) __unused,
+                                        int (*slingshot)(int, char**, char**),
+                                        structors_array_t const * const structors) {
   BIONIC_STOP_UNWIND;
 
   KernelArgumentBlock args(raw_args);
@@ -124,6 +127,20 @@
   exit(slingshot(args.argc, args.argv, args.envp));
 }
 
+#if __has_feature(hwaddress_sanitizer)
+__attribute__((no_sanitize("hwaddress")))  // this wrapper itself is not instrumented
+#endif
+__noreturn void __libc_init(void* raw_args,
+                            void (*onexit)(void) __unused,
+                            int (*slingshot)(int, char**, char**),
+                            structors_array_t const * const structors) {
+#if __has_feature(hwaddress_sanitizer)
+  __hwasan_shadow_init();  // called before handing off to __real_libc_init()
+#endif
+  __real_libc_init(raw_args, onexit, slingshot, structors);  // declared __noreturn
+}
+
+
 static uint32_t g_target_sdk_version{__ANDROID_API__};
 
 extern "C" uint32_t android_get_application_target_sdk_version() {
diff --git a/libc/bionic/malloc_common.cpp b/libc/bionic/malloc_common.cpp
index 40a0023..5a5ec76 100644
--- a/libc/bionic/malloc_common.cpp
+++ b/libc/bionic/malloc_common.cpp
@@ -47,8 +47,26 @@
 #include <private/bionic_globals.h>
 #include <private/bionic_malloc_dispatch.h>
 
+#if __has_feature(hwaddress_sanitizer)
+// FIXME: implement these in HWASan allocator. Until then they are no-op stubs.
+extern "C" int __sanitizer_iterate(uintptr_t base __unused, size_t size __unused,
+                                   void (*callback)(uintptr_t base, size_t size, void* arg) __unused,
+                                   void* arg __unused) {
+  return 0;  // stub: the callback is never invoked
+}
+
+extern "C" void __sanitizer_malloc_disable() {
+}
+
+extern "C" void __sanitizer_malloc_enable() {
+}
+#include <sanitizer/hwasan_interface.h>
+#define Malloc(function)  __sanitizer_ ## function
+
+#else // __has_feature(hwaddress_sanitizer)
 #include "jemalloc.h"
 #define Malloc(function)  je_ ## function
+#endif
 
 static constexpr MallocDispatch __libc_malloc_default_dispatch
   __attribute__((unused)) = {
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index c95d400..98d1726 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -249,6 +249,8 @@
   // accesses previously made by the creating thread are visible to us.
   thread->startup_handshake_lock.lock();
 
+  __hwasan_thread_enter();
+
   __init_alternate_signal_stack(thread);
 
   void* result = thread->start_routine(thread->start_routine_arg);
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index ac5d429..220f7a0 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -126,6 +126,7 @@
       // That's one last thing we can do before dropping to assembler.
       ScopedSignalBlocker ssb;
       __pthread_unmap_tls(thread);
+      __hwasan_thread_exit();
       _exit_with_stack_teardown(thread->attr.stack_base, thread->mmap_size);
     }
   }
@@ -133,5 +134,6 @@
   // No need to free mapped space. Either there was no space mapped, or it is left for
   // the pthread_join caller to clean up.
   __pthread_unmap_tls(thread);
+  __hwasan_thread_exit();
   __exit(0);
 }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 18f5aee..1ec201b 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -31,6 +31,13 @@
 #include <pthread.h>
 #include <stdatomic.h>
 
+#if __has_feature(hwaddress_sanitizer)
+#include <sanitizer/hwasan_interface.h>
+#else
+#define __hwasan_thread_enter()
+#define __hwasan_thread_exit()
+#endif
+
 #include "private/bionic_lock.h"
 #include "private/bionic_tls.h"
 
diff --git a/libm/Android.bp b/libm/Android.bp
index da13ab1..3b88fa3 100644
--- a/libm/Android.bp
+++ b/libm/Android.bp
@@ -493,6 +493,7 @@
         "-D_BSD_SOURCE",
         "-DFLT_EVAL_METHOD=0",
         "-include freebsd-compat.h",
+        "-fno-math-errno",
         "-Wall",
         "-Werror",
         "-Wno-missing-braces",
@@ -503,6 +504,10 @@
         "-Wno-unused-variable",
     ],
 
+    ldflags: [
+        "-Wl,--Bsymbolic-functions",
+    ],
+
     include_dirs: ["bionic/libc"],
     system_shared_libs: ["libc"],
 
diff --git a/linker/Android.bp b/linker/Android.bp
index fb6aa7d..b809f76 100644
--- a/linker/Android.bp
+++ b/linker/Android.bp
@@ -228,6 +228,10 @@
     // Insert an extra objcopy step to add prefix to symbols. This is needed to prevent gdb
     // looking up symbols in the linker by mistake.
     prefix_symbols: "__dl_",
+
+    sanitize: {
+        hwaddress: false,
+    },
 }
 
 cc_library {
