Mandate optimized __memset_chk for arm and arm64.

This involves actually implementing an assembler __memset_chk for arm64,
but that's easily done.

Obviously I'd like this for all architectures (and all the string functions),
but this is low-hanging fruit...
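
For the record, the contract the new arm64 assembler implements is trivial:
branch straight into memset when the requested size fits in the destination
buffer, die otherwise. A rough C sketch follows; the name
__memset_chk_reference is made up for illustration, and the exact C signature
of bionic's internal __memset_chk_fail is an assumption here.

    #include <stddef.h>
    #include <string.h>

    /* bionic's internal overflow handler; the noreturn/void signature is an
     * assumption for this sketch. */
    extern void __memset_chk_fail(void) __attribute__((noreturn));

    /* Hypothetical C equivalent of the assembler __memset_chk added below:
     * dst_len is the destination buffer size the compiler passed in. */
    void* __memset_chk_reference(void* dst, int byte, size_t count, size_t dst_len) {
      if (count > dst_len) {
        __memset_chk_fail();            /* overflow detected: abort */
      }
      return memset(dst, byte, count);  /* safe: fall through to memset */
    }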

Change-Id: I70ec48c91aafd1f0feb974a2555c51611de9ef82
diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S
index 7c204b4..9626c0b 100644
--- a/libc/arch-arm64/generic/bionic/memset.S
+++ b/libc/arch-arm64/generic/bionic/memset.S
@@ -45,12 +45,9 @@
    values rather than re-reading them each call.  */
 
 #define dstin		x0
-#ifdef BZERO
-#define count		x1
-#else
-#define count		x2
-#endif
 #define val		w1
+#define count		x2
+#define dst_count	x3	/* for __memset_chk */
 #define tmp1		x3
 #define tmp1w		w3
 #define tmp2		x4
@@ -64,16 +61,22 @@
 #define dst		x8
 #define tmp3w		w9
 
-#ifdef BZERO
-ENTRY(bzero)
-#else
+ENTRY(__memset_chk)
+  cmp count, dst_count
+  bls memset
+
+  // Preserve for accurate backtrace.
+  stp x29, x30, [sp, -16]!
+  .cfi_def_cfa_offset 16
+  .cfi_rel_offset x29, 0
+  .cfi_rel_offset x30, 8
+
+  bl __memset_chk_fail
+END(__memset_chk)
+
 ENTRY(memset)
-#endif
 
 	mov	dst, dstin		/* Preserve return value.  */
-#ifdef BZERO
-	b	.Lzero_mem
-#endif
 	ands	A_lw, val, #255
 	b.eq	.Lzero_mem
 	orr	A_lw, A_lw, A_lw, lsl #8
@@ -233,11 +236,7 @@
 	ands	count, count, zva_bits_x
 	b.ne	.Ltail_maybe_long
 	ret
-#ifdef BZERO
-END(bzero)
-#else
 END(memset)
-#endif
 
 #ifdef MAYBE_VIRT
 	.bss
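
For anyone wondering how calls end up at this entry point: under
_FORTIFY_SOURCE the compiler passes the destination's known object size along
with the usual memset arguments, so __memset_chk only has to compare it
against the requested length. A rough, non-authoritative illustration is
below; fortified_memset is a made-up name, and the real fortify headers
typically go through the compiler's __builtin___memset_chk rather than
calling __memset_chk directly.

    #include <stddef.h>

    /* The libc entry point this patch mandates (arm64: assembler, above). */
    extern void* __memset_chk(void* dst, int byte, size_t count, size_t dst_len);

    /* Hypothetical wrapper showing the call shape _FORTIFY_SOURCE produces:
     * __builtin_object_size() yields the destination's known size at compile
     * time, or (size_t)-1 when it is unknown, so valid calls take the cheap
     * "bls memset" path and only genuine overflows reach __memset_chk_fail. */
    static inline void* fortified_memset(void* dst, int byte, size_t count) {
      return __memset_chk(dst, byte, count, __builtin_object_size(dst, 0));
    }

With, say, char buf[8], a call like fortified_memset(buf, 0, sizeof(buf))
reaches the assembler entry with count == dst_count (unless the compiler folds
the check away entirely) and takes the single cmp/bls branch straight into
memset.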