Switch to the arm-optimized-routines memcpy() and memmove().

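Outsource both functions to arm-optimized-routines, and choose the better of
the two variants available for each (the SIMD one or the plain one) based on
whether the hardware we're running on reports HWCAP_ASIMD.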

Test: treehugger
Change-Id: I2fa7555c971b64a6decca132210e901ffa248efa
diff --git a/libc/arch-arm64/dynamic_function_dispatch.cpp b/libc/arch-arm64/dynamic_function_dispatch.cpp
index 83e5ca4..bbd4218 100644
--- a/libc/arch-arm64/dynamic_function_dispatch.cpp
+++ b/libc/arch-arm64/dynamic_function_dispatch.cpp
@@ -41,6 +41,24 @@
     }
 }
 
+typedef void* memcpy_func(void*, const void*, size_t);  // Type handed to RETURN_FUNC for either variant.
+DEFINE_IFUNC_FOR(memcpy) {  // Picks one of the two arm-optimized-routines memcpy() variants.
+    if (arg->_hwcap & HWCAP_ASIMD) {  // HWCAP_ASIMD bit is set in arg->_hwcap: take the SIMD variant.
+        RETURN_FUNC(memcpy_func, __memcpy_aarch64_simd);
+    } else {  // HWCAP_ASIMD bit is clear: take the non-SIMD variant.
+        RETURN_FUNC(memcpy_func, __memcpy_aarch64);
+    }
+}
+
+typedef void* memmove_func(void*, const void*, size_t);  // Type handed to RETURN_FUNC for either variant.
+DEFINE_IFUNC_FOR(memmove) {  // Picks one of the two arm-optimized-routines memmove() variants.
+    if (arg->_hwcap & HWCAP_ASIMD) {  // HWCAP_ASIMD bit is set in arg->_hwcap: take the SIMD variant.
+        RETURN_FUNC(memmove_func, __memmove_aarch64_simd);
+    } else {  // HWCAP_ASIMD bit is clear: take the non-SIMD variant.
+        RETURN_FUNC(memmove_func, __memmove_aarch64);
+    }
+}
+
 typedef int stpcpy_func(char*, const char*);
 DEFINE_IFUNC_FOR(stpcpy) {
     if (arg->_hwcap2 & HWCAP2_MTE) {