Add MOPS memcpy/memset/memmove from arm-optimized-routines.

These make use of instructions designed specifically for
memcpy/memset/memmove, and should have optimal performance without the
need to tune them to specific microarchitectures.

For memcpy, dispatch to the MOPS memmove instead of the MOPS memcpy: the
MOPS memcpy does not handle overlapping regions, while every other
memcpy implementation does.

Test: bionic-unit-tests
Test: boot on QEMU with MOPS enabled

Change-Id: I5b312c8a7f81cf1353f06a047b78ff608e9c6712
diff --git a/libc/arch-arm64/dynamic_function_dispatch.cpp b/libc/arch-arm64/dynamic_function_dispatch.cpp
index a42c361..f9e4263 100644
--- a/libc/arch-arm64/dynamic_function_dispatch.cpp
+++ b/libc/arch-arm64/dynamic_function_dispatch.cpp
@@ -66,7 +66,9 @@
 
 typedef void* memcpy_func(void*, const void*, size_t);
 DEFINE_IFUNC_FOR(memcpy) {
-    if (__bionic_is_oryon(arg->_hwcap)) {
+    if (arg->_hwcap2 & HWCAP2_MOPS) {
+        RETURN_FUNC(memcpy_func, __memmove_aarch64_mops);
+    } else if (__bionic_is_oryon(arg->_hwcap)) {
         RETURN_FUNC(memcpy_func, __memcpy_aarch64_nt);
     } else if (arg->_hwcap & HWCAP_ASIMD) {
         RETURN_FUNC(memcpy_func, __memcpy_aarch64_simd);
@@ -77,7 +79,9 @@
 
 typedef void* memmove_func(void*, const void*, size_t);
 DEFINE_IFUNC_FOR(memmove) {
-  if (__bionic_is_oryon(arg->_hwcap)) {
+  if (arg->_hwcap2 & HWCAP2_MOPS) {
+    RETURN_FUNC(memmove_func, __memmove_aarch64_mops);
+  } else if (__bionic_is_oryon(arg->_hwcap)) {
     RETURN_FUNC(memcpy_func, __memmove_aarch64_nt);
   } else if (arg->_hwcap & HWCAP_ASIMD) {
     RETURN_FUNC(memmove_func, __memmove_aarch64_simd);
@@ -93,7 +97,9 @@
 
 typedef int memset_func(void*, int, size_t);
 DEFINE_IFUNC_FOR(memset) {
-    if (__bionic_is_oryon(arg->_hwcap)) {
+    if (arg->_hwcap2 & HWCAP2_MOPS) {
+        RETURN_FUNC(memset_func, __memset_aarch64_mops);
+    } else if (__bionic_is_oryon(arg->_hwcap)) {
         RETURN_FUNC(memset_func, __memset_aarch64_nt);
     } else {
         RETURN_FUNC(memset_func, __memset_aarch64);