Replace cxa_guard fences with acquire loads This seemed to be the only place in bionic where a fence on a performance-critical path could be easily replaced by a stronger load/store order constraint. Do so. On x86 this should generate the same code either way. Based on microbenchmarks of the relevant ARM instructions, this is currently performance-neutral in this kind of context. But in the future, the newly generated acquire loads should give us a performance benefit. Test: Booted AOSP Change-Id: I7823e11d6ae4fd58e0425244c293262e2320fd81

commit: 963daed78044724547ff1c7c35b34a4d50a0b867 [log] [tgz]
author: Hans Boehm <hboehm@google.com> Wed Feb 22 15:34:29 2017 -0800
committer: Hans Boehm <hboehm@google.com> Thu Feb 23 11:50:58 2017 -0800
tree: e8293055d696d79e994e7094dfb58cce2957963f
parent: c9a840ac76c7cdbe6028ac91cdb8eb6698f0a854 [diff]
diff --git a/libc/bionic/__cxa_guard.cpp b/libc/bionic/__cxa_guard.cpp
index 97284d5..06926df 100644
--- a/libc/bionic/__cxa_guard.cpp
+++ b/libc/bionic/__cxa_guard.cpp

@@ -79,38 +79,33 @@
 #define CONSTRUCTION_UNDERWAY_WITH_WAITER       0x200
 
 extern "C" int __cxa_guard_acquire(_guard_t* gv) {
-  int old_value = atomic_load_explicit(&gv->state, memory_order_relaxed);
+  int old_value = atomic_load_explicit(&gv->state, memory_order_acquire);
+  // In the common CONSTRUCTION_COMPLETE case we have to ensure that all the stores performed by
+  // the construction function are observable on this CPU after we exit. A similar constraint may
+  // apply in the CONSTRUCTION_NOT_YET_STARTED case with a prior abort.
 
   while (true) {
     if (old_value == CONSTRUCTION_COMPLETE) {
-      // A load_acquire operation is need before exiting with COMPLETE state, as we have to ensure
-      // that all the stores performed by the construction function are observable on this CPU
-      // after we exit.
-      atomic_thread_fence(memory_order_acquire);
       return 0;
     } else if (old_value == CONSTRUCTION_NOT_YET_STARTED) {
       if (!atomic_compare_exchange_weak_explicit(&gv->state, &old_value,
                                                   CONSTRUCTION_UNDERWAY_WITHOUT_WAITER,
-                                                  memory_order_relaxed,
-                                                  memory_order_relaxed)) {
+                                                  memory_order_acquire /* or relaxed in C++17 */,
+                                                  memory_order_acquire)) {
         continue;
       }
-      // The acquire fence may not be needed. But as described in section 3.3.2 of
-      // the Itanium C++ ABI specification, it probably has to behave like the
-      // acquisition of a mutex, which needs an acquire fence.
-      atomic_thread_fence(memory_order_acquire);
       return 1;
     } else if (old_value == CONSTRUCTION_UNDERWAY_WITHOUT_WAITER) {
       if (!atomic_compare_exchange_weak_explicit(&gv->state, &old_value,
                                                  CONSTRUCTION_UNDERWAY_WITH_WAITER,
-                                                 memory_order_relaxed,
-                                                 memory_order_relaxed)) {
+                                                 memory_order_acquire /* or relaxed in C++17 */,
+                                                 memory_order_acquire)) {
         continue;
       }
     }
 
     __futex_wait_ex(&gv->state, false, CONSTRUCTION_UNDERWAY_WITH_WAITER, false, nullptr);
-    old_value = atomic_load_explicit(&gv->state, memory_order_relaxed);
+    old_value = atomic_load_explicit(&gv->state, memory_order_acquire);
   }
 }
commit	963daed78044724547ff1c7c35b34a4d50a0b867	[log] [tgz]
author	Hans Boehm <hboehm@google.com>	Wed Feb 22 15:34:29 2017 -0800
committer	Hans Boehm <hboehm@google.com>	Thu Feb 23 11:50:58 2017 -0800
tree	e8293055d696d79e994e7094dfb58cce2957963f
parent	c9a840ac76c7cdbe6028ac91cdb8eb6698f0a854 [diff]