BpfSyscallWrappers: grab shared lock on writable map open

and add an accessor to grab an exclusive lock on an R/W map open
(exclusive access makes it safe to front such a map with a write-through cache)

Note: we can't simply flock() the fd, as that would lock the entire
inode, and all bpf maps (currently) share the *same* anonymous inode.

As such we instead grab a lock on a range of a single byte,
with the offset determined by the unique bpf map id.
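
In essence the primitive is (a minimal sketch; the full version, with
error handling, is the bpfLock() helper in the diff below):

  struct flock64 fl = {
      .l_type = F_RDLCK,     // shared; use F_WRLCK for exclusive
      .l_whence = SEEK_SET,
      .l_start = mapId,      // unique per map, so ranges never collide
      .l_len = 1,            // lock just this one byte
  };
  if (fcntl(fd, F_OFD_SETLK, &fl)) {
      // -1 with errno == EAGAIN: a conflicting lock is already held
  }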

We include some very simple, but sufficient, correctness tests
(see mapLockTest() below) in the critical boot path: this is to
prevent any surprises caused by kernel implementation changes.

$ adb root && sleep 1 && adb wait-for-device shell grep OFDLCK /proc/locks
id: OFDLCK ADVISORY [WRITE|READ] pid blkmaj:min:inode min_offset max_offset
11: OFDLCK ADVISORY  READ -1 00:0e:1048 36 36
14: OFDLCK ADVISORY  READ -1 00:0e:1048 35 35
15: OFDLCK ADVISORY  READ -1 00:0e:1048 41 41
16: OFDLCK ADVISORY  READ -1 00:0e:1048 40 40
22: OFDLCK ADVISORY  READ -1 00:0e:1048 24 24
23: OFDLCK ADVISORY  READ -1 00:0e:1048 17 17
24: OFDLCK ADVISORY  READ -1 00:0e:1048 16 16
25: OFDLCK ADVISORY  READ -1 00:0e:1048 13 13

OFDLCK stands for 'Open File Description LoCK': such a lock is
associated with (held by) the open file description itself, and not
a process/pid, on the given (in this case anonymous) block device + inode.

Where READ=shared and WRITE=exclusive.
There are (as yet) no exclusive locks held post boot.

The pid field is unfortunately always -1 (and cannot be manually set).

The 00:0e:1048 (or at least the inode portion) is random
(it likely depends on boot ordering).

The final two fields (min and max offset) are both the bpf map id:
each lock covers exactly one byte.
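
For example, the '11:' line above is a shared lock on the single byte
at offset 36 of that inode, ie. held by some open fd to the bpf map
with id 36.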

Test: TreeHugger
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Change-Id: I208e3450da3fe4689ad5fd578539f401f25a4fef
diff --git a/bpf_progs/netd.c b/bpf_progs/netd.c
index c3acaad..76ab329 100644
--- a/bpf_progs/netd.c
+++ b/bpf_progs/netd.c
@@ -97,6 +97,10 @@
 DEFINE_BPF_MAP_NO_NETD(ingress_discard_map, HASH, IngressDiscardKey, IngressDiscardValue,
                        INGRESS_DISCARD_MAP_SIZE)
 
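+// Test-only maps, used solely by the boot-time lock checks in BpfHandler.cpp's mapLockTest().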
+DEFINE_BPF_MAP_RW_NETD(lock_array_test_map, ARRAY, uint32_t, bool, 1)
+DEFINE_BPF_MAP_RW_NETD(lock_hash_test_map, HASH, uint32_t, bool, 1)
+
 /* never actually used from ebpf */
 DEFINE_BPF_MAP_NO_NETD(iface_index_name_map, HASH, uint32_t, IfaceValue, IFACE_INDEX_NAME_MAP_SIZE)
 
diff --git a/netd/BpfHandler.cpp b/netd/BpfHandler.cpp
index 925ee50..aa84089 100644
--- a/netd/BpfHandler.cpp
+++ b/netd/BpfHandler.cpp
@@ -181,7 +181,27 @@
     return netdutils::status::ok;
 }
 
+static void mapLockTest(void) {
+    // The maps must be R/W, and as yet unopened (or, more specifically, not yet locked).
+    const char * const m1 = BPF_NETD_PATH "map_netd_lock_array_test_map";
+    const char * const m2 = BPF_NETD_PATH "map_netd_lock_hash_test_map";
+
+    unique_fd fd0(bpf::mapRetrieveExclusiveRW(m1)); if (!fd0.ok()) abort();
+
+    unique_fd fd1(bpf::mapRetrieveExclusiveRW(m2)); if (!fd1.ok()) abort();  // no conflict with fd0
+    unique_fd fd2(bpf::mapRetrieveExclusiveRW(m2)); if ( fd2.ok()) abort();  // busy due to fd1
+    unique_fd fd3(bpf::mapRetrieveRO(m2));          if (!fd3.ok()) abort();  // no lock taken
+    unique_fd fd4(bpf::mapRetrieveRW(m2));          if ( fd4.ok()) abort();  // busy due to fd1
+    fd1.reset();  // releases exclusive lock
+    unique_fd fd5(bpf::mapRetrieveRO(m2));          if (!fd5.ok()) abort();  // no lock taken
+    unique_fd fd6(bpf::mapRetrieveRW(m2));          if (!fd6.ok()) abort();  // now ok
+    unique_fd fd7(bpf::mapRetrieveRO(m2));          if (!fd7.ok()) abort();  // no lock taken
+    unique_fd fd8(bpf::mapRetrieveExclusiveRW(m2)); if ( fd8.ok()) abort();  // busy due to fd6
+}
+
 Status BpfHandler::initMaps() {
+    mapLockTest();
+
     RETURN_IF_NOT_OK(mStatsMapA.init(STATS_MAP_A_PATH));
     RETURN_IF_NOT_OK(mStatsMapB.init(STATS_MAP_B_PATH));
     RETURN_IF_NOT_OK(mConfigurationMap.init(CONFIGURATION_MAP_PATH));
diff --git a/staticlibs/native/bpf_syscall_wrappers/include/BpfSyscallWrappers.h b/staticlibs/native/bpf_syscall_wrappers/include/BpfSyscallWrappers.h
index 2a0e8e0..7da4222 100644
--- a/staticlibs/native/bpf_syscall_wrappers/include/BpfSyscallWrappers.h
+++ b/staticlibs/native/bpf_syscall_wrappers/include/BpfSyscallWrappers.h
@@ -16,8 +16,13 @@
 
 #pragma once
 
+#include <stdlib.h>
+#include <unistd.h>
 #include <linux/bpf.h>
 #include <linux/unistd.h>
+#include <sys/file.h>
+
+#include "../../bpf_headers/include/bpf/KernelUtils.h"
 
 #ifdef BPF_FD_JUST_USE_INT
   #define BPF_FD_TYPE int
@@ -128,14 +133,51 @@
                             });
 }
 
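+// Returns the kernel-assigned id of the bpf map referenced by map_fd (defined out-of-line).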
+int bpfGetFdMapId(const BPF_FD_TYPE map_fd);
+
+inline int bpfLock(int fd, short type) {
+    if (!isAtLeastKernelVersion(4, 14, 0)) return fd;  // 4.14+ required to fetch map id
+    if (fd < 0) return fd;  // pass any errors straight through
+#ifdef BPF_FD_JUST_USE_INT
+    int mapId = bpfGetFdMapId(fd);
+#else
+    base::unique_fd ufd(fd);
+    int mapId = bpfGetFdMapId(ufd);
+    (void)ufd.release();
+#endif
+    if (mapId <= 0) abort();  // should not be possible
+
+    // on __LP64__ (aka. 64-bit userspace) 'struct flock64' is the same as 'struct flock'
+    struct flock64 fl = {
+        .l_type = type,        // short: F_{RD,WR,UN}LCK
+        .l_whence = SEEK_SET,  // short: SEEK_{SET,CUR,END}
+        .l_start = mapId,      // off_t: start offset
+        .l_len = 1,            // off_t: number of bytes
+    };
+
+    // see: bionic/libc/bionic/fcntl.cpp: iff !__LP64__ this uses fcntl64
+    int ret = fcntl(fd, F_OFD_SETLK, &fl);
+    if (!ret) return fd;  // success
+    close(fd);
+    return ret;  // most likely -1 with errno == EAGAIN, due to already held lock
+}
+
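+// Takes an exclusive (write) lock: conflicts with any other holder, shared or exclusive.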
+inline int mapRetrieveExclusiveRW(const char* pathname) {
+    return bpfLock(bpfFdGet(pathname, 0), F_WRLCK);
+}
 inline int mapRetrieveRW(const char* pathname) {
-    return bpfFdGet(pathname, 0);
+    return bpfLock(bpfFdGet(pathname, 0), F_RDLCK);
 }
 
 inline int mapRetrieveRO(const char* pathname) {
     return bpfFdGet(pathname, BPF_F_RDONLY);
 }
 
+// WARNING: it's impossible to grab a shared (ie. read) lock on a write-only fd,
+// so this takes no lock at all: only use it if you don't care about locking,
+// ie. if you never use mapRetrieveExclusiveRW() on the same map (ie. pathname).
 inline int mapRetrieveWO(const char* pathname) {
     return bpfFdGet(pathname, BPF_F_WRONLY);
 }