clatd: implement seccomp system call protection
but only enable it in enforcing mode on aarch64,
since that gets good test coverage via Pixel
on GoogleGuest ipv6-only wifi network
and (for example) T-Mobile US cellular.
For other architectures this will only result
in (automatically ratelimitted to at most 5/s)
audit logs for any 'unusual' system calls.
Test: TreeHugger, manually on Pixel on GoogleGuest wifi
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Change-Id: Id14c6d9db8d7b4652c7358cac101a82ef09843e0
diff --git a/clatd/clatd.h b/clatd/clatd.h
index daa5ebc..11e9687 100644
--- a/clatd/clatd.h
+++ b/clatd/clatd.h
@@ -48,7 +48,7 @@
// plus some extra just-in-case headroom, because it doesn't hurt.
#define MAXDUMPLEN (64 + MAXMTU)
-#define CLATD_VERSION "1.7"
+#define CLATD_VERSION "1.8"
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/clatd/main.c b/clatd/main.c
index bc29041..9e5710e 100644
--- a/clatd/main.c
+++ b/clatd/main.c
@@ -18,12 +18,17 @@
#include <arpa/inet.h>
#include <errno.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/personality.h>
+#include <sys/prctl.h>
#include <sys/utsname.h>
#include <unistd.h>
@@ -53,6 +58,131 @@
printf("-w [write socket descriptor number]\n");
}
+// Load the architecture identifier (AUDIT_ARCH_* constant)
+#define BPF_SECCOMP_LOAD_AUDIT_ARCH \
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch))
+
+// Load the system call number
+#define BPF_SECCOMP_LOAD_SYSCALL_NR \
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr))
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+// Load the system call argument n, where n is [0..5]
+#define BPF_SECCOMP_LOAD_SYSCALL_ARG_LO32(n) \
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[n]))
+#define BPF_SECCOMP_LOAD_SYSCALL_ARG_HI32(n) \
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[n]) + 4)
+#else
+#error "Not a little endian architecture?"
+#endif
+
+// Allow the system call
+#define BPF_SECCOMP_ALLOW BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
+
+// Allow (but 'audit' log) the system call
+#define BPF_SECCOMP_LOG BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_LOG)
+
+// Reject the system call (kill thread)
+#define BPF_SECCOMP_KILL BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL)
+
+// Note arguments to BPF_JUMP(opcode, operand, true_offset, false_offset)
+
+// If not equal, jump over count instructions
+#define BPF_JUMP_IF_NOT_EQUAL(v, count) \
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (v), 0, (count))
+
+// If equal, jump over count instructions
+#define BPF_JUMP_IF_EQUAL(v, count) \
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (v), (count), 0)
+
+// *TWO* instructions: compare and if not equal jump over the allow statement
+#define BPF2_SECCOMP_ALLOW_IF_EQUAL(v) \
+ BPF_JUMP_IF_NOT_EQUAL((v), 1), \
+ BPF_SECCOMP_ALLOW
+
+// *TWO* instructions: compare and if not equal jump over the log statement
+#define BPF2_SECCOMP_LOG_IF_EQUAL(v) \
+ BPF_JUMP_IF_NOT_EQUAL((v), 1), \
+ BPF_SECCOMP_LOG
+
+// *TWO* instructions: compare and if equal jump over the kill statement
+#define BPF2_SECCOMP_KILL_IF_NOT_EQUAL(v) \
+ BPF_JUMP_IF_EQUAL((v), 1), \
+ BPF_SECCOMP_KILL
+
+// Android only supports the following 5 little endian architectures
+#if defined(__aarch64__) && defined(__LP64__)
+ #define MY_AUDIT_ARCH AUDIT_ARCH_AARCH64
+#elif defined(__arm__) && defined(__ILP32__)
+ #define MY_AUDIT_ARCH AUDIT_ARCH_ARM
+#elif defined(__i386__) && defined(__ILP32__)
+ #define MY_AUDIT_ARCH AUDIT_ARCH_I386
+#elif defined(__x86_64__) && defined(__LP64__)
+ #define MY_AUDIT_ARCH AUDIT_ARCH_X86_64
+#elif defined(__riscv) && defined(__LP64__)
+ #define MY_AUDIT_ARCH AUDIT_ARCH_RISCV64
+#else
+ #error "Unknown AUDIT_ARCH_* architecture."
+#endif
+
+void enable_seccomp(void) {
+ static const struct sock_filter filter[] = {
+ BPF_SECCOMP_LOAD_AUDIT_ARCH,
+ BPF2_SECCOMP_KILL_IF_NOT_EQUAL(MY_AUDIT_ARCH),
+
+ BPF_SECCOMP_LOAD_SYSCALL_NR, // aarch64
+
+ // main event loop:
+ // ppoll ( read sendmsg | recvmsg writev )
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_ppoll), // 73
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_read), // 63
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_sendmsg), // 211
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_recvmsg), // 212
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_writev), // 66
+
+ // logging: getuid writev
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_getuid), // 174
+
+ // inbound signal (SIGTERM) processing
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_rt_sigreturn), // 139
+
+ // sleep(n)
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_nanosleep), // 101
+
+ // _exit(0)
+ BPF2_SECCOMP_ALLOW_IF_EQUAL(__NR_exit_group), // 94
+
+#if defined(__aarch64__)
+ // Pixels are aarch64 - if we break clatd functionality on them,
+ // we *will* notice on GoogleGuest WiFi network (which is ipv6 only)
+ BPF_SECCOMP_KILL,
+#else
+ // All other architectures: generate audit lines visible in dmesg and logcat
+ BPF_SECCOMP_LOG,
+#endif
+ };
+ static const struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = (struct sock_filter *)filter,
+ };
+
+ // https://man7.org/linux/man-pages/man2/PR_SET_NO_NEW_PRIVS.2const.html
+ // required to allow non-privileged seccomp filter installation
+ int rv = prctl(PR_SET_NO_NEW_PRIVS, 1L, 0L, 0L, 0L);
+ if (rv) {
+ logmsg(ANDROID_LOG_FATAL, "prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) = %d [%d]", rv, errno);
+ exit(1);
+ }
+
+ // https://man7.org/linux/man-pages/man2/PR_SET_SECCOMP.2const.html
+ // but see also https://man7.org/linux/man-pages/man2/seccomp.2.html
+ rv = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0L, 0L);
+ if (rv) {
+ logmsg(ANDROID_LOG_FATAL, "prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) = %d [%d]", rv, errno);
+ exit(1);
+ }
+}
+
/* function: main
* allocate and setup the tun device, then run the event loop
*/
@@ -241,6 +371,8 @@
// TODO: actually perform true DAD
send_dad(tunnel.write_fd6, &Global_Clatd_Config.ipv6_local_subnet);
+ enable_seccomp(); // WARNING: from this point forward very limited system calls available.
+
event_loop(&tunnel);
if (sigterm) {
@@ -255,5 +387,7 @@
logmsg(ANDROID_LOG_INFO, "Clatd on %s %s SIGTERM", uplink_interface,
sigterm ? "received" : "timed out waiting for");
}
- return 0;
+
+ // Using _exit() here avoids 4 mprotect() syscalls triggered via 'exit(0)' or 'return 0'
+ _exit(0);
}