clatd - ipv6 fragment to ipv4 translation support (try 2)
This is a repeat of:
https://android-review.git.corp.google.com/c/platform/packages/modules/Connectivity/+/2266447
which was reverted in:
https://android-review.git.corp.google.com/c/platform/packages/modules/Connectivity/+/2372509
This time with kver >= 4.14 protections of the bpf_skb_adjust_room()
bpf helper which isn't present on 4.9 T devices.
Original change comments:
Tested manually on a flame device connected to an ipv6-only wifi
network (GoogleGuest).
On server:
nc -4 -l -u -p 443
On client (phone):
adb shell nc -4 -u my.server 443
On client (phone):
adb shell tcpdump -l -ee -vv -s 1600 -i v4-wlan0
On client send something to server "Hi."
On server send something to client "Hey!"
You should see normal unfragmented IP packets.
Then on server send something really long (I used 57 copies of the 26 letter English alphabet). This should be long enough that fragmentation is required.
You should see tcpdump show 2 ipv4 fragments, and netcat
show the packet being delivered correctly.
(and previous versions of the code were buggy and were
resulting in corrupt packets and things not working)
Test: TreeHugger
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Change-Id: I6758e63d8133215edd26b4cd2d73a5b5f261ffd1
diff --git a/bpf_progs/clatd.c b/bpf_progs/clatd.c
index 563ca5e..7350209 100644
--- a/bpf_progs/clatd.c
+++ b/bpf_progs/clatd.c
@@ -52,9 +52,17 @@
__be32 identification;
};
+// constants for passing in to 'bool is_ethernet'
+static const bool RAWIP = false;
+static const bool ETHER = true;
+
+#define KVER_4_14 KVER(4, 14, 0)
+
DEFINE_BPF_MAP_GRW(clat_ingress6_map, HASH, ClatIngress6Key, ClatIngress6Value, 16, AID_SYSTEM)
-static inline __always_inline int nat64(struct __sk_buff* skb, bool is_ethernet) {
+static inline __always_inline int nat64(struct __sk_buff* skb,
+ const bool is_ethernet,
+ const unsigned kver) {
// Require ethernet dst mac address to be our unicast address.
if (is_ethernet && (skb->pkt_type != PACKET_HOST)) return TC_ACT_PIPE;
@@ -103,6 +111,31 @@
__u8 proto = ip6->nexthdr;
__be16 ip_id = 0;
__be16 frag_off = htons(IP_DF);
+ __u16 tot_len = ntohs(ip6->payload_len) + sizeof(struct iphdr); // cannot overflow, see above
+
+ if (proto == IPPROTO_FRAGMENT) {
+ // Fragment handling requires bpf_skb_adjust_room which is 4.14+
+ if (kver < KVER_4_14) return TC_ACT_PIPE;
+
+ // Must have (ethernet and) ipv6 header and ipv6 fragment extension header
+ if (data + l2_header_size + sizeof(*ip6) + sizeof(struct frag_hdr) > data_end)
+ return TC_ACT_PIPE;
+ const struct frag_hdr *frag = (const struct frag_hdr *)(ip6 + 1);
+ proto = frag->nexthdr;
+ // RFC6145: use bottom 16-bits of network endian 32-bit IPv6 ID field for 16-bit IPv4 field.
+ // this is equivalent to: ip_id = htons(ntohl(frag->identification));
+ ip_id = frag->identification >> 16;
+ // Conversion of 16-bit IPv6 frag offset to 16-bit IPv4 frag offset field.
+ // IPv6 is '13 bits of offset in multiples of 8' + 2 zero bits + more fragment bit
+ // IPv4 is zero bit + don't frag bit + more frag bit + '13 bits of offset in multiples of 8'
+ frag_off = ntohs(frag->frag_off);
+ frag_off = ((frag_off & 1) << 13) | (frag_off >> 3);
+ frag_off = htons(frag_off);
+ // Note that by construction tot_len is guaranteed to not underflow here
+ tot_len -= sizeof(struct frag_hdr);
+ // This is a badly formed IPv6 packet with less payload than the size of an IPv6 Frag EH
+ if (tot_len < sizeof(struct iphdr)) return TC_ACT_PIPE;
+ }
switch (proto) {
case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6
@@ -129,7 +162,7 @@
.version = 4, // u4
.ihl = sizeof(struct iphdr) / sizeof(__u32), // u4
.tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4), // u8
- .tot_len = htons(ntohs(ip6->payload_len) + sizeof(struct iphdr)), // be16
+ .tot_len = htons(tot_len), // be16
.id = ip_id, // be16
.frag_off = frag_off, // be16
.ttl = ip6->hop_limit, // u8
@@ -186,6 +219,26 @@
// return -ENOTSUPP;
bpf_csum_update(skb, sum6);
+ // Technically 'kver < KVER_4_14' already implies 'frag_off == htons(IP_DF)' due to logic above,
+ // thus the initial 'kver >= KVER_4_14' check here is entirely superfluous.
+ //
+ // However, we *need* the compiler (when compiling the program for 4.9) to entirely
+ // optimize out the call to bpf_skb_adjust_room() bpf helper: it's not enough for it to emit
+ // an unreachable call to it, it must *not* emit it at all (otherwise the 4.9 kernel's
+ // bpf verifier will refuse to load a program with an unknown bpf helper call)
+ //
+ // This is easiest to achieve by being very explicit in the if clause,
+ // better safe than sorry...
+ //
+ // Note: we currently have no TreeHugger coverage for 4.9-T devices (there are no such
+ // Pixel or cuttlefish devices), so likely you won't notice for months if this breaks...
+ if (kver >= KVER_4_14 && frag_off != htons(IP_DF)) {
+ // If we're converting an IPv6 Fragment, we need to trim off 8 more bytes
+ // We're beyond recovery on error here... but hard to imagine how this could fail.
+ if (bpf_skb_adjust_room(skb, -(__s32)sizeof(struct frag_hdr), BPF_ADJ_ROOM_NET, /*flags*/0))
+ return TC_ACT_SHOT;
+ }
+
// bpf_skb_change_proto() invalidates all pointers - reload them.
data = (void*)(long)skb->data;
data_end = (void*)(long)skb->data_end;
@@ -214,14 +267,24 @@
return TC_ACT_PIPE;
}
-DEFINE_BPF_PROG("schedcls/ingress6/clat_ether", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_ether)
+DEFINE_BPF_PROG_KVER("schedcls/ingress6/clat_ether$4_14", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_ether_4_14, KVER_4_14)
(struct __sk_buff* skb) {
- return nat64(skb, true);
+ return nat64(skb, ETHER, KVER_4_14);
}
-DEFINE_BPF_PROG("schedcls/ingress6/clat_rawip", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_rawip)
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/ingress6/clat_ether$4_9", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_ether_4_9, KVER_NONE, KVER_4_14)
(struct __sk_buff* skb) {
- return nat64(skb, false);
+ return nat64(skb, ETHER, KVER_NONE);
+}
+
+DEFINE_BPF_PROG_KVER("schedcls/ingress6/clat_rawip$4_14", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_rawip_4_14, KVER_4_14)
+(struct __sk_buff* skb) {
+ return nat64(skb, RAWIP, KVER_4_14);
+}
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/ingress6/clat_rawip$4_9", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_rawip_4_9, KVER_NONE, KVER_4_14)
+(struct __sk_buff* skb) {
+ return nat64(skb, RAWIP, KVER_NONE);
}
DEFINE_BPF_MAP_GRW(clat_egress4_map, HASH, ClatEgress4Key, ClatEgress4Value, 16, AID_SYSTEM)