Merge "clatd - ipv6 fragment to ipv4 translation support (try 2)"
diff --git a/bpf_progs/clatd.c b/bpf_progs/clatd.c
index 563ca5e..7350209 100644
--- a/bpf_progs/clatd.c
+++ b/bpf_progs/clatd.c
@@ -52,9 +52,17 @@
__be32 identification;
};
+// Constants for passing in as the 'bool is_ethernet' argument.
+static const bool RAWIP = false;
+static const bool ETHER = true;
+
+#define KVER_4_14 KVER(4, 14, 0)
+
DEFINE_BPF_MAP_GRW(clat_ingress6_map, HASH, ClatIngress6Key, ClatIngress6Value, 16, AID_SYSTEM)
-static inline __always_inline int nat64(struct __sk_buff* skb, bool is_ethernet) {
+static inline __always_inline int nat64(struct __sk_buff* skb,
+ const bool is_ethernet,
+ const unsigned kver) {
// Require ethernet dst mac address to be our unicast address.
if (is_ethernet && (skb->pkt_type != PACKET_HOST)) return TC_ACT_PIPE;
@@ -103,6 +111,31 @@
__u8 proto = ip6->nexthdr;
__be16 ip_id = 0;
__be16 frag_off = htons(IP_DF);
+ __u16 tot_len = ntohs(ip6->payload_len) + sizeof(struct iphdr); // cannot overflow, see above
+
+ if (proto == IPPROTO_FRAGMENT) {
+ // Fragment handling requires bpf_skb_adjust_room(), which is only available on 4.14+.
+ if (kver < KVER_4_14) return TC_ACT_PIPE;
+
+ // Must have (ethernet and) IPv6 header and IPv6 fragment extension header.
+ if (data + l2_header_size + sizeof(*ip6) + sizeof(struct frag_hdr) > data_end)
+ return TC_ACT_PIPE;
+ const struct frag_hdr *frag = (const struct frag_hdr *)(ip6 + 1);
+ proto = frag->nexthdr;
+ // RFC 6145: use the bottom 16 bits of the network-endian 32-bit IPv6 ID field for the
+ // 16-bit IPv4 ID field. On little-endian this is equivalent to:
+ //   ip_id = htons(ntohl(frag->identification));
+ ip_id = frag->identification >> 16;
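+ // e.g. on little-endian: a network-order ID of 0x00001234 is the host-order u32
+ // 0x34120000; shifting right by 16 leaves 0x3412, whose in-memory bytes 0x12 0x34
+ // are exactly the desired network-order __be16 0x1234.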
+ // Conversion of 16-bit IPv6 frag offset to 16-bit IPv4 frag offset field.
+ // IPv6 is '13 bits of offset in multiples of 8' + 2 zero bits + more fragment bit
+ // IPv4 is zero bit + don't frag bit + more frag bit + '13 bits of offset in multiples of 8'
+ frag_off = ntohs(frag->frag_off);
+ frag_off = ((frag_off & 1) << 13) | (frag_off >> 3);
+ frag_off = htons(frag_off);
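+ // Worked example: an IPv6 frag_off of 0x0011 (host order) is offset 2 (16 bytes)
+ // with the More Fragments bit set; the conversion above turns it into
+ // (1 << 13) | (0x0011 >> 3) == 0x2002, i.e. IPv4 MF set + offset 2.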
+ // Note that by construction tot_len is guaranteed not to underflow here.
+ tot_len -= sizeof(struct frag_hdr);
+ // Reject a badly formed IPv6 packet that claims a payload smaller than an IPv6 Frag EH.
+ if (tot_len < sizeof(struct iphdr)) return TC_ACT_PIPE;
+ }
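+ // From here on proto/ip_id/frag_off/tot_len describe the packet as if the fragment
+ // extension header were already gone; the header itself is only trimmed further down,
+ // via bpf_skb_adjust_room().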
switch (proto) {
case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6
@@ -129,7 +162,7 @@
.version = 4, // u4
.ihl = sizeof(struct iphdr) / sizeof(__u32), // u4
.tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4), // u8
- .tot_len = htons(ntohs(ip6->payload_len) + sizeof(struct iphdr)), // be16
+ .tot_len = htons(tot_len), // be16
.id = ip_id, // be16
.frag_off = frag_off, // be16
.ttl = ip6->hop_limit, // u8
@@ -186,6 +219,26 @@
// return -ENOTSUPP;
bpf_csum_update(skb, sum6);
+ // Technically 'kver < KVER_4_14' already implies 'frag_off == htons(IP_DF)' due to logic above,
+ // thus the initial 'kver >= KVER_4_14' check here is entirely superfluous.
+ //
+ // However, we *need* the compiler (when compiling the program for 4.9) to entirely
+ // optimize out the call to the bpf_skb_adjust_room() bpf helper: it's not enough for
+ // the call to be unreachable, it must *not* be emitted at all (otherwise the 4.9
+ // kernel's bpf verifier will refuse to load a program referencing an unknown bpf helper).
+ //
+ // This is easiest to achieve by being very explicit in the if clause,
+ // better safe than sorry...
+ //
+ // Note: we currently have no TreeHugger coverage for 4.9-T devices (there are no such
+ // Pixel or cuttlefish devices), so likely you won't notice for months if this breaks...
+ if (kver >= KVER_4_14 && frag_off != htons(IP_DF)) {
+ // If we're converting an IPv6 Fragment, we need to trim off 8 more bytes
+ // We're beyond recovery on error here... but hard to imagine how this could fail.
+ if (bpf_skb_adjust_room(skb, -(__s32)sizeof(struct frag_hdr), BPF_ADJ_ROOM_NET, /*flags*/0))
+ return TC_ACT_SHOT;
+ }
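+ // (The 4.9 programs pass KVER_NONE, so after inlining the branch above is
+ // constant-false and the compiler drops the bpf_skb_adjust_room() call entirely,
+ // which is exactly what the comment above relies on.)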
+
// bpf_skb_change_proto() invalidates all pointers - reload them.
data = (void*)(long)skb->data;
data_end = (void*)(long)skb->data_end;
@@ -214,14 +267,24 @@
return TC_ACT_PIPE;
}
-DEFINE_BPF_PROG("schedcls/ingress6/clat_ether", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_ether)
+DEFINE_BPF_PROG_KVER("schedcls/ingress6/clat_ether$4_14", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_ether_4_14, KVER_4_14)
(struct __sk_buff* skb) {
- return nat64(skb, true);
+ return nat64(skb, ETHER, KVER_4_14);
}
-DEFINE_BPF_PROG("schedcls/ingress6/clat_rawip", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_rawip)
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/ingress6/clat_ether$4_9", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_ether_4_9, KVER_NONE, KVER_4_14)
(struct __sk_buff* skb) {
- return nat64(skb, false);
+ return nat64(skb, ETHER, KVER_NONE);
+}
+
+DEFINE_BPF_PROG_KVER("schedcls/ingress6/clat_rawip$4_14", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_rawip_4_14, KVER_4_14)
+(struct __sk_buff* skb) {
+ return nat64(skb, RAWIP, KVER_4_14);
+}
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/ingress6/clat_rawip$4_9", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_rawip_4_9, KVER_NONE, KVER_4_14)
+(struct __sk_buff* skb) {
+ return nat64(skb, RAWIP, KVER_NONE);
}
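+
+// Note: at load time only the variant whose kernel version range matches the running
+// kernel is kept, so each attach point gets exactly one program: the $4_14 flavour
+// (with fragment support) on 4.14+ kernels, and the $4_9 flavour (which lets
+// fragmented packets continue up the stack untranslated) on older ones.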
DEFINE_BPF_MAP_GRW(clat_egress4_map, HASH, ClatEgress4Key, ClatEgress4Value, 16, AID_SYSTEM)