Blame - bpf_progs/clatd.c - android_packages_modules_Connectivity

2022-01-20 20:58:34 -0800

[diff] [blame]

/*

*

* Licensed under the Apache License, Version 2.0 (the "License");

5

* you may not use this file except in compliance with the License.

6

* You may obtain a copy of the License at

7

*

8

* http://www.apache.org/licenses/LICENSE-2.0

9

*

10

* Unless required by applicable law or agreed to in writing, software

11

* distributed under the License is distributed on an "AS IS" BASIS,

12

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

* See the License for the specific language governing permissions and

14

* limitations under the License.

15

*/

16

17

#include <linux/bpf.h>

18

#include <linux/if.h>

19

#include <linux/if_ether.h>

20

#include <linux/in.h>

21

#include <linux/in6.h>

22

#include <linux/ip.h>

23

#include <linux/ipv6.h>

24

#include <linux/pkt_cls.h>

25

#include <linux/swab.h>

#include <stdbool.h>

#include <stdint.h>

// bionic kernel uapi linux/udp.h header is munged...

30

#define __kernel_udphdr udphdr

31

#include <linux/udp.h>

32

Maciej Żenczykowski

f769952

2022-05-24 15:56:03 -0700

[diff] [blame]

33

// The resulting .o needs to load on the Android T beta 3 bpfloader

34

#define BPFLOADER_MIN_VER BPFLOADER_T_BETA3_VERSION

Maciej Żenczykowski

acebffb

2022-05-16 16:05:15 -0700

[diff] [blame]

35

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

36

#include "bpf_helpers.h"

37

#include "bpf_net_helpers.h"

Maciej Żenczykowski

7b452a1

2022-12-08 13:10:29 +0000

[diff] [blame]

38

#include "clatd.h"

Maciej Żenczykowski

ce9108f

2022-06-15 02:02:21 -0700

[diff] [blame]

39

#include "clat_mark.h"

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

40

Maciej Żenczykowski

e0ddc47

2022-10-24 02:43:21 -0700

[diff] [blame]

41

// IP flags. (from kernel's include/net/ip.h)
// These are masks over the 16-bit IPv4 frag_off field expressed in host byte order;
// nat64() in this file wraps IP_DF in htons() before storing or comparing it.

42

#define IP_CE 0x8000 // Flag: "Congestion" (really reserved 'evil bit')

43

#define IP_DF 0x4000 // Flag: "Don't Fragment"

44

#define IP_MF 0x2000 // Flag: "More Fragments"

45

#define IP_OFFSET 0x1FFF // "Fragment Offset" part

46

47

// from kernel's include/net/ipv6.h

48

// IPv6 fragment extension header (8 bytes: 1 + 1 + 2 + 4).
// nat64() in this file reads it from directly behind the fixed IPv6 header when
// ip6->nexthdr == IPPROTO_FRAGMENT.
struct frag_hdr {

49

// protocol number of the header that follows this extension header
__u8 nexthdr;

50

__u8 reserved; // always zero

51

__be16 frag_off; // 13 bit offset, 2 bits zero, 1 bit "More Fragments"

52

// 32-bit fragment id in network byte order
__be32 identification;

53

};

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

54

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

55

// constants for passing in to 'bool is_ethernet'

56

// RAWIP: the packet has no L2 header (nat64() then uses an l2_header_size of 0).
static const bool RAWIP = false;

57

// ETHER: the packet starts with a struct ethhdr.
static const bool ETHER = true;

58

59

// Kernel version threshold: nat64() only handles IPv6 fragments when kver >= KVER_4_14,
// because that path needs the bpf_skb_adjust_room() helper.
#define KVER_4_14 KVER(4, 14, 0)

60

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

61

// Hash map consulted by nat64() through bpf_clat_ingress6_map_lookup_elem().
// The key is built from (iif, pfx96, local6); the value supplies local4 and oif.
// NOTE(review): 16 is presumably the entry limit and AID_SYSTEM the owning group -
// confirm against DEFINE_BPF_MAP_GRW in bpf_helpers.h.
DEFINE_BPF_MAP_GRW(clat_ingress6_map, HASH, ClatIngress6Key, ClatIngress6Value, 16, AID_SYSTEM)

62

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

63

// nat64: translate one ingress IPv6 packet into an IPv4 packet in place (CLAT ingress).
//
// Parameters:
//   skb         - the packet handed to the tc classifier program.
//   is_ethernet - ETHER (true) if the packet begins with a struct ethhdr, RAWIP (false) if
//                 it begins directly with the IPv6 header.
//   kver        - kernel version the calling program is built for; IPv6 fragments are only
//                 translated when kver >= KVER_4_14.
//
// Returns:
//   TC_ACT_PIPE  - the packet was left untranslated (failed one of the checks below or had
//                  no clat_ingress6_map entry), or it was translated and v->oif is zero.
//   TC_ACT_SHOT  - a step failed after bpf_skb_change_proto() had already mutated the packet.
//   otherwise    - the result of bpf_redirect(v->oif, BPF_F_INGRESS).
//
// Side effects: when a map entry matched but the packet cannot be offloaded (unsupported
// next-header protocol, or bpf_skb_change_proto() failing) skb->mark is set to CLAT_MARK
// before returning TC_ACT_PIPE.
static inline __always_inline int nat64(struct __sk_buff* skb,

64

const bool is_ethernet,

65

const unsigned kver) {

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

66

// Require ethernet dst mac address to be our unicast address.

67

if (is_ethernet && (skb->pkt_type != PACKET_HOST)) return TC_ACT_PIPE;

68

69

// Must be meta-ethernet IPv6 frame

70

if (skb->protocol != htons(ETH_P_IPV6)) return TC_ACT_PIPE;

71

Maciej Żenczykowski

2022-04-07 16:53:02 -0700

[diff] [blame]

72

const int l2_header_size = is_ethernet ? sizeof(struct ethhdr) : 0;

73

74

// Not clear if this is actually necessary considering we use DPA (Direct Packet Access),

75

// but we need to make sure we can read the IPv6 header reliably so that we can set

76

// skb->mark = 0xDeadC1a7 for packets we fail to offload.

Maciej Żenczykowski

824fb29

2022-04-11 23:29:46 -0700

[diff] [blame]

77

try_make_writable(skb, l2_header_size + sizeof(struct ipv6hdr));

Maciej Żenczykowski

2022-04-07 16:53:02 -0700

[diff] [blame]

78

79

void* data = (void*)(long)skb->data;

80

const void* data_end = (void*)(long)skb->data_end;

81

const struct ethhdr* const eth = is_ethernet ? data : NULL; // used iff is_ethernet

82

const struct ipv6hdr* const ip6 = is_ethernet ? (void*)(eth + 1) : data;

83

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

84

// Must have (ethernet and) ipv6 header

85

if (data + l2_header_size + sizeof(*ip6) > data_end) return TC_ACT_PIPE;

86

87

// Ethertype - if present - must be IPv6

88

if (is_ethernet && (eth->h_proto != htons(ETH_P_IPV6))) return TC_ACT_PIPE;

89

90

// IP version must be 6

91

if (ip6->version != 6) return TC_ACT_PIPE;

92

93

// Maximum IPv6 payload length that can be translated to IPv4

Maciej Żenczykowski

bdcb896

2023-03-17 18:15:34 +0000

[diff] [blame^]

94

// Note: technically this check is too strict for an IPv6 fragment,

95

// which by virtue of stripping the extra 8 byte fragment extension header,

96

// could thus be 8 bytes larger and still fit in an ipv4 packet post

97

// translation. However... who ever heard of receiving ~64KB frags...

98

// fragments are kind of by definition smaller than ingress device mtu,

99

// and thus, on the internet, very very unlikely to exceed 1500 bytes.

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

100

if (ntohs(ip6->payload_len) > 0xFFFF - sizeof(struct iphdr)) return TC_ACT_PIPE;

101

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

102

// Lookup key: ingress interface index, the first 96 bits of the IPv6 source address
// (stored in the field named pfx96) and the complete IPv6 destination address.
ClatIngress6Key k = {

103

.iif = skb->ifindex,

104

.pfx96.in6_u.u6_addr32 =

105

{

106

ip6->saddr.in6_u.u6_addr32[0],

107

ip6->saddr.in6_u.u6_addr32[1],

108

ip6->saddr.in6_u.u6_addr32[2],

109

},

110

.local6 = ip6->daddr,

111

};

112

113

ClatIngress6Value* v = bpf_clat_ingress6_map_lookup_elem(&k);

114

115

if (!v) return TC_ACT_PIPE;

116

Maciej Żenczykowski

2022-10-24 03:08:06 -0700

[diff] [blame]

117

__u8 proto = ip6->nexthdr;

118

__be16 ip_id = 0;

119

__be16 frag_off = htons(IP_DF);

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

120

__u16 tot_len = ntohs(ip6->payload_len) + sizeof(struct iphdr); // cannot overflow, see above

121

122

if (proto == IPPROTO_FRAGMENT) {

123

// Fragment handling requires bpf_skb_adjust_room which is 4.14+

124

if (kver < KVER_4_14) return TC_ACT_PIPE;

125

126

// Must have (ethernet and) ipv6 header and ipv6 fragment extension header

127

if (data + l2_header_size + sizeof(*ip6) + sizeof(struct frag_hdr) > data_end)

128

return TC_ACT_PIPE;

129

const struct frag_hdr *frag = (const struct frag_hdr *)(ip6 + 1);

130

proto = frag->nexthdr;

131

// RFC6145: use bottom 16-bits of network endian 32-bit IPv6 ID field for 16-bit IPv4 field.

132

// this is equivalent to: ip_id = htons(ntohl(frag->identification));

133

ip_id = frag->identification >> 16;

134

// Conversion of 16-bit IPv6 frag offset to 16-bit IPv4 frag offset field.

135

// IPv6 is '13 bits of offset in multiples of 8' + 2 zero bits + more fragment bit

136

// IPv4 is zero bit + don't frag bit + more frag bit + '13 bits of offset in multiples of 8'

137

frag_off = ntohs(frag->frag_off);

138

frag_off = ((frag_off & 1) << 13) | (frag_off >> 3);

139

frag_off = htons(frag_off);

140

// Note that by construction tot_len is guaranteed to not underflow here

141

tot_len -= sizeof(struct frag_hdr);

142

// This is a badly formed IPv6 packet with less payload than the size of an IPv6 Frag EH

143

if (tot_len < sizeof(struct iphdr)) return TC_ACT_PIPE;

144

}

Maciej Żenczykowski

2022-10-24 03:08:06 -0700

[diff] [blame]

145

146

switch (proto) {

Hungming Chen

6c0b1e8

2022-04-01 19:51:56 +0800

[diff] [blame]

147

case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6

148

case IPPROTO_UDP: // address means there is no need to update their checksums.

149

case IPPROTO_GRE: // We do not need to bother looking at GRE/ESP headers,

150

case IPPROTO_ESP: // since there is never a checksum to update.

151

break;

152

153

default: // do not know how to handle anything else

154

// Mark ingress non-offloaded clat packet for dropping in ip6tables bw_raw_PREROUTING.

155

// Non-offloaded clat packet is going to be handled by clat daemon and ip6tables. The

156

// duplicate one in ip6tables is not necessary.

157

skb->mark = CLAT_MARK;

return TC_ACT_PIPE;

}

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

161

struct ethhdr eth2; // used iff is_ethernet

162

if (is_ethernet) {

163

eth2 = *eth; // Copy over the ethernet header (src/dst mac)

164

eth2.h_proto = htons(ETH_P_IP); // But replace the ethertype

}

// Build the replacement IPv4 header in a local; it is only copied into the packet
// after bpf_skb_change_proto() below has succeeded and the pointers were reloaded.
// tos is assembled from ip6->priority (high nibble) and the top nibble of flow_lbl[0];
// saddr is the last 32 bits of the IPv6 source; daddr comes from the map value.
struct iphdr ip = {

.version = 4, // u4

.ihl = sizeof(struct iphdr) / sizeof(__u32), // u4

170

.tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4), // u8

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

171

.tot_len = htons(tot_len), // be16

Maciej Żenczykowski

2022-10-24 03:08:06 -0700

[diff] [blame]

172

.id = ip_id, // be16

173

.frag_off = frag_off, // be16

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

174

.ttl = ip6->hop_limit, // u8

Maciej Żenczykowski

2022-10-24 03:08:06 -0700

[diff] [blame]

175

.protocol = proto, // u8

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

176

.check = 0, // u16

Maciej Żenczykowski

2022-10-24 03:08:06 -0700

[diff] [blame]

177

.saddr = ip6->saddr.in6_u.u6_addr32[3], // be32

178

.daddr = v->local4.s_addr, // be32

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

179

};

180

181

// Calculate the IPv4 one's complement checksum of the IPv4 header.

182

__wsum sum4 = 0;

183

for (int i = 0; i < sizeof(ip) / sizeof(__u16); ++i) {

184

sum4 += ((__u16*)&ip)[i];

185

}

186

// Note that sum4 is guaranteed to be non-zero by virtue of ip.version == 4

187

sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 0x1FFFE

188

sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16

189

ip.check = (__u16)~sum4; // sum4 cannot be zero, so this is never 0xFFFF

190

191

// Calculate the *negative* IPv6 16-bit one's complement checksum of the IPv6 header.

192

__wsum sum6 = 0;

193

// We'll end up with a non-zero sum due to ip6->version == 6 (which has '0' bits)

194

for (int i = 0; i < sizeof(*ip6) / sizeof(__u16); ++i) {

195

sum6 += ~((__u16*)ip6)[i]; // note the bitwise negation

196

}

197

198

// Note that there is no L4 checksum update: we are relying on the checksum neutrality

199

// of the ipv6 address chosen by netd's ClatdController.

200

201

// Packet mutations begin - point of no return, but if this first modification fails

202

// the packet is probably still pristine, so let clatd handle it.

Hungming Chen

6c0b1e8

2022-04-01 19:51:56 +0800

[diff] [blame]

203

if (bpf_skb_change_proto(skb, htons(ETH_P_IP), 0)) {

204

// Mark ingress non-offloaded clat packet for dropping in ip6tables bw_raw_PREROUTING.

205

// Non-offloaded clat packet is going to be handled by clat daemon and ip6tables. The

206

// duplicate one in ip6tables is not necessary.

207

skb->mark = CLAT_MARK;

208

return TC_ACT_PIPE;

209

}

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

210

211

// This takes care of updating the skb->csum field for a CHECKSUM_COMPLETE packet.

212

//

213

// In such a case, skb->csum is a 16-bit one's complement sum of the entire payload,

214

// thus we need to subtract out the ipv6 header's sum, and add in the ipv4 header's sum.

215

// However, by construction of ip.check above the checksum of an ipv4 header is zero.

216

// Thus we only need to subtract the ipv6 header's sum, which is the same as adding

217

// in the sum of the bitwise negation of the ipv6 header.

218

//

219

// bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error

220

// (-ENOTSUPP) if it isn't. So we just ignore the return code.

221

//

222

// if (skb->ip_summed == CHECKSUM_COMPLETE)

223

// return (skb->csum = csum_add(skb->csum, csum));

224

// else

225

// return -ENOTSUPP;

226

bpf_csum_update(skb, sum6);

227

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

228

// Technically 'kver < KVER_4_14' already implies 'frag_off == htons(IP_DF)' due to logic above,

229

// thus the initial 'kver >= KVER_4_14' check here is entirely superfluous.

230

//

231

// However, we *need* the compiler (when compiling the program for 4.9) to entirely

232

// optimize out the call to bpf_skb_adjust_room() bpf helper: it's not enough for it to emit

233

// an unreachable call to it, it must *not* emit it at all (otherwise the 4.9 kernel's

234

// bpf verifier will refuse to load a program with an unknown bpf helper call)

235

//

236

// This is easiest to achieve by being very explicit in the if clause,

237

// better safe than sorry...

238

//

239

// Note: we currently have no TreeHugger coverage for 4.9-T devices (there are no such

240

// Pixel or cuttlefish devices), so likely you won't notice for months if this breaks...

241

if (kver >= KVER_4_14 && frag_off != htons(IP_DF)) {

242

// If we're converting an IPv6 Fragment, we need to trim off 8 more bytes

243

// We're beyond recovery on error here... but hard to imagine how this could fail.

244

if (bpf_skb_adjust_room(skb, -(__s32)sizeof(struct frag_hdr), BPF_ADJ_ROOM_NET, /*flags*/0))

return TC_ACT_SHOT;

}

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

248

// bpf_skb_change_proto() invalidates all pointers - reload them.

249

data = (void*)(long)skb->data;

250

data_end = (void*)(long)skb->data_end;

251

252

// I cannot think of any valid way for this error condition to trigger, however I do

253

// believe the explicit check is required to keep the in kernel ebpf verifier happy.

254

if (data + l2_header_size + sizeof(struct iphdr) > data_end) return TC_ACT_SHOT;

255

256

if (is_ethernet) {

257

struct ethhdr* new_eth = data;

258

259

// Copy over the updated ethernet header

260

*new_eth = eth2;

261

262

// Copy over the new ipv4 header.

263

*(struct iphdr*)(new_eth + 1) = ip;

264

} else {

265

// Copy over the new ipv4 header without an ethernet header.

266

*(struct iphdr*)data = ip;

267

}

268

269

// Redirect, possibly back to same interface, so tcpdump sees packet twice.

270

if (v->oif) return bpf_redirect(v->oif, BPF_F_INGRESS);

271

272

// Just let it through, tcpdump will not see IPv4 packet.

return TC_ACT_PIPE;

}

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

276

// Ingress program for packets that carry an ethernet header: runs nat64() with ETHER and
// kver = KVER_4_14, so the IPv6 fragment path is compiled in.
// NOTE(review): the trailing KVER_4_14 macro argument is presumably the minimum kernel
// version for loading this program - confirm against DEFINE_BPF_PROG_KVER.
DEFINE_BPF_PROG_KVER("schedcls/ingress6/clat_ether$4_14", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_ether_4_14, KVER_4_14)

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

277

(struct __sk_buff* skb) {

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

278

return nat64(skb, ETHER, KVER_4_14);

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

279

}

280

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

281

// Ingress program for packets that carry an ethernet header, built for kernels older than
// 4.14: runs nat64() with ETHER and kver = KVER_NONE.
// NOTE(review): this assumes KVER_NONE compares below KVER_4_14, so nat64() returns early
// for IPv6 fragments and never calls bpf_skb_adjust_room(); the (KVER_NONE, KVER_4_14)
// arguments are presumably the [min, max) kernel range - confirm against
// DEFINE_BPF_PROG_KVER_RANGE.
DEFINE_BPF_PROG_KVER_RANGE("schedcls/ingress6/clat_ether$4_9", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_ether_4_9, KVER_NONE, KVER_4_14)

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

282

(struct __sk_buff* skb) {

Maciej Żenczykowski

2023-01-03 14:59:54 +0000

[diff] [blame]

283

return nat64(skb, ETHER, KVER_NONE);

284

}

285

286

// Ingress program for packets without an L2 header: runs nat64() with RAWIP and
// kver = KVER_4_14, so the IPv6 fragment path is compiled in.
// NOTE(review): the trailing KVER_4_14 macro argument is presumably the minimum kernel
// version for loading this program - confirm against DEFINE_BPF_PROG_KVER.
DEFINE_BPF_PROG_KVER("schedcls/ingress6/clat_rawip$4_14", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_rawip_4_14, KVER_4_14)

287

(struct __sk_buff* skb) {

288

return nat64(skb, RAWIP, KVER_4_14);

289

}

290

291

// Ingress program for packets without an L2 header, built for kernels older than 4.14:
// runs nat64() with RAWIP and kver = KVER_NONE.
// NOTE(review): this assumes KVER_NONE compares below KVER_4_14, so nat64() returns early
// for IPv6 fragments and never calls bpf_skb_adjust_room(); the (KVER_NONE, KVER_4_14)
// arguments are presumably the [min, max) kernel range - confirm against
// DEFINE_BPF_PROG_KVER_RANGE.
DEFINE_BPF_PROG_KVER_RANGE("schedcls/ingress6/clat_rawip$4_9", AID_ROOT, AID_SYSTEM, sched_cls_ingress6_clat_rawip_4_9, KVER_NONE, KVER_4_14)

292

(struct __sk_buff* skb) {

293

return nat64(skb, RAWIP, KVER_NONE);

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

294

}

295

296

// Hash map from ClatEgress4Key to ClatEgress4Value, declared with the same macro and
// arguments as clat_ingress6_map above.
// NOTE(review): 16 is presumably the entry limit and AID_SYSTEM the owning group -
// confirm against DEFINE_BPF_MAP_GRW in bpf_helpers.h.
DEFINE_BPF_MAP_GRW(clat_egress4_map, HASH, ClatEgress4Key, ClatEgress4Value, 16, AID_SYSTEM)

297

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

298

DEFINE_BPF_PROG("schedcls/egress4/clat_rawip", AID_ROOT, AID_SYSTEM, sched_cls_egress4_clat_rawip)

299

(struct __sk_buff* skb) {

Maciej Żenczykowski

2022-04-07 16:53:02 -0700

[diff] [blame]

300

// Must be meta-ethernet IPv4 frame

301

if (skb->protocol != htons(ETH_P_IP)) return TC_ACT_PIPE;

302

303

// Possibly not needed, but for consistency with nat64 up above

Maciej Żenczykowski

824fb29

2022-04-11 23:29:46 -0700

[diff] [blame]

304

try_make_writable(skb, sizeof(struct iphdr));

Maciej Żenczykowski

2022-04-07 16:53:02 -0700

[diff] [blame]

305

Maciej Żenczykowski

2022-01-20 20:58:34 -0800

[diff] [blame]

306

void* data = (void*)(long)skb->data;

307

const void* data_end = (void*)(long)skb->data_end;

308

const struct iphdr* const ip4 = data;

309

Maciej Żenczykowski