blob: 144a4c940a52143b675b4ee7c79c0c17d75f18c2 [file] [log] [blame]
Patrick Rohr776c40c2022-01-12 21:05:26 +01001/*
2 * Copyright (C) 2022 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "TcUtils"
18
19#include "tcutils/tcutils.h"
20
Patrick Rohr28c717a2022-01-19 14:51:35 +010021#include "logging.h"
Patrick Rohre7f26e22022-01-12 22:13:12 +010022#include "kernelversion.h"
Patrick Rohr776c40c2022-01-12 21:05:26 +010023#include "scopeguard.h"
24
Patrick Rohr776c40c2022-01-12 21:05:26 +010025#include <arpa/inet.h>
26#include <cerrno>
Patrick Rohr776c40c2022-01-12 21:05:26 +010027#include <cstring>
28#include <libgen.h>
29#include <linux/if_arp.h>
30#include <linux/if_ether.h>
31#include <linux/netlink.h>
32#include <linux/pkt_cls.h>
33#include <linux/pkt_sched.h>
34#include <linux/rtnetlink.h>
Patrick Rohre815a742022-01-17 10:37:40 +010035#include <linux/tc_act/tc_bpf.h>
Patrick Rohr776c40c2022-01-12 21:05:26 +010036#include <net/if.h>
Patrick Rohr0c34e9a02022-01-17 13:59:09 +010037#include <stdio.h>
Patrick Rohr776c40c2022-01-12 21:05:26 +010038#include <sys/socket.h>
Patrick Rohr776c40c2022-01-12 21:05:26 +010039#include <unistd.h>
40#include <utility>
41
42#define BPF_FD_JUST_USE_INT
43#include <BpfSyscallWrappers.h>
44#undef BPF_FD_JUST_USE_INT
45
46// The maximum length of TCA_BPF_NAME. Sync from net/sched/cls_bpf.c.
47#define CLS_BPF_NAME_LEN 256
48
49// Classifier name. See cls_bpf_ops in net/sched/cls_bpf.c.
50#define CLS_BPF_KIND_NAME "bpf"
51
52namespace android {
53namespace {
54
Patrick Rohre815a742022-01-17 10:37:40 +010055/**
56 * IngressPoliceFilterBuilder builds a nlmsg request equivalent to the following
57 * tc command:
58 *
59 * tc filter add dev .. ingress prio .. protocol .. matchall \
60 * action police rate .. burst .. conform-exceed pipe/continue \
61 * action bpf object-pinned .. \
62 * drop
63 */
64class IngressPoliceFilterBuilder final {
65 // default mtu is 2047, so the cell logarithm factor (cell_log) is 3.
66 // 0x7FF >> 0x3FF x 2^1 >> 0x1FF x 2^2 >> 0xFF x 2^3
67 static constexpr int RTAB_CELL_LOGARITHM = 3;
68 static constexpr size_t RTAB_SIZE = 256;
69 static constexpr unsigned TIME_UNITS_PER_SEC = 1000000;
70
71 struct Request {
72 nlmsghdr n;
73 tcmsg t;
74 struct {
75 nlattr attr;
76 char str[NLMSG_ALIGN(sizeof("matchall"))];
77 } kind;
78 struct {
79 nlattr attr;
80 struct {
81 nlattr attr;
82 struct {
83 nlattr attr;
84 struct {
85 nlattr attr;
86 char str[NLMSG_ALIGN(sizeof("police"))];
87 } kind;
88 struct {
89 nlattr attr;
90 struct {
91 nlattr attr;
92 struct tc_police obj;
93 } police;
94 struct {
95 nlattr attr;
96 uint32_t u32[RTAB_SIZE];
97 } rtab;
98 struct {
99 nlattr attr;
100 int32_t s32;
101 } notexceedact;
102 } opt;
103 } act1;
104 struct {
105 nlattr attr;
106 struct {
107 nlattr attr;
108 char str[NLMSG_ALIGN(sizeof("bpf"))];
109 } kind;
110 struct {
111 nlattr attr;
112 struct {
113 nlattr attr;
114 uint32_t u32;
115 } fd;
116 struct {
117 nlattr attr;
118 char str[NLMSG_ALIGN(CLS_BPF_NAME_LEN)];
119 } name;
120 struct {
121 nlattr attr;
122 struct tc_act_bpf obj;
123 } parms;
124 } opt;
125 } act2;
126 } acts;
127 } opt;
128 };
129
130 // class members
131 const unsigned mBurstInBytes;
132 const char *mBpfProgPath;
133 int mBpfFd;
134 Request mRequest;
135
136 static double getTickInUsec() {
137 FILE *fp = fopen("/proc/net/psched", "re");
138 if (!fp) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100139 ALOGE("fopen(\"/proc/net/psched\"): %s", strerror(errno));
Patrick Rohre815a742022-01-17 10:37:40 +0100140 return 0.0;
141 }
142 auto scopeGuard = base::make_scope_guard([fp] { fclose(fp); });
143
144 uint32_t t2us;
145 uint32_t us2t;
146 uint32_t clockRes;
147 const bool isError =
148 fscanf(fp, "%08x%08x%08x", &t2us, &us2t, &clockRes) != 3;
149
150 if (isError) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100151 ALOGE("fscanf(/proc/net/psched, \"%%08x%%08x%%08x\"): %s",
Patrick Rohre815a742022-01-17 10:37:40 +0100152 strerror(errno));
153 return 0.0;
154 }
155
156 const double clockFactor =
157 static_cast<double>(clockRes) / TIME_UNITS_PER_SEC;
158 return static_cast<double>(t2us) / static_cast<double>(us2t) * clockFactor;
159 }
160
161 static inline const double kTickInUsec = getTickInUsec();
162
163public:
164 // clang-format off
165 IngressPoliceFilterBuilder(int ifIndex, uint16_t prio, uint16_t proto, unsigned rateInBytesPerSec,
166 unsigned burstInBytes, const char* bpfProgPath)
167 : mBurstInBytes(burstInBytes),
168 mBpfProgPath(bpfProgPath),
169 mBpfFd(-1),
170 mRequest{
171 .n = {
172 .nlmsg_len = sizeof(mRequest),
173 .nlmsg_type = RTM_NEWTFILTER,
174 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE,
175 },
176 .t = {
177 .tcm_family = AF_UNSPEC,
178 .tcm_ifindex = ifIndex,
179 .tcm_handle = TC_H_UNSPEC,
180 .tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS),
181 .tcm_info = (static_cast<uint32_t>(prio) << 16)
182 | static_cast<uint32_t>(htons(proto)),
183 },
184 .kind = {
185 .attr = {
186 .nla_len = sizeof(mRequest.kind),
187 .nla_type = TCA_KIND,
188 },
189 .str = "matchall",
190 },
191 .opt = {
192 .attr = {
193 .nla_len = sizeof(mRequest.opt),
194 .nla_type = TCA_OPTIONS,
195 },
196 .acts = {
197 .attr = {
198 .nla_len = sizeof(mRequest.opt.acts),
Patrick Rohr20dcaf52022-02-11 13:14:45 +0100199 .nla_type = TCA_MATCHALL_ACT,
Patrick Rohre815a742022-01-17 10:37:40 +0100200 },
201 .act1 = {
202 .attr = {
203 .nla_len = sizeof(mRequest.opt.acts.act1),
204 .nla_type = 1, // action priority
205 },
206 .kind = {
207 .attr = {
208 .nla_len = sizeof(mRequest.opt.acts.act1.kind),
209 .nla_type = TCA_ACT_KIND,
210 },
211 .str = "police",
212 },
213 .opt = {
214 .attr = {
215 .nla_len = sizeof(mRequest.opt.acts.act1.opt),
216 .nla_type = TCA_ACT_OPTIONS | NLA_F_NESTED,
217 },
218 .police = {
219 .attr = {
220 .nla_len = sizeof(mRequest.opt.acts.act1.opt.police),
221 .nla_type = TCA_POLICE_TBF,
222 },
223 .obj = {
224 .action = TC_ACT_PIPE,
225 .burst = 0,
226 .rate = {
227 .cell_log = RTAB_CELL_LOGARITHM,
228 .linklayer = TC_LINKLAYER_ETHERNET,
229 .cell_align = -1,
230 .rate = rateInBytesPerSec,
231 },
232 },
233 },
234 .rtab = {
235 .attr = {
236 .nla_len = sizeof(mRequest.opt.acts.act1.opt.rtab),
237 .nla_type = TCA_POLICE_RATE,
238 },
239 .u32 = {},
240 },
241 .notexceedact = {
242 .attr = {
243 .nla_len = sizeof(mRequest.opt.acts.act1.opt.notexceedact),
244 .nla_type = TCA_POLICE_RESULT,
245 },
246 .s32 = TC_ACT_UNSPEC,
247 },
248 },
249 },
250 .act2 = {
251 .attr = {
252 .nla_len = sizeof(mRequest.opt.acts.act2),
253 .nla_type = 2, // action priority
254 },
255 .kind = {
256 .attr = {
257 .nla_len = sizeof(mRequest.opt.acts.act2.kind),
258 .nla_type = TCA_ACT_KIND,
259 },
260 .str = "bpf",
261 },
262 .opt = {
263 .attr = {
264 .nla_len = sizeof(mRequest.opt.acts.act2.opt),
265 .nla_type = TCA_ACT_OPTIONS | NLA_F_NESTED,
266 },
267 .fd = {
268 .attr = {
269 .nla_len = sizeof(mRequest.opt.acts.act2.opt.fd),
270 .nla_type = TCA_ACT_BPF_FD,
271 },
272 .u32 = 0, // set during build()
273 },
274 .name = {
275 .attr = {
276 .nla_len = sizeof(mRequest.opt.acts.act2.opt.name),
277 .nla_type = TCA_ACT_BPF_NAME,
278 },
279 .str = "placeholder",
280 },
281 .parms = {
282 .attr = {
283 .nla_len = sizeof(mRequest.opt.acts.act2.opt.parms),
284 .nla_type = TCA_ACT_BPF_PARMS,
285 },
286 .obj = {
287 // default action to be executed when bpf prog
288 // returns TC_ACT_UNSPEC.
289 .action = TC_ACT_SHOT,
290 },
291 },
292 },
293 },
294 },
295 },
296 } {
297 // constructor body
298 }
299 // clang-format on
300
301 ~IngressPoliceFilterBuilder() {
302 // TODO: use unique_fd
303 if (mBpfFd != -1) {
304 close(mBpfFd);
305 }
306 }
307
308 constexpr unsigned getRequestSize() const { return sizeof(Request); }
309
310private:
311 unsigned calculateXmitTime(unsigned size) {
312 const uint32_t rate = mRequest.opt.acts.act1.opt.police.obj.rate.rate;
313 return (static_cast<double>(size) / static_cast<double>(rate)) *
314 TIME_UNITS_PER_SEC * kTickInUsec;
315 }
316
317 void initBurstRate() {
318 mRequest.opt.acts.act1.opt.police.obj.burst =
319 calculateXmitTime(mBurstInBytes);
320 }
321
322 // Calculates a table with 256 transmission times for different packet sizes
323 // (all the way up to MTU). RTAB_CELL_LOGARITHM is used as a scaling factor.
324 // In this case, MTU size is always 2048, so RTAB_CELL_LOGARITHM is always
325 // 3. Therefore, this function generates the transmission times for packets
326 // of size 1..256 x 2^3.
327 void initRateTable() {
328 for (unsigned i = 0; i < RTAB_SIZE; ++i) {
329 unsigned adjustedSize = (i + 1) << RTAB_CELL_LOGARITHM;
330 mRequest.opt.acts.act1.opt.rtab.u32[i] = calculateXmitTime(adjustedSize);
331 }
332 }
333
334 int initBpfFd() {
335 mBpfFd = bpf::retrieveProgram(mBpfProgPath);
336 if (mBpfFd == -1) {
337 int error = errno;
Patrick Rohr28c717a2022-01-19 14:51:35 +0100338 ALOGE("retrieveProgram failed: %d", error);
Patrick Rohre815a742022-01-17 10:37:40 +0100339 return -error;
340 }
341
342 mRequest.opt.acts.act2.opt.fd.u32 = static_cast<uint32_t>(mBpfFd);
343 snprintf(mRequest.opt.acts.act2.opt.name.str,
344 sizeof(mRequest.opt.acts.act2.opt.name.str), "%s:[*fsobj]",
345 basename(mBpfProgPath));
346
347 return 0;
348 }
349
350public:
351 int build() {
352 if (kTickInUsec == 0.0) {
353 return -EINVAL;
354 }
355
356 initBurstRate();
357 initRateTable();
358 return initBpfFd();
359 }
360
361 const Request *getRequest() const {
362 // Make sure to call build() before calling this function. Otherwise, the
363 // request will be invalid.
364 return &mRequest;
365 }
366};
367
// Netlink address of the kernel: port id 0 and no multicast groups.
const sockaddr_nl KERNEL_NLADDR = {
    .nl_family = AF_NETLINK,
    .nl_pad = 0,
    .nl_pid = 0,
    .nl_groups = 0,
};

// Flags shared by every request in this file: it is a request and the kernel
// must acknowledge it.
const uint16_t NETLINK_REQUEST_FLAGS = NLM_F_REQUEST | NLM_F_ACK;
370
371int sendAndProcessNetlinkResponse(const void *req, int len) {
372 // TODO: use unique_fd instead of ScopeGuard
373 int fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
374 if (fd == -1) {
375 int error = errno;
Patrick Rohr28c717a2022-01-19 14:51:35 +0100376 ALOGE("socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE): %d",
Patrick Rohr776c40c2022-01-12 21:05:26 +0100377 error);
378 return -error;
379 }
380 auto scopeGuard = base::make_scope_guard([fd] { close(fd); });
381
382 static constexpr int on = 1;
383 if (setsockopt(fd, SOL_NETLINK, NETLINK_CAP_ACK, &on, sizeof(on))) {
384 int error = errno;
Patrick Rohr28c717a2022-01-19 14:51:35 +0100385 ALOGE("setsockopt(fd, SOL_NETLINK, NETLINK_CAP_ACK, 1): %d", error);
Patrick Rohr776c40c2022-01-12 21:05:26 +0100386 return -error;
387 }
388
389 // this is needed to get valid strace netlink parsing, it allocates the pid
390 if (bind(fd, (const struct sockaddr *)&KERNEL_NLADDR,
391 sizeof(KERNEL_NLADDR))) {
392 int error = errno;
Patrick Rohr28c717a2022-01-19 14:51:35 +0100393 ALOGE("bind(fd, {AF_NETLINK, 0, 0}: %d)", error);
Patrick Rohr776c40c2022-01-12 21:05:26 +0100394 return -error;
395 }
396
397 // we do not want to receive messages from anyone besides the kernel
398 if (connect(fd, (const struct sockaddr *)&KERNEL_NLADDR,
399 sizeof(KERNEL_NLADDR))) {
400 int error = errno;
Patrick Rohr28c717a2022-01-19 14:51:35 +0100401 ALOGE("connect(fd, {AF_NETLINK, 0, 0}): %d", error);
Patrick Rohr776c40c2022-01-12 21:05:26 +0100402 return -error;
403 }
404
405 int rv = send(fd, req, len, 0);
406
407 if (rv == -1) {
408 int error = errno;
Patrick Rohr28c717a2022-01-19 14:51:35 +0100409 ALOGE("send(fd, req, len, 0) failed: %d", error);
Patrick Rohr776c40c2022-01-12 21:05:26 +0100410 return -error;
411 }
412
413 if (rv != len) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100414 ALOGE("send(fd, req, len = %d, 0) returned invalid message size %d", len,
Patrick Rohr776c40c2022-01-12 21:05:26 +0100415 rv);
416 return -EMSGSIZE;
417 }
418
419 struct {
420 nlmsghdr h;
421 nlmsgerr e;
422 char buf[256];
423 } resp = {};
424
425 rv = recv(fd, &resp, sizeof(resp), MSG_TRUNC);
426
427 if (rv == -1) {
428 int error = errno;
Patrick Rohr28c717a2022-01-19 14:51:35 +0100429 ALOGE("recv() failed: %d", error);
Patrick Rohr776c40c2022-01-12 21:05:26 +0100430 return -error;
431 }
432
433 if (rv < (int)NLMSG_SPACE(sizeof(struct nlmsgerr))) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100434 ALOGE("recv() returned short packet: %d", rv);
Patrick Rohr776c40c2022-01-12 21:05:26 +0100435 return -EBADMSG;
436 }
437
438 if (resp.h.nlmsg_len != (unsigned)rv) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100439 ALOGE("recv() returned invalid header length: %d != %d",
Patrick Rohr776c40c2022-01-12 21:05:26 +0100440 resp.h.nlmsg_len, rv);
441 return -EBADMSG;
442 }
443
444 if (resp.h.nlmsg_type != NLMSG_ERROR) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100445 ALOGE("recv() did not return NLMSG_ERROR message: %d",
Patrick Rohr776c40c2022-01-12 21:05:26 +0100446 resp.h.nlmsg_type);
447 return -ENOMSG;
448 }
449
450 if (resp.e.error) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100451 ALOGE("NLMSG_ERROR message return error: %d", resp.e.error);
Patrick Rohr776c40c2022-01-12 21:05:26 +0100452 }
453 return resp.e.error; // returns 0 on success
454}
455
456int hardwareAddressType(const char *interface) {
457 int fd = socket(AF_INET6, SOCK_DGRAM | SOCK_CLOEXEC, 0);
458 if (fd < 0)
459 return -errno;
460 auto scopeGuard = base::make_scope_guard([fd] { close(fd); });
461
462 struct ifreq ifr = {};
463 // We use strncpy() instead of strlcpy() since kernel has to be able
464 // to handle non-zero terminated junk passed in by userspace anyway,
465 // and this way too long interface names (more than IFNAMSIZ-1 = 15
466 // characters plus terminating NULL) will not get truncated to 15
467 // characters and zero-terminated and thus potentially erroneously
468 // match a truncated interface if one were to exist.
469 strncpy(ifr.ifr_name, interface, sizeof(ifr.ifr_name));
470
471 if (ioctl(fd, SIOCGIFHWADDR, &ifr, sizeof(ifr))) {
472 return -errno;
473 }
474 return ifr.ifr_hwaddr.sa_family;
475}
476
Patrick Rohr776c40c2022-01-12 21:05:26 +0100477} // namespace
478
479int isEthernet(const char *iface, bool &isEthernet) {
480 int rv = hardwareAddressType(iface);
481 if (rv < 0) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100482 ALOGE("Get hardware address type of interface %s failed: %s", iface,
Patrick Rohr776c40c2022-01-12 21:05:26 +0100483 strerror(-rv));
Patrick Rohr27846ff2022-01-17 12:22:51 +0100484 return rv;
Patrick Rohr776c40c2022-01-12 21:05:26 +0100485 }
486
487 // Backwards compatibility with pre-GKI kernels that use various custom
488 // ARPHRD_* for their cellular interface
489 switch (rv) {
490 // ARPHRD_PUREIP on at least some Mediatek Android kernels
491 // example: wembley with 4.19 kernel
492 case 520:
493 // in Linux 4.14+ rmnet support was upstreamed and ARHRD_RAWIP became 519,
494 // but it is 530 on at least some Qualcomm Android 4.9 kernels with rmnet
495 // example: Pixel 3 family
496 case 530:
497 // >5.4 kernels are GKI2.0 and thus upstream compatible, however 5.10
498 // shipped with Android S, so (for safety) let's limit ourselves to
499 // >5.10, ie. 5.11+ as a guarantee we're on Android T+ and thus no
500 // longer need this non-upstream compatibility logic
501 static bool is_pre_5_11_kernel = !isAtLeastKernelVersion(5, 11, 0);
502 if (is_pre_5_11_kernel)
503 return false;
504 }
505
506 switch (rv) {
507 case ARPHRD_ETHER:
508 isEthernet = true;
509 return 0;
510 case ARPHRD_NONE:
511 case ARPHRD_PPP:
512 case ARPHRD_RAWIP:
513 isEthernet = false;
514 return 0;
515 default:
Patrick Rohr28c717a2022-01-19 14:51:35 +0100516 ALOGE("Unknown hardware address type %d on interface %s", rv, iface);
Patrick Rohr27846ff2022-01-17 12:22:51 +0100517 return -EAFNOSUPPORT;
Patrick Rohr776c40c2022-01-12 21:05:26 +0100518 }
519}
520
Patrick Rohr42b58ae2022-01-17 13:09:12 +0100521// ADD: nlMsgType=RTM_NEWQDISC nlMsgFlags=NLM_F_EXCL|NLM_F_CREATE
522// REPLACE: nlMsgType=RTM_NEWQDISC nlMsgFlags=NLM_F_CREATE|NLM_F_REPLACE
523// DEL: nlMsgType=RTM_DELQDISC nlMsgFlags=0
524int doTcQdiscClsact(int ifIndex, uint16_t nlMsgType, uint16_t nlMsgFlags) {
525 // This is the name of the qdisc we are attaching.
526 // Some hoop jumping to make this compile time constant with known size,
527 // so that the structure declaration is well defined at compile time.
528#define CLSACT "clsact"
529 // sizeof() includes the terminating NULL
530 static constexpr size_t ASCIIZ_LEN_CLSACT = sizeof(CLSACT);
531
532 const struct {
533 nlmsghdr n;
534 tcmsg t;
535 struct {
536 nlattr attr;
537 char str[NLMSG_ALIGN(ASCIIZ_LEN_CLSACT)];
538 } kind;
539 } req = {
540 .n =
541 {
542 .nlmsg_len = sizeof(req),
543 .nlmsg_type = nlMsgType,
544 .nlmsg_flags =
545 static_cast<__u16>(NETLINK_REQUEST_FLAGS | nlMsgFlags),
546 },
547 .t =
548 {
549 .tcm_family = AF_UNSPEC,
550 .tcm_ifindex = ifIndex,
551 .tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0),
552 .tcm_parent = TC_H_CLSACT,
553 },
554 .kind =
555 {
556 .attr =
557 {
558 .nla_len = NLA_HDRLEN + ASCIIZ_LEN_CLSACT,
559 .nla_type = TCA_KIND,
560 },
561 .str = CLSACT,
562 },
563 };
564#undef CLSACT
565
566 return sendAndProcessNetlinkResponse(&req, sizeof(req));
567}
568
Patrick Rohr776c40c2022-01-12 21:05:26 +0100569// tc filter add dev .. in/egress prio 1 protocol ipv6/ip bpf object-pinned
570// /sys/fs/bpf/... direct-action
571int tcAddBpfFilter(int ifIndex, bool ingress, uint16_t prio, uint16_t proto,
572 const char *bpfProgPath) {
573 const int bpfFd = bpf::retrieveProgram(bpfProgPath);
574 if (bpfFd == -1) {
Patrick Rohr28c717a2022-01-19 14:51:35 +0100575 ALOGE("retrieveProgram failed: %d", errno);
Patrick Rohr776c40c2022-01-12 21:05:26 +0100576 return -errno;
577 }
578 auto scopeGuard = base::make_scope_guard([bpfFd] { close(bpfFd); });
579
580 struct {
581 nlmsghdr n;
582 tcmsg t;
583 struct {
584 nlattr attr;
585 // The maximum classifier name length is defined in
586 // tcf_proto_ops in include/net/sch_generic.h.
587 char str[NLMSG_ALIGN(sizeof(CLS_BPF_KIND_NAME))];
588 } kind;
589 struct {
590 nlattr attr;
591 struct {
592 nlattr attr;
593 __u32 u32;
594 } fd;
595 struct {
596 nlattr attr;
597 char str[NLMSG_ALIGN(CLS_BPF_NAME_LEN)];
598 } name;
599 struct {
600 nlattr attr;
601 __u32 u32;
602 } flags;
603 } options;
604 } req = {
605 .n =
606 {
607 .nlmsg_len = sizeof(req),
608 .nlmsg_type = RTM_NEWTFILTER,
609 .nlmsg_flags = NETLINK_REQUEST_FLAGS | NLM_F_EXCL | NLM_F_CREATE,
610 },
611 .t =
612 {
613 .tcm_family = AF_UNSPEC,
614 .tcm_ifindex = ifIndex,
615 .tcm_handle = TC_H_UNSPEC,
616 .tcm_parent = TC_H_MAKE(TC_H_CLSACT, ingress ? TC_H_MIN_INGRESS
617 : TC_H_MIN_EGRESS),
618 .tcm_info =
619 static_cast<__u32>((static_cast<uint16_t>(prio) << 16) |
620 htons(static_cast<uint16_t>(proto))),
621 },
622 .kind =
623 {
624 .attr =
625 {
626 .nla_len = sizeof(req.kind),
627 .nla_type = TCA_KIND,
628 },
629 .str = CLS_BPF_KIND_NAME,
630 },
631 .options =
632 {
633 .attr =
634 {
635 .nla_len = sizeof(req.options),
636 .nla_type = NLA_F_NESTED | TCA_OPTIONS,
637 },
638 .fd =
639 {
640 .attr =
641 {
642 .nla_len = sizeof(req.options.fd),
643 .nla_type = TCA_BPF_FD,
644 },
645 .u32 = static_cast<__u32>(bpfFd),
646 },
647 .name =
648 {
649 .attr =
650 {
651 .nla_len = sizeof(req.options.name),
652 .nla_type = TCA_BPF_NAME,
653 },
654 // Visible via 'tc filter show', but
655 // is overwritten by strncpy below
656 .str = "placeholder",
657 },
658 .flags =
659 {
660 .attr =
661 {
662 .nla_len = sizeof(req.options.flags),
663 .nla_type = TCA_BPF_FLAGS,
664 },
665 .u32 = TCA_BPF_FLAG_ACT_DIRECT,
666 },
667 },
668 };
669
670 snprintf(req.options.name.str, sizeof(req.options.name.str), "%s:[*fsobj]",
671 basename(bpfProgPath));
672
673 int error = sendAndProcessNetlinkResponse(&req, sizeof(req));
674 return error;
675}
676
Patrick Rohre815a742022-01-17 10:37:40 +0100677// tc filter add dev .. ingress prio .. protocol .. matchall \
678// action police rate .. burst .. conform-exceed pipe/continue \
679// action bpf object-pinned .. \
680// drop
681//
682// TODO: tc-police does not do ECN marking, so in the future, we should consider
683// adding a second tc-police filter at a lower priority that rate limits traffic
684// at something like 0.8 times the global rate limit and ecn marks exceeding
685// packets inside a bpf program (but does not drop them).
686int tcAddIngressPoliceFilter(int ifIndex, uint16_t prio, uint16_t proto,
687 unsigned rateInBytesPerSec,
688 const char *bpfProgPath) {
689 // TODO: this value needs to be validated.
690 // TCP IW10 (initial congestion window) means servers will send 10 mtus worth
691 // of data on initial connect.
692 // If nic is LRO capable it could aggregate up to 64KiB, so again probably a
693 // bad idea to set burst below that, because ingress packets could get
694 // aggregated to 64KiB at the nic.
695 // I don't know, but I wonder whether we shouldn't just do 128KiB and not do
696 // any math.
697 static constexpr unsigned BURST_SIZE_IN_BYTES = 128 * 1024; // 128KiB
698 IngressPoliceFilterBuilder filter(ifIndex, prio, proto, rateInBytesPerSec,
699 BURST_SIZE_IN_BYTES, bpfProgPath);
700 const int error = filter.build();
701 if (error) {
702 return error;
703 }
704 return sendAndProcessNetlinkResponse(filter.getRequest(),
705 filter.getRequestSize());
706}
707
Patrick Rohr776c40c2022-01-12 21:05:26 +0100708// tc filter del dev .. in/egress prio .. protocol ..
709int tcDeleteFilter(int ifIndex, bool ingress, uint16_t prio, uint16_t proto) {
710 const struct {
711 nlmsghdr n;
712 tcmsg t;
713 } req = {
714 .n =
715 {
716 .nlmsg_len = sizeof(req),
717 .nlmsg_type = RTM_DELTFILTER,
718 .nlmsg_flags = NETLINK_REQUEST_FLAGS,
719 },
720 .t =
721 {
722 .tcm_family = AF_UNSPEC,
723 .tcm_ifindex = ifIndex,
724 .tcm_handle = TC_H_UNSPEC,
725 .tcm_parent = TC_H_MAKE(TC_H_CLSACT, ingress ? TC_H_MIN_INGRESS
726 : TC_H_MIN_EGRESS),
727 .tcm_info =
728 static_cast<__u32>((static_cast<uint16_t>(prio) << 16) |
729 htons(static_cast<uint16_t>(proto))),
730 },
731 };
732
733 return sendAndProcessNetlinkResponse(&req, sizeof(req));
734}
735
736} // namespace android