Add functionality to attach tc-police action to ingress qdisc
This is used for go/bandwidth-limiting.
Bug: 157552970
Test: atest LibTcUtilsTest
Change-Id: Ic8e4d8dc016b14a6d4fc8ddbb3949941e9ef95af
diff --git a/staticlibs/native/tcutils/tcutils.cpp b/staticlibs/native/tcutils/tcutils.cpp
index 9b8c843..ad9d9e4 100644
--- a/staticlibs/native/tcutils/tcutils.cpp
+++ b/staticlibs/native/tcutils/tcutils.cpp
@@ -32,6 +32,7 @@
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_bpf.h>
#include <net/if.h>
#include <stdarg.h>
#include <stdio.h>
@@ -59,6 +60,319 @@
va_end(args);
}
+/**
+ * IngressPoliceFilterBuilder builds a nlmsg request equivalent to the following
+ * tc command:
+ *
+ * tc filter add dev .. ingress prio .. protocol .. matchall \
+ * action police rate .. burst .. conform-exceed pipe/continue \
+ * action bpf object-pinned .. \
+ * drop
+ */
+class IngressPoliceFilterBuilder final {
+ // default mtu is 2047, so the cell logarithm factor (cell_log) is 3.
+ // 0x7FF >> 0x3FF x 2^1 >> 0x1FF x 2^2 >> 0xFF x 2^3
+ static constexpr int RTAB_CELL_LOGARITHM = 3;
+ static constexpr size_t RTAB_SIZE = 256;
+ static constexpr unsigned TIME_UNITS_PER_SEC = 1000000;
+
+ struct Request {
+ nlmsghdr n;
+ tcmsg t;
+ struct {
+ nlattr attr;
+ char str[NLMSG_ALIGN(sizeof("matchall"))];
+ } kind;
+ struct {
+ nlattr attr;
+ struct {
+ nlattr attr;
+ struct {
+ nlattr attr;
+ struct {
+ nlattr attr;
+ char str[NLMSG_ALIGN(sizeof("police"))];
+ } kind;
+ struct {
+ nlattr attr;
+ struct {
+ nlattr attr;
+ struct tc_police obj;
+ } police;
+ struct {
+ nlattr attr;
+ uint32_t u32[RTAB_SIZE];
+ } rtab;
+ struct {
+ nlattr attr;
+ int32_t s32;
+ } notexceedact;
+ } opt;
+ } act1;
+ struct {
+ nlattr attr;
+ struct {
+ nlattr attr;
+ char str[NLMSG_ALIGN(sizeof("bpf"))];
+ } kind;
+ struct {
+ nlattr attr;
+ struct {
+ nlattr attr;
+ uint32_t u32;
+ } fd;
+ struct {
+ nlattr attr;
+ char str[NLMSG_ALIGN(CLS_BPF_NAME_LEN)];
+ } name;
+ struct {
+ nlattr attr;
+ struct tc_act_bpf obj;
+ } parms;
+ } opt;
+ } act2;
+ } acts;
+ } opt;
+ };
+
+ // class members
+ const unsigned mBurstInBytes;
+ const char *mBpfProgPath;
+ int mBpfFd;
+ Request mRequest;
+
+ static double getTickInUsec() {
+ FILE *fp = fopen("/proc/net/psched", "re");
+ if (!fp) {
+ logError("fopen(\"/proc/net/psched\"): %s", strerror(errno));
+ return 0.0;
+ }
+ auto scopeGuard = base::make_scope_guard([fp] { fclose(fp); });
+
+ uint32_t t2us;
+ uint32_t us2t;
+ uint32_t clockRes;
+ const bool isError =
+ fscanf(fp, "%08x%08x%08x", &t2us, &us2t, &clockRes) != 3;
+
+ if (isError) {
+ logError("fscanf(/proc/net/psched, \"%%08x%%08x%%08x\"): %s",
+ strerror(errno));
+ return 0.0;
+ }
+
+ const double clockFactor =
+ static_cast<double>(clockRes) / TIME_UNITS_PER_SEC;
+ return static_cast<double>(t2us) / static_cast<double>(us2t) * clockFactor;
+ }
+
+ static inline const double kTickInUsec = getTickInUsec();
+
+public:
+ // clang-format off
+ IngressPoliceFilterBuilder(int ifIndex, uint16_t prio, uint16_t proto, unsigned rateInBytesPerSec,
+ unsigned burstInBytes, const char* bpfProgPath)
+ : mBurstInBytes(burstInBytes),
+ mBpfProgPath(bpfProgPath),
+ mBpfFd(-1),
+ mRequest{
+ .n = {
+ .nlmsg_len = sizeof(mRequest),
+ .nlmsg_type = RTM_NEWTFILTER,
+ .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE,
+ },
+ .t = {
+ .tcm_family = AF_UNSPEC,
+ .tcm_ifindex = ifIndex,
+ .tcm_handle = TC_H_UNSPEC,
+ .tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS),
+ .tcm_info = (static_cast<uint32_t>(prio) << 16)
+ | static_cast<uint32_t>(htons(proto)),
+ },
+ .kind = {
+ .attr = {
+ .nla_len = sizeof(mRequest.kind),
+ .nla_type = TCA_KIND,
+ },
+ .str = "matchall",
+ },
+ .opt = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt),
+ .nla_type = TCA_OPTIONS,
+ },
+ .acts = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts),
+ .nla_type = TCA_U32_ACT,
+ },
+ .act1 = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act1),
+ .nla_type = 1, // action priority
+ },
+ .kind = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act1.kind),
+ .nla_type = TCA_ACT_KIND,
+ },
+ .str = "police",
+ },
+ .opt = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act1.opt),
+ .nla_type = TCA_ACT_OPTIONS | NLA_F_NESTED,
+ },
+ .police = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act1.opt.police),
+ .nla_type = TCA_POLICE_TBF,
+ },
+ .obj = {
+ .action = TC_ACT_PIPE,
+ .burst = 0,
+ .rate = {
+ .cell_log = RTAB_CELL_LOGARITHM,
+ .linklayer = TC_LINKLAYER_ETHERNET,
+ .cell_align = -1,
+ .rate = rateInBytesPerSec,
+ },
+ },
+ },
+ .rtab = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act1.opt.rtab),
+ .nla_type = TCA_POLICE_RATE,
+ },
+ .u32 = {},
+ },
+ .notexceedact = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act1.opt.notexceedact),
+ .nla_type = TCA_POLICE_RESULT,
+ },
+ .s32 = TC_ACT_UNSPEC,
+ },
+ },
+ },
+ .act2 = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act2),
+ .nla_type = 2, // action priority
+ },
+ .kind = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act2.kind),
+ .nla_type = TCA_ACT_KIND,
+ },
+ .str = "bpf",
+ },
+ .opt = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act2.opt),
+ .nla_type = TCA_ACT_OPTIONS | NLA_F_NESTED,
+ },
+ .fd = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act2.opt.fd),
+ .nla_type = TCA_ACT_BPF_FD,
+ },
+ .u32 = 0, // set during build()
+ },
+ .name = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act2.opt.name),
+ .nla_type = TCA_ACT_BPF_NAME,
+ },
+ .str = "placeholder",
+ },
+ .parms = {
+ .attr = {
+ .nla_len = sizeof(mRequest.opt.acts.act2.opt.parms),
+ .nla_type = TCA_ACT_BPF_PARMS,
+ },
+ .obj = {
+ // default action to be executed when bpf prog
+ // returns TC_ACT_UNSPEC.
+ .action = TC_ACT_SHOT,
+ },
+ },
+ },
+ },
+ },
+ },
+ } {
+ // constructor body
+ }
+ // clang-format on
+
+ ~IngressPoliceFilterBuilder() {
+ // TODO: use unique_fd
+ if (mBpfFd != -1) {
+ close(mBpfFd);
+ }
+ }
+
+ constexpr unsigned getRequestSize() const { return sizeof(Request); }
+
+private:
+ unsigned calculateXmitTime(unsigned size) {
+ const uint32_t rate = mRequest.opt.acts.act1.opt.police.obj.rate.rate;
+ return (static_cast<double>(size) / static_cast<double>(rate)) *
+ TIME_UNITS_PER_SEC * kTickInUsec;
+ }
+
+ void initBurstRate() {
+ mRequest.opt.acts.act1.opt.police.obj.burst =
+ calculateXmitTime(mBurstInBytes);
+ }
+
+ // Calculates a table with 256 transmission times for different packet sizes
+ // (all the way up to MTU). RTAB_CELL_LOGARITHM is used as a scaling factor.
+ // In this case, MTU size is always 2048, so RTAB_CELL_LOGARITHM is always
+ // 3. Therefore, this function generates the transmission times for packets
+ // of size 1..256 x 2^3.
+ void initRateTable() {
+ for (unsigned i = 0; i < RTAB_SIZE; ++i) {
+ unsigned adjustedSize = (i + 1) << RTAB_CELL_LOGARITHM;
+ mRequest.opt.acts.act1.opt.rtab.u32[i] = calculateXmitTime(adjustedSize);
+ }
+ }
+
+ int initBpfFd() {
+ mBpfFd = bpf::retrieveProgram(mBpfProgPath);
+ if (mBpfFd == -1) {
+ int error = errno;
+ logError("retrieveProgram failed: %d", error);
+ return -error;
+ }
+
+ mRequest.opt.acts.act2.opt.fd.u32 = static_cast<uint32_t>(mBpfFd);
+ snprintf(mRequest.opt.acts.act2.opt.name.str,
+ sizeof(mRequest.opt.acts.act2.opt.name.str), "%s:[*fsobj]",
+ basename(mBpfProgPath));
+
+ return 0;
+ }
+
+public:
+ int build() {
+ if (kTickInUsec == 0.0) {
+ return -EINVAL;
+ }
+
+ initBurstRate();
+ initRateTable();
+ return initBpfFd();
+ }
+
+ const Request *getRequest() const {
+ // Make sure to call build() before calling this function. Otherwise, the
+ // request will be invalid.
+ return &mRequest;
+ }
+};
+
const sockaddr_nl KERNEL_NLADDR = {AF_NETLINK, 0, 0, 0};
const uint16_t NETLINK_REQUEST_FLAGS = NLM_F_REQUEST | NLM_F_ACK;
@@ -368,6 +682,37 @@
return error;
}
+// tc filter add dev .. ingress prio .. protocol .. matchall \
+// action police rate .. burst .. conform-exceed pipe/continue \
+// action bpf object-pinned .. \
+// drop
+//
+// TODO: tc-police does not do ECN marking, so in the future, we should consider
+// adding a second tc-police filter at a lower priority that rate limits traffic
+// at something like 0.8 times the global rate limit and ecn marks exceeding
+// packets inside a bpf program (but does not drop them).
+int tcAddIngressPoliceFilter(int ifIndex, uint16_t prio, uint16_t proto,
+ unsigned rateInBytesPerSec,
+ const char *bpfProgPath) {
+ // TODO: this value needs to be validated.
+ // TCP IW10 (initial congestion window) means servers will send 10 mtus worth
+ // of data on initial connect.
+ // If nic is LRO capable it could aggregate up to 64KiB, so again probably a
+ // bad idea to set burst below that, because ingress packets could get
+ // aggregated to 64KiB at the nic.
+ // I don't know, but I wonder whether we shouldn't just do 128KiB and not do
+ // any math.
+ static constexpr unsigned BURST_SIZE_IN_BYTES = 128 * 1024; // 128KiB
+ IngressPoliceFilterBuilder filter(ifIndex, prio, proto, rateInBytesPerSec,
+ BURST_SIZE_IN_BYTES, bpfProgPath);
+ const int error = filter.build();
+ if (error) {
+ return error;
+ }
+ return sendAndProcessNetlinkResponse(filter.getRequest(),
+ filter.getRequestSize());
+}
+
// tc filter del dev .. in/egress prio .. protocol ..
int tcDeleteFilter(int ifIndex, bool ingress, uint16_t prio, uint16_t proto) {
const struct {