Add functionality to attach tc-police action to ingress qdisc

This is used for go/bandwidth-limiting.

Bug: 157552970
Test: atest LibTcUtilsTest
Change-Id: Ic8e4d8dc016b14a6d4fc8ddbb3949941e9ef95af
diff --git a/staticlibs/native/tcutils/tcutils.cpp b/staticlibs/native/tcutils/tcutils.cpp
index 9b8c843..ad9d9e4 100644
--- a/staticlibs/native/tcutils/tcutils.cpp
+++ b/staticlibs/native/tcutils/tcutils.cpp
@@ -32,6 +32,7 @@
 #include <linux/pkt_cls.h>
 #include <linux/pkt_sched.h>
 #include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_bpf.h>
 #include <net/if.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -59,6 +60,319 @@
   va_end(args);
 }
 
+/**
+ * IngressPoliceFilterBuilder builds a nlmsg request equivalent to the following
+ * tc command:
+ *
+ * tc filter add dev .. ingress prio .. protocol .. matchall \
+ *     action police rate .. burst .. conform-exceed pipe/continue \
+ *     action bpf object-pinned .. \
+ *     drop
+ */
+class IngressPoliceFilterBuilder final {
+  // default mtu is 2047, so the cell logarithm factor (cell_log) is 3.
+  // 0x7FF >> 0x3FF x 2^1 >> 0x1FF x 2^2 >> 0xFF x 2^3
+  static constexpr int RTAB_CELL_LOGARITHM = 3;
+  static constexpr size_t RTAB_SIZE = 256;
+  static constexpr unsigned TIME_UNITS_PER_SEC = 1000000;
+
+  struct Request {
+    nlmsghdr n;
+    tcmsg t;
+    struct {
+      nlattr attr;
+      char str[NLMSG_ALIGN(sizeof("matchall"))];
+    } kind;
+    struct {
+      nlattr attr;
+      struct {
+        nlattr attr;
+        struct {
+          nlattr attr;
+          struct {
+            nlattr attr;
+            char str[NLMSG_ALIGN(sizeof("police"))];
+          } kind;
+          struct {
+            nlattr attr;
+            struct {
+              nlattr attr;
+              struct tc_police obj;
+            } police;
+            struct {
+              nlattr attr;
+              uint32_t u32[RTAB_SIZE];
+            } rtab;
+            struct {
+              nlattr attr;
+              int32_t s32;
+            } notexceedact;
+          } opt;
+        } act1;
+        struct {
+          nlattr attr;
+          struct {
+            nlattr attr;
+            char str[NLMSG_ALIGN(sizeof("bpf"))];
+          } kind;
+          struct {
+            nlattr attr;
+            struct {
+              nlattr attr;
+              uint32_t u32;
+            } fd;
+            struct {
+              nlattr attr;
+              char str[NLMSG_ALIGN(CLS_BPF_NAME_LEN)];
+            } name;
+            struct {
+              nlattr attr;
+              struct tc_act_bpf obj;
+            } parms;
+          } opt;
+        } act2;
+      } acts;
+    } opt;
+  };
+
+  // class members
+  const unsigned mBurstInBytes;
+  const char *mBpfProgPath;
+  int mBpfFd;
+  Request mRequest;
+
+  static double getTickInUsec() {
+    FILE *fp = fopen("/proc/net/psched", "re");
+    if (!fp) {
+      logError("fopen(\"/proc/net/psched\"): %s", strerror(errno));
+      return 0.0;
+    }
+    auto scopeGuard = base::make_scope_guard([fp] { fclose(fp); });
+
+    uint32_t t2us;
+    uint32_t us2t;
+    uint32_t clockRes;
+    const bool isError =
+        fscanf(fp, "%08x%08x%08x", &t2us, &us2t, &clockRes) != 3;
+
+    if (isError) {
+      logError("fscanf(/proc/net/psched, \"%%08x%%08x%%08x\"): %s",
+               strerror(errno));
+      return 0.0;
+    }
+
+    const double clockFactor =
+        static_cast<double>(clockRes) / TIME_UNITS_PER_SEC;
+    return static_cast<double>(t2us) / static_cast<double>(us2t) * clockFactor;
+  }
+
+  static inline const double kTickInUsec = getTickInUsec();
+
+public:
+  // clang-format off
+  IngressPoliceFilterBuilder(int ifIndex, uint16_t prio, uint16_t proto, unsigned rateInBytesPerSec,
+                      unsigned burstInBytes, const char* bpfProgPath)
+      : mBurstInBytes(burstInBytes),
+        mBpfProgPath(bpfProgPath),
+        mBpfFd(-1),
+        mRequest{
+            .n = {
+                .nlmsg_len = sizeof(mRequest),
+                .nlmsg_type = RTM_NEWTFILTER,
+                .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE,
+            },
+            .t = {
+                .tcm_family = AF_UNSPEC,
+                .tcm_ifindex = ifIndex,
+                .tcm_handle = TC_H_UNSPEC,
+                .tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS),
+                .tcm_info = (static_cast<uint32_t>(prio) << 16)
+                            | static_cast<uint32_t>(htons(proto)),
+            },
+            .kind = {
+                .attr = {
+                    .nla_len = sizeof(mRequest.kind),
+                    .nla_type = TCA_KIND,
+                },
+                .str = "matchall",
+            },
+            .opt = {
+                .attr = {
+                    .nla_len = sizeof(mRequest.opt),
+                    .nla_type = TCA_OPTIONS,
+                },
+                .acts = {
+                    .attr = {
+                        .nla_len = sizeof(mRequest.opt.acts),
+                        .nla_type = TCA_MATCHALL_ACT, // must match the "matchall" kind above
+                    },
+                    .act1 = {
+                        .attr = {
+                            .nla_len = sizeof(mRequest.opt.acts.act1),
+                            .nla_type = 1, // action priority
+                        },
+                        .kind = {
+                            .attr = {
+                                .nla_len = sizeof(mRequest.opt.acts.act1.kind),
+                                .nla_type = TCA_ACT_KIND,
+                            },
+                            .str = "police",
+                        },
+                        .opt = {
+                            .attr = {
+                                .nla_len = sizeof(mRequest.opt.acts.act1.opt),
+                                .nla_type = TCA_ACT_OPTIONS | NLA_F_NESTED,
+                            },
+                            .police = {
+                                .attr = {
+                                    .nla_len = sizeof(mRequest.opt.acts.act1.opt.police),
+                                    .nla_type = TCA_POLICE_TBF,
+                                },
+                                .obj = {
+                                    .action = TC_ACT_PIPE, // exceed action ("pipe")
+                                    .burst = 0,
+                                    .rate = {
+                                        .cell_log = RTAB_CELL_LOGARITHM,
+                                        .linklayer = TC_LINKLAYER_ETHERNET,
+                                        .cell_align = -1,
+                                        .rate = rateInBytesPerSec,
+                                    },
+                                },
+                            },
+                            .rtab = {
+                                .attr = {
+                                    .nla_len = sizeof(mRequest.opt.acts.act1.opt.rtab),
+                                    .nla_type = TCA_POLICE_RATE,
+                                },
+                                .u32 = {},
+                            },
+                            .notexceedact = {
+                                .attr = {
+                                    .nla_len = sizeof(mRequest.opt.acts.act1.opt.notexceedact),
+                                    .nla_type = TCA_POLICE_RESULT,
+                                },
+                                .s32 = TC_ACT_UNSPEC, // conform action ("continue")
+                            },
+                        },
+                    },
+                    .act2 = {
+                        .attr = {
+                            .nla_len = sizeof(mRequest.opt.acts.act2),
+                            .nla_type = 2, // action priority
+                        },
+                        .kind = {
+                            .attr = {
+                                .nla_len = sizeof(mRequest.opt.acts.act2.kind),
+                                .nla_type = TCA_ACT_KIND,
+                            },
+                            .str = "bpf",
+                        },
+                        .opt = {
+                            .attr = {
+                                .nla_len = sizeof(mRequest.opt.acts.act2.opt),
+                                .nla_type = TCA_ACT_OPTIONS | NLA_F_NESTED,
+                            },
+                            .fd = {
+                                .attr = {
+                                    .nla_len = sizeof(mRequest.opt.acts.act2.opt.fd),
+                                    .nla_type = TCA_ACT_BPF_FD,
+                                },
+                                .u32 = 0, // set during build()
+                            },
+                            .name = {
+                                .attr = {
+                                    .nla_len = sizeof(mRequest.opt.acts.act2.opt.name),
+                                    .nla_type = TCA_ACT_BPF_NAME,
+                                },
+                                .str = "placeholder",
+                            },
+                            .parms = {
+                                .attr = {
+                                    .nla_len = sizeof(mRequest.opt.acts.act2.opt.parms),
+                                    .nla_type = TCA_ACT_BPF_PARMS,
+                                },
+                                .obj = {
+                                    // default action to be executed when bpf prog
+                                    // returns TC_ACT_UNSPEC.
+                                    .action = TC_ACT_SHOT,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+        } {
+      // constructor body
+  }
+  // clang-format on
+
+  ~IngressPoliceFilterBuilder() {
+    // TODO: use unique_fd
+    if (mBpfFd != -1) {
+      close(mBpfFd);
+    }
+  }
+
+  constexpr unsigned getRequestSize() const { return sizeof(Request); }
+
+private:
+  unsigned calculateXmitTime(unsigned size) {
+    const uint32_t rate = mRequest.opt.acts.act1.opt.police.obj.rate.rate;
+    return (static_cast<double>(size) / static_cast<double>(rate)) *
+           TIME_UNITS_PER_SEC * kTickInUsec;
+  }
+
+  void initBurstRate() {
+    mRequest.opt.acts.act1.opt.police.obj.burst =
+        calculateXmitTime(mBurstInBytes);
+  }
+
+  // Calculates a table with 256 transmission times for different packet sizes
+  // (all the way up to MTU). RTAB_CELL_LOGARITHM is used as a scaling factor.
+  // In this case, MTU size is always 2047, so RTAB_CELL_LOGARITHM is always
+  // 3. Therefore, this function generates the transmission times for packets
+  // of size 1..256 x 2^3.
+  void initRateTable() {
+    for (unsigned i = 0; i < RTAB_SIZE; ++i) {
+      unsigned adjustedSize = (i + 1) << RTAB_CELL_LOGARITHM;
+      mRequest.opt.acts.act1.opt.rtab.u32[i] = calculateXmitTime(adjustedSize);
+    }
+  }
+
+  int initBpfFd() {
+    mBpfFd = bpf::retrieveProgram(mBpfProgPath);
+    if (mBpfFd == -1) {
+      int error = errno;
+      logError("retrieveProgram failed: %d", error);
+      return -error;
+    }
+
+    mRequest.opt.acts.act2.opt.fd.u32 = static_cast<uint32_t>(mBpfFd);
+    snprintf(mRequest.opt.acts.act2.opt.name.str,
+             sizeof(mRequest.opt.acts.act2.opt.name.str), "%s:[*fsobj]",
+             basename(mBpfProgPath));
+
+    return 0;
+  }
+
+public:
+  int build() {
+    if (kTickInUsec == 0.0) {
+      return -EINVAL;
+    }
+
+    initBurstRate();
+    initRateTable();
+    return initBpfFd();
+  }
+
+  const Request *getRequest() const {
+    // Make sure to call build() before calling this function. Otherwise, the
+    // request will be invalid.
+    return &mRequest;
+  }
+};
+
 const sockaddr_nl KERNEL_NLADDR = {AF_NETLINK, 0, 0, 0};
 const uint16_t NETLINK_REQUEST_FLAGS = NLM_F_REQUEST | NLM_F_ACK;
 
@@ -368,6 +682,37 @@
   return error;
 }
 
+// tc filter add dev .. ingress prio .. protocol .. matchall \
+//     action police rate .. burst .. conform-exceed pipe/continue \
+//     action bpf object-pinned .. \
+//     drop
+//
+// TODO: tc-police does not do ECN marking, so in the future, we should consider
+// adding a second tc-police filter at a lower priority that rate limits traffic
+// at something like 0.8 times the global rate limit and ecn marks exceeding
+// packets inside a bpf program (but does not drop them).
+int tcAddIngressPoliceFilter(int ifIndex, uint16_t prio, uint16_t proto,
+                             unsigned rateInBytesPerSec,
+                             const char *bpfProgPath) {
+  // Burst size handed to the police action; fixed at 128KiB for now.
+  // TODO: this value needs to be validated. Considerations so far:
+  //  - TCP IW10 (initial congestion window) means servers will send 10 mtus
+  //    worth of data on initial connect.
+  //  - An LRO capable nic could aggregate ingress packets up to 64KiB, so
+  //    setting the burst below that is probably a bad idea.
+  //  - Open question: just use 128KiB and skip any math?
+  constexpr unsigned kBurstBytes = 128 * 1024;
+
+  IngressPoliceFilterBuilder builder(ifIndex, prio, proto, rateInBytesPerSec,
+                                     kBurstBytes, bpfProgPath);
+  // build() must succeed before the request may be sent; pass its error on.
+  if (const int rc = builder.build(); rc != 0) {
+    return rc;
+  }
+  return sendAndProcessNetlinkResponse(builder.getRequest(),
+                                       builder.getRequestSize());
+}
+
 // tc filter del dev .. in/egress prio .. protocol ..
 int tcDeleteFilter(int ifIndex, bool ingress, uint16_t prio, uint16_t proto) {
   const struct {