/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
Maciej Żenczykowski11141da2024-03-15 18:21:33 -070017// The resulting .o needs to load on Android T+
Maciej Żenczykowski4e4f8722024-06-15 06:38:08 -070018#define BPFLOADER_MIN_VER BPFLOADER_MAINLINE_T_VERSION
Maciej Żenczykowskiacebffb2022-05-16 16:05:15 -070019
Maciej Żenczykowski85c9c992024-08-16 17:57:36 -070020#include "bpf_net_helpers.h"
Ken Chen74ff3ee2022-07-14 16:46:39 +080021#include "dscpPolicy.h"
Tyler Wear72388212021-09-09 14:49:02 -070022
// Low two bits of the IPv4 TOS byte; UPDATE_TOS() carries them over unchanged.
#define ECN_MASK 3

// Builds a TOS byte value: `dscp` shifted into bits 2 and up, combined with
// the ECN bits (low two bits) taken from the existing `tos`.
// The whole replacement list is parenthesized so the macro can be used safely
// inside a larger expression (e.g. `UPDATE_TOS(d, t) & mask`).
#define UPDATE_TOS(dscp, tos) (((dscp) << 2) | ((tos) & ECN_MASK))
Tyler Wear11f494f2022-06-14 16:04:49 -070025
Maciej Żenczykowski52ff2b62024-08-27 18:17:33 -070026// The cache is never read nor written by userspace and is indexed by socket cookie % CACHE_MAP_SIZE
27#define CACHE_MAP_SIZE 32 // should be a power of two so we can % cheaply
Maciej Żenczykowski1ec8d7d2024-09-04 16:44:04 -070028DEFINE_BPF_MAP_KERNEL_INTERNAL(socket_policy_cache_map, PERCPU_ARRAY, uint32_t, RuleEntry,
29 CACHE_MAP_SIZE)
Tyler Wear72388212021-09-09 14:49:02 -070030
Tyler Wear11f494f2022-06-14 16:04:49 -070031DEFINE_BPF_MAP_GRW(ipv4_dscp_policies_map, ARRAY, uint32_t, DscpPolicy, MAX_POLICIES, AID_SYSTEM)
32DEFINE_BPF_MAP_GRW(ipv6_dscp_policies_map, ARRAY, uint32_t, DscpPolicy, MAX_POLICIES, AID_SYSTEM)
Tyler Wear3ad80892022-02-03 15:14:44 -080033
// Returns its argument unchanged after passing it through
// COMPILER_FORCE_CALCULATION.
// NOTE(review): presumably this pins the value as actually computed so the
// compiler cannot fold it into the surrounding shift/compare — confirm against
// the macro's definition.
static inline __always_inline uint64_t calculate_u64(uint64_t value) {
    COMPILER_FORCE_CALCULATION(value);
    return value;
}
38
Maciej Żenczykowski1ab3ad82024-08-22 17:30:20 +000039static inline __always_inline void match_policy(struct __sk_buff* skb, const bool ipv4) {
Tyler Wear3ad80892022-02-03 15:14:44 -080040 void* data = (void*)(long)skb->data;
41 const void* data_end = (void*)(long)skb->data_end;
42
Patrick Rohr7f325cc2022-07-25 10:15:02 -070043 const int l2_header_size = sizeof(struct ethhdr);
44 struct ethhdr* eth = data;
Tyler Wear3ad80892022-02-03 15:14:44 -080045
46 if (data + l2_header_size > data_end) return;
47
Tyler Wear3ad80892022-02-03 15:14:44 -080048 int hdr_size = 0;
Tyler Wear72388212021-09-09 14:49:02 -070049
50 // used for map lookup
51 uint64_t cookie = bpf_get_socket_cookie(skb);
Tyler Wear11f494f2022-06-14 16:04:49 -070052 if (!cookie) return;
Tyler Wear72388212021-09-09 14:49:02 -070053
Maciej Żenczykowski52ff2b62024-08-27 18:17:33 -070054 uint32_t cacheid = cookie % CACHE_MAP_SIZE;
55
Maciej Żenczykowski640752b2022-08-09 23:02:57 +000056 __be16 sport = 0;
Tyler Wear3ad80892022-02-03 15:14:44 -080057 uint16_t dport = 0;
Tyler Wear11f494f2022-06-14 16:04:49 -070058 uint8_t protocol = 0; // TODO: Use are reserved value? Or int (-1) and cast to uint below?
Tyler Wear92281052022-06-22 15:32:14 -070059 struct in6_addr src_ip = {};
60 struct in6_addr dst_ip = {};
Maciej Żenczykowski242af392022-08-22 09:11:10 +000061 uint8_t tos = 0; // Only used for IPv4
62 __be32 old_first_be32 = 0; // Only used for IPv6
Tyler Wear3ad80892022-02-03 15:14:44 -080063 if (ipv4) {
Patrick Rohr7f325cc2022-07-25 10:15:02 -070064 const struct iphdr* const iph = (void*)(eth + 1);
Tyler Wear11f494f2022-06-14 16:04:49 -070065 hdr_size = l2_header_size + sizeof(struct iphdr);
Tyler Wear72388212021-09-09 14:49:02 -070066 // Must have ipv4 header
Tyler Wear11f494f2022-06-14 16:04:49 -070067 if (data + hdr_size > data_end) return;
Tyler Wear72388212021-09-09 14:49:02 -070068
69 // IP version must be 4
Tyler Wear3ad80892022-02-03 15:14:44 -080070 if (iph->version != 4) return;
Tyler Wear72388212021-09-09 14:49:02 -070071
72 // We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header
Tyler Wear3ad80892022-02-03 15:14:44 -080073 if (iph->ihl != 5) return;
Tyler Wear72388212021-09-09 14:49:02 -070074
Tyler Wear3ad80892022-02-03 15:14:44 -080075 // V4 mapped address in in6_addr sets 10/11 position to 0xff.
Tyler Wear92281052022-06-22 15:32:14 -070076 src_ip.s6_addr32[2] = htonl(0x0000ffff);
77 dst_ip.s6_addr32[2] = htonl(0x0000ffff);
Tyler Wear72388212021-09-09 14:49:02 -070078
Tyler Wear3ad80892022-02-03 15:14:44 -080079 // Copy IPv4 address into in6_addr for easy comparison below.
Tyler Wear92281052022-06-22 15:32:14 -070080 src_ip.s6_addr32[3] = iph->saddr;
81 dst_ip.s6_addr32[3] = iph->daddr;
Tyler Wear3ad80892022-02-03 15:14:44 -080082 protocol = iph->protocol;
83 tos = iph->tos;
Tyler Wear3ad80892022-02-03 15:14:44 -080084 } else {
Patrick Rohr7f325cc2022-07-25 10:15:02 -070085 struct ipv6hdr* ip6h = (void*)(eth + 1);
Tyler Wear11f494f2022-06-14 16:04:49 -070086 hdr_size = l2_header_size + sizeof(struct ipv6hdr);
Tyler Wear3ad80892022-02-03 15:14:44 -080087 // Must have ipv6 header
Tyler Wear11f494f2022-06-14 16:04:49 -070088 if (data + hdr_size > data_end) return;
Tyler Wear72388212021-09-09 14:49:02 -070089
Tyler Wear3ad80892022-02-03 15:14:44 -080090 if (ip6h->version != 6) return;
Tyler Wear72388212021-09-09 14:49:02 -070091
Tyler Wear92281052022-06-22 15:32:14 -070092 src_ip = ip6h->saddr;
93 dst_ip = ip6h->daddr;
Tyler Wear3ad80892022-02-03 15:14:44 -080094 protocol = ip6h->nexthdr;
Maciej Żenczykowski242af392022-08-22 09:11:10 +000095 old_first_be32 = *(__be32*)ip6h;
Tyler Wear3ad80892022-02-03 15:14:44 -080096 }
Tyler Wear72388212021-09-09 14:49:02 -070097
Tyler Wear3ad80892022-02-03 15:14:44 -080098 switch (protocol) {
99 case IPPROTO_UDP:
Tyler Wear11f494f2022-06-14 16:04:49 -0700100 case IPPROTO_UDPLITE: {
101 struct udphdr* udp;
Tyler Wear3ad80892022-02-03 15:14:44 -0800102 udp = data + hdr_size;
103 if ((void*)(udp + 1) > data_end) return;
104 sport = udp->source;
Maciej Żenczykowski640752b2022-08-09 23:02:57 +0000105 dport = ntohs(udp->dest);
Tyler Wear11f494f2022-06-14 16:04:49 -0700106 } break;
107 case IPPROTO_TCP: {
108 struct tcphdr* tcp;
Tyler Wear3ad80892022-02-03 15:14:44 -0800109 tcp = data + hdr_size;
110 if ((void*)(tcp + 1) > data_end) return;
111 sport = tcp->source;
Maciej Żenczykowski640752b2022-08-09 23:02:57 +0000112 dport = ntohs(tcp->dest);
Tyler Wear11f494f2022-06-14 16:04:49 -0700113 } break;
Tyler Wear3ad80892022-02-03 15:14:44 -0800114 default:
115 return;
116 }
117
Maciej Żenczykowski52ff2b62024-08-27 18:17:33 -0700118 // this array lookup cannot actually fail
119 RuleEntry* existing_rule = bpf_socket_policy_cache_map_lookup_elem(&cacheid);
Tyler Wear3ad80892022-02-03 15:14:44 -0800120
Maciej Żenczykowskibbb54992024-08-29 18:15:30 -0700121 if (!existing_rule) return; // impossible
122
Maciej Żenczykowski3188abb2024-08-30 11:44:31 -0700123 uint64_t nomatch = 0;
124 nomatch |= v6_not_equal(src_ip, existing_rule->src_ip);
125 nomatch |= v6_not_equal(dst_ip, existing_rule->dst_ip);
126 nomatch |= (skb->ifindex ^ existing_rule->ifindex);
127 nomatch |= (sport ^ existing_rule->src_port);
128 nomatch |= (dport ^ existing_rule->dst_port);
129 nomatch |= (protocol ^ existing_rule->proto);
130 COMPILER_FORCE_CALCULATION(nomatch);
131
132 /*
133 * After the above funky bitwise arithmetic we have 'nomatch == 0' iff
134 * src_ip == existing_rule->src_ip &&
135 * dst_ip == existing_rule->dst_ip &&
136 * skb->ifindex == existing_rule->ifindex &&
137 * sport == existing_rule->src_port &&
138 * dport == existing_rule->dst_port &&
139 * protocol == existing_rule->proto
140 */
141
142 if (!nomatch) {
143 if (existing_rule->dscp_val < 0) return; // cached no-op
144
Tyler Wear3ad80892022-02-03 15:14:44 -0800145 if (ipv4) {
Tyler Wear92281052022-06-22 15:32:14 -0700146 uint8_t newTos = UPDATE_TOS(existing_rule->dscp_val, tos);
Maciej Żenczykowski85c9c992024-08-16 17:57:36 -0700147 bpf_l3_csum_replace(skb, l2_header_size + IP4_OFFSET(check), htons(tos), htons(newTos),
Tyler Wear11f494f2022-06-14 16:04:49 -0700148 sizeof(uint16_t));
Maciej Żenczykowski85c9c992024-08-16 17:57:36 -0700149 bpf_skb_store_bytes(skb, l2_header_size + IP4_OFFSET(tos), &newTos, sizeof(newTos), 0);
Tyler Wear3ad80892022-02-03 15:14:44 -0800150 } else {
Maciej Żenczykowski242af392022-08-22 09:11:10 +0000151 __be32 new_first_be32 =
152 htonl(ntohl(old_first_be32) & 0xF03FFFFF | (existing_rule->dscp_val << 22));
153 bpf_skb_store_bytes(skb, l2_header_size, &new_first_be32, sizeof(__be32),
Tyler Wear4e8949b2022-06-23 14:15:58 -0700154 BPF_F_RECOMPUTE_CSUM);
Tyler Wear3ad80892022-02-03 15:14:44 -0800155 }
Maciej Żenczykowski3188abb2024-08-30 11:44:31 -0700156 return; // cached DSCP mutation
Tyler Wear3ad80892022-02-03 15:14:44 -0800157 }
158
Maciej Żenczykowski116d17a2024-08-26 16:40:57 -0700159 // Linear scan ipv?_dscp_policies_map since stored params didn't match skb.
160 uint64_t best_score = 0;
161 int8_t new_dscp = -1; // meaning no mutation
Tyler Wear3ad80892022-02-03 15:14:44 -0800162
163 for (register uint64_t i = 0; i < MAX_POLICIES; i++) {
Tyler Wear3ad80892022-02-03 15:14:44 -0800164 // Using a uint64 in for loop prevents infinite loop during BPF load,
165 // but the key is uint32, so convert back.
166 uint32_t key = i;
167
168 DscpPolicy* policy;
169 if (ipv4) {
170 policy = bpf_ipv4_dscp_policies_map_lookup_elem(&key);
171 } else {
172 policy = bpf_ipv6_dscp_policies_map_lookup_elem(&key);
Tyler Wear72388212021-09-09 14:49:02 -0700173 }
174
Maciej Żenczykowski1ab3ad82024-08-22 17:30:20 +0000175 // Lookup failure cannot happen on an array with MAX_POLICIES entries.
176 // While 'continue' would make logical sense here, 'return' should be
177 // easier for the verifier to analyze.
178 if (!policy) return;
Tyler Wear72388212021-09-09 14:49:02 -0700179
Maciej Żenczykowski116d17a2024-08-26 16:40:57 -0700180 // Think of 'nomatch' as a 64-bit boolean: false iff zero, true iff non-zero.
181 // Start off with nomatch being false, ie. we assume things *are* matching.
182 uint64_t nomatch = 0;
183
184 // Due to 'a ^ b' being 0 iff a == b:
185 // nomatch |= a ^ b
186 // should/can be read as:
187 // nomatch ||= (a != b)
188 // which you can also think of as:
189 // match &&= (a == b)
190
Maciej Żenczykowski1feaa432022-07-29 21:17:07 +0000191 // If policy iface index does not match skb, then skip to next policy.
Maciej Żenczykowski116d17a2024-08-26 16:40:57 -0700192 nomatch |= (policy->ifindex ^ skb->ifindex);
Tyler Wear72388212021-09-09 14:49:02 -0700193
Maciej Żenczykowski116d17a2024-08-26 16:40:57 -0700194 // policy->match_* are normal booleans, and should thus always be 0 or 1,
195 // thus you can think of these as:
196 // if (policy->match_foo) match &&= (foo == policy->foo);
197 nomatch |= policy->match_proto * (protocol ^ policy->proto);
198 nomatch |= policy->match_src_ip * v6_not_equal(src_ip, policy->src_ip);
199 nomatch |= policy->match_dst_ip * v6_not_equal(dst_ip, policy->dst_ip);
200 nomatch |= policy->match_src_port * (sport ^ policy->src_port);
Maciej Żenczykowski1feaa432022-07-29 21:17:07 +0000201
Maciej Żenczykowski116d17a2024-08-26 16:40:57 -0700202 // Since these values are u16s (<=63 bits), we can rely on u64 subtraction
203 // underflow setting the topmost bit. Basically, you can think of:
204 // nomatch |= (a - b) >> 63
205 // as:
206 // match &&= (a >= b)
207 uint64_t dport64 = dport; // Note: dst_port_{start_end} range is inclusive of both ends.
208 nomatch |= calculate_u64(dport64 - policy->dst_port_start) >> 63;
209 nomatch |= calculate_u64(policy->dst_port_end - dport64) >> 63;
Maciej Żenczykowski1feaa432022-07-29 21:17:07 +0000210
Maciej Żenczykowski116d17a2024-08-26 16:40:57 -0700211 // score is 0x10000 for each matched field (proto, src_ip, dst_ip, src_port)
212 // plus 1..0x10000 for the dst_port range match (smaller for bigger ranges)
213 uint64_t score = 0;
214 score += policy->match_proto; // reminder: match_* are boolean, thus 0 or 1
215 score += policy->match_src_ip;
216 score += policy->match_dst_ip;
217 score += policy->match_src_port;
218 score += 1; // for a 1 element dst_port_{start,end} range
219 score <<= 16; // scale up: ie. *= 0x10000
220 // now reduce score if the dst_port range is more than a single element
221 // we want to prioritize (ie. better score) matches of smaller ranges
222 score -= (policy->dst_port_end - policy->dst_port_start); // -= 0..0xFFFF
223
224 // Here we need:
225 // match &&= (score > best_score)
226 // which is the same as
227 // match &&= (score >= best_score + 1)
228 // > not >= because we want equal score matches to prefer choosing earlier policies
229 nomatch |= calculate_u64(score - best_score - 1) >> 63;
230
231 COMPILER_FORCE_CALCULATION(nomatch);
232 if (nomatch) continue;
233
234 // only reachable if we matched the policy and (score > best_score)
235 best_score = score;
236 new_dscp = policy->dscp_val;
Maciej Żenczykowskid7b92c02022-07-27 19:57:15 +0000237 }
Tyler Wear72388212021-09-09 14:49:02 -0700238
Maciej Żenczykowskibbb54992024-08-29 18:15:30 -0700239 // Update cache with found policy.
240 *existing_rule = (RuleEntry){
Tyler Wear92281052022-06-22 15:32:14 -0700241 .src_ip = src_ip,
242 .dst_ip = dst_ip,
Tyler Wear3ad80892022-02-03 15:14:44 -0800243 .ifindex = skb->ifindex,
Tyler Wear92281052022-06-22 15:32:14 -0700244 .src_port = sport,
245 .dst_port = dport,
Tyler Wear3ad80892022-02-03 15:14:44 -0800246 .proto = protocol,
Tyler Wear92281052022-06-22 15:32:14 -0700247 .dscp_val = new_dscp,
Tyler Wear3ad80892022-02-03 15:14:44 -0800248 };
Tyler Wear72388212021-09-09 14:49:02 -0700249
Maciej Żenczykowskid7b92c02022-07-27 19:57:15 +0000250 if (new_dscp < 0) return;
251
Tyler Wear3ad80892022-02-03 15:14:44 -0800252 // Need to store bytes after updating map or program will not load.
Tyler Wear4e8949b2022-06-23 14:15:58 -0700253 if (ipv4) {
254 uint8_t new_tos = UPDATE_TOS(new_dscp, tos);
Maciej Żenczykowski85c9c992024-08-16 17:57:36 -0700255 bpf_l3_csum_replace(skb, l2_header_size + IP4_OFFSET(check), htons(tos), htons(new_tos), 2);
256 bpf_skb_store_bytes(skb, l2_header_size + IP4_OFFSET(tos), &new_tos, sizeof(new_tos), 0);
Tyler Wear4e8949b2022-06-23 14:15:58 -0700257 } else {
Maciej Żenczykowski242af392022-08-22 09:11:10 +0000258 __be32 new_first_be32 = htonl(ntohl(old_first_be32) & 0xF03FFFFF | (new_dscp << 22));
259 bpf_skb_store_bytes(skb, l2_header_size, &new_first_be32, sizeof(__be32),
Tyler Wear4e8949b2022-06-23 14:15:58 -0700260 BPF_F_RECOMPUTE_CSUM);
Tyler Wear3ad80892022-02-03 15:14:44 -0800261 }
262 return;
263}
Tyler Wear72388212021-09-09 14:49:02 -0700264
Tyler Wear4e8949b2022-06-23 14:15:58 -0700265DEFINE_BPF_PROG_KVER("schedcls/set_dscp_ether", AID_ROOT, AID_SYSTEM, schedcls_set_dscp_ether,
Maciej Żenczykowski901c7102023-10-06 15:47:46 -0700266 KVER_5_15)
Tyler Wear3ad80892022-02-03 15:14:44 -0800267(struct __sk_buff* skb) {
Tyler Wear3ad80892022-02-03 15:14:44 -0800268 if (skb->pkt_type != PACKET_HOST) return TC_ACT_PIPE;
269
270 if (skb->protocol == htons(ETH_P_IP)) {
Patrick Rohr7f325cc2022-07-25 10:15:02 -0700271 match_policy(skb, true);
Tyler Wear3ad80892022-02-03 15:14:44 -0800272 } else if (skb->protocol == htons(ETH_P_IPV6)) {
Patrick Rohr7f325cc2022-07-25 10:15:02 -0700273 match_policy(skb, false);
Tyler Wear3ad80892022-02-03 15:14:44 -0800274 }
275
276 // Always return TC_ACT_PIPE
277 return TC_ACT_PIPE;
278}
279
Tyler Wear72388212021-09-09 14:49:02 -0700280LICENSE("Apache 2.0");
281CRITICAL("Connectivity");