dscpPolicy: verifier optimizations (once more)

Manually tested via x86_64 UML netbpfloader hack on:
  5.15.158-U android/kernel/common/android14-5.15
  c8c7a44800dffe1c3a3840f433ba72f9ab1cdfb0

Combined with previous dscpPolicy verifier optimizer patch load time
goes from 0.151s to 0.061s, but this is on a desktop Android build
capable box.  Decrease of 2.5x will of course be much more noticeable
on slower devices - like Pixel phones, and *even* more noticeable
on low/mid tier devices (including Android Go).

Bug: 361492282
Test: TreeHugger, atest CtsNetTestCases:android.net.cts.DscpPolicyTest
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Change-Id: Ice2bfa0d614d70420c41fb25294bb0425ec273b0
diff --git a/bpf/progs/dscpPolicy.c b/bpf/progs/dscpPolicy.c
index de9723d..94d717b 100644
--- a/bpf/progs/dscpPolicy.c
+++ b/bpf/progs/dscpPolicy.c
@@ -31,6 +31,11 @@
 DEFINE_BPF_MAP_GRW(ipv4_dscp_policies_map, ARRAY, uint32_t, DscpPolicy, MAX_POLICIES, AID_SYSTEM)
 DEFINE_BPF_MAP_GRW(ipv6_dscp_policies_map, ARRAY, uint32_t, DscpPolicy, MAX_POLICIES, AID_SYSTEM)
 
+static inline __always_inline uint64_t calculate_u64(uint64_t v) {
+    COMPILER_FORCE_CALCULATION(v);
+    return v;
+}
+
 static inline __always_inline void match_policy(struct __sk_buff* skb, const bool ipv4) {
     void* data = (void*)(long)skb->data;
     const void* data_end = (void*)(long)skb->data_end;
@@ -151,9 +156,9 @@
         return;  // cached DSCP mutation
     }
 
-    // Linear scan ipv4_dscp_policies_map since no stored params match skb.
-    int best_score = 0;
-    int8_t new_dscp = -1;
+    // Linear scan ipv?_dscp_policies_map since stored params didn't match skb.
+    uint64_t best_score = 0;
+    int8_t new_dscp = -1;  // meaning no mutation
 
     for (register uint64_t i = 0; i < MAX_POLICIES; i++) {
         // Using a uint64 in for loop prevents infinite loop during BPF load,
@@ -172,35 +177,63 @@
         // easier for the verifier to analyze.
         if (!policy) return;
 
+        // Think of 'nomatch' as a 64-bit boolean: false iff zero, true iff non-zero.
+        // Start off with nomatch being false, ie. we assume things *are* matching.
+        uint64_t nomatch = 0;
+
+        // Due to 'a ^ b' being 0 iff a == b:
+        //   nomatch |= a ^ b
+        // should/can be read as:
+        //   nomatch ||= (a != b)
+        // which you can also think of as:
+        //   match &&= (a == b)
+
         // If policy iface index does not match skb, then skip to next policy.
-        if (policy->ifindex != skb->ifindex) continue;
+        nomatch |= (policy->ifindex ^ skb->ifindex);
 
-        int score = 0;
+        // policy->match_* are normal booleans, and should thus always be 0 or 1,
+        // thus you can think of these as:
+        //   if (policy->match_foo) match &&= (foo == policy->foo);
+        nomatch |= policy->match_proto * (protocol ^ policy->proto);
+        nomatch |= policy->match_src_ip * v6_not_equal(src_ip, policy->src_ip);
+        nomatch |= policy->match_dst_ip * v6_not_equal(dst_ip, policy->dst_ip);
+        nomatch |= policy->match_src_port * (sport ^ policy->src_port);
 
-        if (policy->match_proto) {
-            if (protocol != policy->proto) continue;
-            score += 0xFFFF;
-        }
-        if (policy->match_src_ip) {
-            if (v6_not_equal(src_ip, policy->src_ip)) continue;
-            score += 0xFFFF;
-        }
-        if (policy->match_dst_ip) {
-            if (v6_not_equal(dst_ip, policy->dst_ip)) continue;
-            score += 0xFFFF;
-        }
-        if (policy->match_src_port) {
-            if (sport != policy->src_port) continue;
-            score += 0xFFFF;
-        }
-        if (dport < policy->dst_port_start) continue;
-        if (dport > policy->dst_port_end) continue;
-        score += 0xFFFF + policy->dst_port_start - policy->dst_port_end;
+        // Since these values are u16s (<=63 bits), we can rely on u64 subtraction
+        // underflow setting the topmost bit.  Basically, you can think of:
+        //   nomatch |= (a - b) >> 63
+        // as:
+        //   match &&= (a >= b)
+        uint64_t dport64 = dport;  // Note: dst_port_{start_end} range is inclusive of both ends.
+        nomatch |= calculate_u64(dport64 - policy->dst_port_start) >> 63;
+        nomatch |= calculate_u64(policy->dst_port_end - dport64) >> 63;
 
-        if (score > best_score) {
-            best_score = score;
-            new_dscp = policy->dscp_val;
-        }
+        // score is 0x10000 for each matched field (proto, src_ip, dst_ip, src_port)
+        // plus 1..0x10000 for the dst_port range match (smaller for bigger ranges)
+        uint64_t score = 0;
+        score += policy->match_proto;  // reminder: match_* are boolean, thus 0 or 1
+        score += policy->match_src_ip;
+        score += policy->match_dst_ip;
+        score += policy->match_src_port;
+        score += 1;  // for a 1 element dst_port_{start,end} range
+        score <<= 16;  // scale up: ie. *= 0x10000
+        // now reduce score if the dst_port range is more than a single element
+        // we want to prioritize (ie. better score) matches of smaller ranges
+        score -= (policy->dst_port_end - policy->dst_port_start);  // -= 0..0xFFFF
+
+        // Here we need:
+        //   match &&= (score > best_score)
+        // which is the same as
+        //   match &&= (score >= best_score + 1)
+        // > not >= because we want equal score matches to prefer choosing earlier policies
+        nomatch |= calculate_u64(score - best_score - 1) >> 63;
+
+        COMPILER_FORCE_CALCULATION(nomatch);
+        if (nomatch) continue;
+
+        // only reachable if we matched the policy and (score > best_score)
+        best_score = score;
+        new_dscp = policy->dscp_val;
     }
 
     // Update cache with found policy.