switch from using packet ring to normal read

With clat eBPF offload of the receive path on 4.9+ P+ devices,
the packet ring just doesn't matter any more.

Additionally:
- this is *much* simpler,
- reduces memory consumption,
- eliminates the need for memory locking privs,
- and allows us to increase the max supported packet size,
  which fixes a long-standing terrible bug wrt. GRO'ed
  packets being oversized and simply dropped.

So, win, win, win.

Test:
  git grep '(^|[^t])ring[.][ch]'
  git grep 'TP_FRAME_SIZE|TP_BLOCK_SIZE|TP_FRAMES|TP_FRAME_GAP|TP_NUM_BLOCKS|packet_ring|ring_create|ring_read'
  come up empty

  Verified ipv4 ping works on a flame device on an ipv6 only wireless network.

Bug: 130253220
Test: atest clatd_test netd_integration_test
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Change-Id: I676dfd16ffd2961f14a4e8fcd707c6f9f484e024
diff --git a/Android.bp b/Android.bp
index bdddd7a..c6961de 100644
--- a/Android.bp
+++ b/Android.bp
@@ -52,7 +52,6 @@
         "logging.c",
         "netlink_callbacks.c",
         "netlink_msg.c",
-        "ring.c",
         "setif.c",
         "translate.c",
     ],
diff --git a/clatd.c b/clatd.c
index 3ffe934..86850b0 100644
--- a/clatd.c
+++ b/clatd.c
@@ -47,7 +47,6 @@
 #include "dump.h"
 #include "getaddr.h"
 #include "logging.h"
-#include "ring.h"
 #include "setif.h"
 #include "translate.h"
 
@@ -177,11 +176,8 @@
     exit(1);
   }
 
-  // keep CAP_NET_RAW capability to open raw socket, and CAP_IPC_LOCK for mmap
-  // to lock memory.
-  set_capability((1 << CAP_NET_ADMIN) |
-                 (1 << CAP_NET_RAW) |
-                 (1 << CAP_IPC_LOCK));
+  // keep CAP_NET_RAW capability to open raw socket.
+  set_capability((1 << CAP_NET_ADMIN) | (1 << CAP_NET_RAW));
 }
 
 /* function: open_sockets
@@ -202,8 +198,11 @@
 
   tunnel->write_fd6 = rawsock;
 
-  tunnel->read_fd6 = ring_create(tunnel);
+  // Will eventually be bound to htons(ETH_P_IPV6) protocol,
+  // but only after appropriate bpf filter is attached.
+  tunnel->read_fd6 = socket(AF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC, 0);
   if (tunnel->read_fd6 < 0) {
+    logmsg(ANDROID_LOG_FATAL, "packet socket failed: %s", strerror(errno));
     exit(1);
   }
 }
@@ -343,10 +342,8 @@
  *   to_ipv6  - whether the packet is to be translated to ipv6 or ipv4
  */
 void read_packet(int read_fd, int write_fd, int to_ipv6) {
-  ssize_t readlen;
-  uint8_t buf[PACKETLEN], *packet;
-
-  readlen = read(read_fd, buf, PACKETLEN);
+  uint8_t buf[PACKETLEN];
+  ssize_t readlen = read(read_fd, buf, PACKETLEN);
 
   if (readlen < 0) {
     if (errno != EAGAIN) {
@@ -359,6 +356,11 @@
     return;
   }
 
+  if (!to_ipv6) {
+    translate_packet(write_fd, 0 /* to_ipv6 */, buf, readlen);
+    return;
+  }
+
   struct tun_pi *tun_header = (struct tun_pi *)buf;
   if (readlen < (ssize_t)sizeof(*tun_header)) {
     logmsg(ANDROID_LOG_WARN, "read_packet/short read: got %ld bytes", readlen);
@@ -375,9 +377,9 @@
     logmsg(ANDROID_LOG_WARN, "%s: unexpected flags = %d", __func__, tun_header->flags);
   }
 
-  packet = (uint8_t *)(tun_header + 1);
+  uint8_t *packet = (uint8_t *)(tun_header + 1);
   readlen -= sizeof(*tun_header);
-  translate_packet(write_fd, to_ipv6, packet, readlen);
+  translate_packet(write_fd, 1 /* to_ipv6 */, packet, readlen);
 }
 
 /* function: event_loop
@@ -400,24 +402,13 @@
         logmsg(ANDROID_LOG_WARN, "event_loop/poll returned an error: %s", strerror(errno));
       }
     } else {
-      if (wait_fd[0].revents & POLLIN) {
-        ring_read(&tunnel->ring, tunnel->fd4, 0 /* to_ipv6 */);
-      }
-      // If any other bit is set, assume it's due to an error (i.e. POLLERR).
-      if (wait_fd[0].revents & ~POLLIN) {
-        // ring_read doesn't clear the error indication on the socket.
-        recv(tunnel->read_fd6, NULL, 0, MSG_PEEK);
-        logmsg(ANDROID_LOG_WARN, "event_loop: clearing error on read_fd6: %s", strerror(errno));
-      }
-
       // Call read_packet if the socket has data to be read, but also if an
       // error is waiting. If we don't call read() after getting POLLERR, a
       // subsequent poll() will return immediately with POLLERR again,
       // causing this code to spin in a loop. Calling read() will clear the
       // socket error flag instead.
-      if (wait_fd[1].revents) {
-        read_packet(tunnel->fd4, tunnel->write_fd6, 1 /* to_ipv6 */);
-      }
+      if (wait_fd[0].revents) read_packet(tunnel->read_fd6, tunnel->fd4, 0 /* to_ipv6 */);
+      if (wait_fd[1].revents) read_packet(tunnel->fd4, tunnel->write_fd6, 1 /* to_ipv6 */);
     }
 
     time_t now = time(NULL);
diff --git a/clatd.h b/clatd.h
index 899458c..34fa885 100644
--- a/clatd.h
+++ b/clatd.h
@@ -24,9 +24,9 @@
 
 struct tun_data;
 
-#define MAXMTU 1500
+#define MAXMTU 65536
 #define PACKETLEN (MAXMTU + sizeof(struct tun_pi))
-#define CLATD_VERSION "1.4"
+#define CLATD_VERSION "1.5"
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
diff --git a/clatd_test.cpp b/clatd_test.cpp
index c16a4dd..9b7a3e7 100644
--- a/clatd_test.cpp
+++ b/clatd_test.cpp
@@ -19,6 +19,7 @@
 #include <iostream>
 
 #include <arpa/inet.h>
+#include <linux/if_packet.h>
 #include <netinet/in6.h>
 #include <stdio.h>
 #include <sys/uio.h>
diff --git a/config.h b/config.h
index 1ba6850..9612192 100644
--- a/config.h
+++ b/config.h
@@ -21,12 +21,9 @@
 #include <linux/if.h>
 #include <netinet/in.h>
 
-#include "ring.h"
-
 struct tun_data {
   char device4[IFNAMSIZ];
   int read_fd6, write_fd6, fd4;
-  struct packet_ring ring;
 };
 
 struct clat_config {
diff --git a/ring.c b/ring.c
deleted file mode 100644
index 7626c6d..0000000
--- a/ring.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * ring.c - packet ring buffer functions
- */
-
-#include <arpa/inet.h>
-#include <errno.h>
-#include <linux/if.h>
-#include <linux/if_packet.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/socket.h>
-
-#include "config.h"
-#include "logging.h"
-#include "ring.h"
-#include "translate.h"
-
-int ring_create(struct tun_data *tunnel) {
-  // Will eventually be bound to htons(ETH_P_IPV6) protocol,
-  // but only after appropriate bpf filter is attached.
-  int packetsock = socket(AF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC, 0);
-  if (packetsock < 0) {
-    logmsg(ANDROID_LOG_FATAL, "packet socket failed: %s", strerror(errno));
-    return -1;
-  }
-
-  int ver = TPACKET_V2;
-  if (setsockopt(packetsock, SOL_PACKET, PACKET_VERSION, (void *)&ver, sizeof(ver))) {
-    logmsg(ANDROID_LOG_FATAL, "setsockopt(PACKET_VERSION, %d) failed: %s", ver, strerror(errno));
-    return -1;
-  }
-
-  int on = 1;
-  if (setsockopt(packetsock, SOL_PACKET, PACKET_LOSS, (void *)&on, sizeof(on))) {
-    logmsg(ANDROID_LOG_WARN, "PACKET_LOSS failed: %s", strerror(errno));
-  }
-
-  struct packet_ring *ring = &tunnel->ring;
-  ring->numblocks          = TP_NUM_BLOCKS;
-
-  int total_frames = TP_FRAMES * ring->numblocks;
-
-  struct tpacket_req req = {
-    .tp_frame_size = TP_FRAME_SIZE,    // Frame size.
-    .tp_block_size = TP_BLOCK_SIZE,    // Frames per block.
-    .tp_block_nr   = ring->numblocks,  // Number of blocks.
-    .tp_frame_nr   = total_frames,     // Total frames.
-  };
-
-  if (setsockopt(packetsock, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
-    logmsg(ANDROID_LOG_FATAL, "PACKET_RX_RING failed: %s", strerror(errno));
-    return -1;
-  }
-
-  size_t buflen = TP_BLOCK_SIZE * ring->numblocks;
-  ring->base    = mmap(NULL, buflen, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED | MAP_POPULATE,
-                    packetsock, 0);
-  if (ring->base == MAP_FAILED) {
-    logmsg(ANDROID_LOG_FATAL, "mmap %lu failed: %s", buflen, strerror(errno));
-    return -1;
-  }
-
-  ring->block    = 0;
-  ring->slot     = 0;
-  ring->numslots = TP_BLOCK_SIZE / TP_FRAME_SIZE;
-  ring->next     = (struct tpacket2_hdr *)ring->base;
-
-  logmsg(ANDROID_LOG_INFO, "Using ring buffer with %d frames (%d bytes) at %p", total_frames,
-         buflen, ring->base);
-
-  return packetsock;
-}
-
-/* function: ring_advance
- * advances to the next position in the packet ring
- * ring - packet ring buffer
- */
-static struct tpacket2_hdr *ring_advance(struct packet_ring *ring) {
-  uint8_t *next = (uint8_t *)ring->next;
-
-  ring->slot++;
-  next += TP_FRAME_SIZE;
-
-  if (ring->slot == ring->numslots) {
-    ring->slot = 0;
-    ring->block++;
-
-    if (ring->block < ring->numblocks) {
-      next += TP_FRAME_GAP;
-    } else {
-      ring->block = 0;
-      next        = (uint8_t *)ring->base;
-    }
-  }
-
-  ring->next = (struct tpacket2_hdr *)next;
-  return ring->next;
-}
-
-/* function: ring_read
- * reads a packet from the ring buffer and translates it
- * read_fd  - file descriptor to read original packet from
- * write_fd - file descriptor to write translated packet to
- * to_ipv6  - whether the packet is to be translated to ipv6 or ipv4
- */
-void ring_read(struct packet_ring *ring, int write_fd, int to_ipv6) {
-  struct tpacket2_hdr *tp = ring->next;
-  if (tp->tp_status & TP_STATUS_USER) {
-    uint8_t *packet = ((uint8_t *)tp) + tp->tp_net;
-    translate_packet(write_fd, to_ipv6, packet, tp->tp_len);
-    tp->tp_status = TP_STATUS_KERNEL;
-    tp            = ring_advance(ring);
-  }
-}
diff --git a/ring.h b/ring.h
deleted file mode 100644
index b9b8c11..0000000
--- a/ring.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * ring.c - packet ring buffer functions
- */
-#ifndef __RING_H__
-#define __RING_H__
-
-#include <linux/if.h>
-#include <linux/if_packet.h>
-
-#include "clatd.h"
-
-struct tun_data;
-
-// Frame size. Must be a multiple of TPACKET_ALIGNMENT (=16)
-// Why the 16? http://lxr.free-electrons.com/source/net/packet/af_packet.c?v=3.4#L1764
-#define TP_FRAME_SIZE (TPACKET_ALIGN(MAXMTU) + TPACKET_ALIGN(TPACKET2_HDRLEN) + 16)
-
-// Block size. Must be a multiple of the page size, and a power of two for efficient memory use.
-#define TP_BLOCK_SIZE 65536
-
-// In order to save memory, our frames are not an exact divider of the block size. Therefore, the
-// mmaped region will have gaps corresponding to the empty space at the end of each block.
-#define TP_FRAMES (TP_BLOCK_SIZE / TP_FRAME_SIZE)
-#define TP_FRAME_GAP (TP_BLOCK_SIZE % TP_FRAME_SIZE)
-
-// TODO: Make this configurable. This requires some refactoring because the packet socket is
-// opened before we drop privileges, but the configuration file is read after. A value of 16
-// results in 656 frames (1048576 bytes).
-#define TP_NUM_BLOCKS 16
-
-struct packet_ring {
-  uint8_t *base;
-  struct tpacket2_hdr *next;
-  int slot, numslots;
-  int block, numblocks;
-};
-
-int ring_create(struct tun_data *tunnel);
-void ring_read(struct packet_ring *ring, int write_fd, int to_ipv6);
-
-#endif