switch from using packet ring to normal read
With clat eBPF offload of the receive path on 4.9+ P+ devices,
the packet ring just doesn't matter any more.
Additionally:
- this is *much* simpler,
- reduces memory consumption,
- eliminates the need for memory locking privs,
- and allows us to increase the max supported packet size,
which fixes a long-standing terrible bug wrt. GRO'ed
packets being oversized and simply dropped.
So, win, win, win.
Test:
git grep '(^|[^t])ring[.][ch]'
git grep 'TP_FRAME_SIZE|TP_BLOCK_SIZE|TP_FRAMES|TP_FRAME_GAP|TP_NUM_BLOCKS|packet_ring|ring_create|ring_read'
both come up empty.
Verified ipv4 ping works on a flame device on an ipv6 only wireless network.
Bug: 130253220
Test: atest clatd_test netd_integration_test
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Change-Id: I676dfd16ffd2961f14a4e8fcd707c6f9f484e024
diff --git a/Android.bp b/Android.bp
index bdddd7a..c6961de 100644
--- a/Android.bp
+++ b/Android.bp
@@ -52,7 +52,6 @@
"logging.c",
"netlink_callbacks.c",
"netlink_msg.c",
- "ring.c",
"setif.c",
"translate.c",
],
diff --git a/clatd.c b/clatd.c
index 3ffe934..86850b0 100644
--- a/clatd.c
+++ b/clatd.c
@@ -47,7 +47,6 @@
#include "dump.h"
#include "getaddr.h"
#include "logging.h"
-#include "ring.h"
#include "setif.h"
#include "translate.h"
@@ -177,11 +176,8 @@
exit(1);
}
- // keep CAP_NET_RAW capability to open raw socket, and CAP_IPC_LOCK for mmap
- // to lock memory.
- set_capability((1 << CAP_NET_ADMIN) |
- (1 << CAP_NET_RAW) |
- (1 << CAP_IPC_LOCK));
+ // keep CAP_NET_RAW capability to open raw socket.
+ set_capability((1 << CAP_NET_ADMIN) | (1 << CAP_NET_RAW));
}
/* function: open_sockets
@@ -202,8 +198,11 @@
tunnel->write_fd6 = rawsock;
- tunnel->read_fd6 = ring_create(tunnel);
+ // Will eventually be bound to htons(ETH_P_IPV6) protocol,
+ // but only after appropriate bpf filter is attached.
+ tunnel->read_fd6 = socket(AF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC, 0);
if (tunnel->read_fd6 < 0) {
+ logmsg(ANDROID_LOG_FATAL, "packet socket failed: %s", strerror(errno));
exit(1);
}
}
@@ -343,10 +342,8 @@
* to_ipv6 - whether the packet is to be translated to ipv6 or ipv4
*/
void read_packet(int read_fd, int write_fd, int to_ipv6) {
- ssize_t readlen;
- uint8_t buf[PACKETLEN], *packet;
-
- readlen = read(read_fd, buf, PACKETLEN);
+ uint8_t buf[PACKETLEN];
+ ssize_t readlen = read(read_fd, buf, PACKETLEN);
if (readlen < 0) {
if (errno != EAGAIN) {
@@ -359,6 +356,11 @@
return;
}
+ if (!to_ipv6) {
+ translate_packet(write_fd, 0 /* to_ipv6 */, buf, readlen);
+ return;
+ }
+
struct tun_pi *tun_header = (struct tun_pi *)buf;
if (readlen < (ssize_t)sizeof(*tun_header)) {
logmsg(ANDROID_LOG_WARN, "read_packet/short read: got %ld bytes", readlen);
@@ -375,9 +377,9 @@
logmsg(ANDROID_LOG_WARN, "%s: unexpected flags = %d", __func__, tun_header->flags);
}
- packet = (uint8_t *)(tun_header + 1);
+ uint8_t *packet = (uint8_t *)(tun_header + 1);
readlen -= sizeof(*tun_header);
- translate_packet(write_fd, to_ipv6, packet, readlen);
+ translate_packet(write_fd, 1 /* to_ipv6 */, packet, readlen);
}
/* function: event_loop
@@ -400,24 +402,13 @@
logmsg(ANDROID_LOG_WARN, "event_loop/poll returned an error: %s", strerror(errno));
}
} else {
- if (wait_fd[0].revents & POLLIN) {
- ring_read(&tunnel->ring, tunnel->fd4, 0 /* to_ipv6 */);
- }
- // If any other bit is set, assume it's due to an error (i.e. POLLERR).
- if (wait_fd[0].revents & ~POLLIN) {
- // ring_read doesn't clear the error indication on the socket.
- recv(tunnel->read_fd6, NULL, 0, MSG_PEEK);
- logmsg(ANDROID_LOG_WARN, "event_loop: clearing error on read_fd6: %s", strerror(errno));
- }
-
// Call read_packet if the socket has data to be read, but also if an
// error is waiting. If we don't call read() after getting POLLERR, a
// subsequent poll() will return immediately with POLLERR again,
// causing this code to spin in a loop. Calling read() will clear the
// socket error flag instead.
- if (wait_fd[1].revents) {
- read_packet(tunnel->fd4, tunnel->write_fd6, 1 /* to_ipv6 */);
- }
+ if (wait_fd[0].revents) read_packet(tunnel->read_fd6, tunnel->fd4, 0 /* to_ipv6 */);
+ if (wait_fd[1].revents) read_packet(tunnel->fd4, tunnel->write_fd6, 1 /* to_ipv6 */);
}
time_t now = time(NULL);
diff --git a/clatd.h b/clatd.h
index 899458c..34fa885 100644
--- a/clatd.h
+++ b/clatd.h
@@ -24,9 +24,9 @@
struct tun_data;
-#define MAXMTU 1500
+#define MAXMTU 65536
#define PACKETLEN (MAXMTU + sizeof(struct tun_pi))
-#define CLATD_VERSION "1.4"
+#define CLATD_VERSION "1.5"
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/clatd_test.cpp b/clatd_test.cpp
index c16a4dd..9b7a3e7 100644
--- a/clatd_test.cpp
+++ b/clatd_test.cpp
@@ -19,6 +19,7 @@
#include <iostream>
#include <arpa/inet.h>
+#include <linux/if_packet.h>
#include <netinet/in6.h>
#include <stdio.h>
#include <sys/uio.h>
diff --git a/config.h b/config.h
index 1ba6850..9612192 100644
--- a/config.h
+++ b/config.h
@@ -21,12 +21,9 @@
#include <linux/if.h>
#include <netinet/in.h>
-#include "ring.h"
-
struct tun_data {
char device4[IFNAMSIZ];
int read_fd6, write_fd6, fd4;
- struct packet_ring ring;
};
struct clat_config {
diff --git a/ring.c b/ring.c
deleted file mode 100644
index 7626c6d..0000000
--- a/ring.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * ring.c - packet ring buffer functions
- */
-
-#include <arpa/inet.h>
-#include <errno.h>
-#include <linux/if.h>
-#include <linux/if_packet.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/socket.h>
-
-#include "config.h"
-#include "logging.h"
-#include "ring.h"
-#include "translate.h"
-
-int ring_create(struct tun_data *tunnel) {
- // Will eventually be bound to htons(ETH_P_IPV6) protocol,
- // but only after appropriate bpf filter is attached.
- int packetsock = socket(AF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC, 0);
- if (packetsock < 0) {
- logmsg(ANDROID_LOG_FATAL, "packet socket failed: %s", strerror(errno));
- return -1;
- }
-
- int ver = TPACKET_V2;
- if (setsockopt(packetsock, SOL_PACKET, PACKET_VERSION, (void *)&ver, sizeof(ver))) {
- logmsg(ANDROID_LOG_FATAL, "setsockopt(PACKET_VERSION, %d) failed: %s", ver, strerror(errno));
- return -1;
- }
-
- int on = 1;
- if (setsockopt(packetsock, SOL_PACKET, PACKET_LOSS, (void *)&on, sizeof(on))) {
- logmsg(ANDROID_LOG_WARN, "PACKET_LOSS failed: %s", strerror(errno));
- }
-
- struct packet_ring *ring = &tunnel->ring;
- ring->numblocks = TP_NUM_BLOCKS;
-
- int total_frames = TP_FRAMES * ring->numblocks;
-
- struct tpacket_req req = {
- .tp_frame_size = TP_FRAME_SIZE, // Frame size.
- .tp_block_size = TP_BLOCK_SIZE, // Frames per block.
- .tp_block_nr = ring->numblocks, // Number of blocks.
- .tp_frame_nr = total_frames, // Total frames.
- };
-
- if (setsockopt(packetsock, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
- logmsg(ANDROID_LOG_FATAL, "PACKET_RX_RING failed: %s", strerror(errno));
- return -1;
- }
-
- size_t buflen = TP_BLOCK_SIZE * ring->numblocks;
- ring->base = mmap(NULL, buflen, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED | MAP_POPULATE,
- packetsock, 0);
- if (ring->base == MAP_FAILED) {
- logmsg(ANDROID_LOG_FATAL, "mmap %lu failed: %s", buflen, strerror(errno));
- return -1;
- }
-
- ring->block = 0;
- ring->slot = 0;
- ring->numslots = TP_BLOCK_SIZE / TP_FRAME_SIZE;
- ring->next = (struct tpacket2_hdr *)ring->base;
-
- logmsg(ANDROID_LOG_INFO, "Using ring buffer with %d frames (%d bytes) at %p", total_frames,
- buflen, ring->base);
-
- return packetsock;
-}
-
-/* function: ring_advance
- * advances to the next position in the packet ring
- * ring - packet ring buffer
- */
-static struct tpacket2_hdr *ring_advance(struct packet_ring *ring) {
- uint8_t *next = (uint8_t *)ring->next;
-
- ring->slot++;
- next += TP_FRAME_SIZE;
-
- if (ring->slot == ring->numslots) {
- ring->slot = 0;
- ring->block++;
-
- if (ring->block < ring->numblocks) {
- next += TP_FRAME_GAP;
- } else {
- ring->block = 0;
- next = (uint8_t *)ring->base;
- }
- }
-
- ring->next = (struct tpacket2_hdr *)next;
- return ring->next;
-}
-
-/* function: ring_read
- * reads a packet from the ring buffer and translates it
- * read_fd - file descriptor to read original packet from
- * write_fd - file descriptor to write translated packet to
- * to_ipv6 - whether the packet is to be translated to ipv6 or ipv4
- */
-void ring_read(struct packet_ring *ring, int write_fd, int to_ipv6) {
- struct tpacket2_hdr *tp = ring->next;
- if (tp->tp_status & TP_STATUS_USER) {
- uint8_t *packet = ((uint8_t *)tp) + tp->tp_net;
- translate_packet(write_fd, to_ipv6, packet, tp->tp_len);
- tp->tp_status = TP_STATUS_KERNEL;
- tp = ring_advance(ring);
- }
-}
diff --git a/ring.h b/ring.h
deleted file mode 100644
index b9b8c11..0000000
--- a/ring.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * ring.c - packet ring buffer functions
- */
-#ifndef __RING_H__
-#define __RING_H__
-
-#include <linux/if.h>
-#include <linux/if_packet.h>
-
-#include "clatd.h"
-
-struct tun_data;
-
-// Frame size. Must be a multiple of TPACKET_ALIGNMENT (=16)
-// Why the 16? http://lxr.free-electrons.com/source/net/packet/af_packet.c?v=3.4#L1764
-#define TP_FRAME_SIZE (TPACKET_ALIGN(MAXMTU) + TPACKET_ALIGN(TPACKET2_HDRLEN) + 16)
-
-// Block size. Must be a multiple of the page size, and a power of two for efficient memory use.
-#define TP_BLOCK_SIZE 65536
-
-// In order to save memory, our frames are not an exact divider of the block size. Therefore, the
-// mmaped region will have gaps corresponding to the empty space at the end of each block.
-#define TP_FRAMES (TP_BLOCK_SIZE / TP_FRAME_SIZE)
-#define TP_FRAME_GAP (TP_BLOCK_SIZE % TP_FRAME_SIZE)
-
-// TODO: Make this configurable. This requires some refactoring because the packet socket is
-// opened before we drop privileges, but the configuration file is read after. A value of 16
-// results in 656 frames (1048576 bytes).
-#define TP_NUM_BLOCKS 16
-
-struct packet_ring {
- uint8_t *base;
- struct tpacket2_hdr *next;
- int slot, numslots;
- int block, numblocks;
-};
-
-int ring_create(struct tun_data *tunnel);
-void ring_read(struct packet_ring *ring, int write_fd, int to_ipv6);
-
-#endif