Skip to content

Commit 1576d98

Browse files
jasowang authored and davem330 committed
tun: switch to use skb array for tx
We used to queue tx packets in sk_receive_queue. This is less efficient, since it requires spinlocks to synchronize between producer and consumer. This patch addresses this by: - switching from sk_receive_queue to a skb_array, and resizing it when tx_queue_len is changed. - introducing a new proto_ops method, peek_len, which is used for peeking at the skb length. - implementing a tun version of peek_len for vhost_net to use, and converting vhost_net to use peek_len when it is available. Pktgen test shows about 15.3% improvement on guest receiving pps for small buffers: Before: ~1300000pps After : ~1500000pps Signed-off-by: Jason Wang <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 08294a2 commit 1576d98

File tree

3 files changed

+146
-9
lines changed

3 files changed

+146
-9
lines changed

drivers/net/tun.c

Lines changed: 130 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
#include <net/sock.h>
7272
#include <linux/seq_file.h>
7373
#include <linux/uio.h>
74+
#include <linux/skb_array.h>
7475

7576
#include <asm/uaccess.h>
7677

@@ -167,6 +168,7 @@ struct tun_file {
167168
};
168169
struct list_head next;
169170
struct tun_struct *detached;
171+
struct skb_array tx_array;
170172
};
171173

172174
struct tun_flow_entry {
@@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
515517

516518
static void tun_queue_purge(struct tun_file *tfile)
517519
{
518-
skb_queue_purge(&tfile->sk.sk_receive_queue);
520+
struct sk_buff *skb;
521+
522+
while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
523+
kfree_skb(skb);
524+
519525
skb_queue_purge(&tfile->sk.sk_error_queue);
520526
}
521527

@@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
560566
tun->dev->reg_state == NETREG_REGISTERED)
561567
unregister_netdevice(tun->dev);
562568
}
569+
if (tun)
570+
skb_array_cleanup(&tfile->tx_array);
563571
sock_put(&tfile->sk);
564572
}
565573
}
@@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev)
613621
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
614622
{
615623
struct tun_file *tfile = file->private_data;
624+
struct net_device *dev = tun->dev;
616625
int err;
617626

618627
err = security_tun_dev_attach(tfile->socket.sk, tun->security);
@@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
642651
if (!err)
643652
goto out;
644653
}
654+
655+
if (!tfile->detached &&
656+
skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
657+
err = -ENOMEM;
658+
goto out;
659+
}
660+
645661
tfile->queue_index = tun->numqueues;
646662
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
647663
rcu_assign_pointer(tfile->tun, tun);
@@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
891907

892908
nf_reset(skb);
893909

894-
/* Enqueue packet */
895-
skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
910+
if (skb_array_produce(&tfile->tx_array, skb))
911+
goto drop;
896912

897913
/* Notify and wake up reader process */
898914
if (tfile->flags & TUN_FASYNC)
@@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
11071123

11081124
poll_wait(file, sk_sleep(sk), wait);
11091125

1110-
if (!skb_queue_empty(&sk->sk_receive_queue))
1126+
if (!skb_array_empty(&tfile->tx_array))
11111127
mask |= POLLIN | POLLRDNORM;
11121128

11131129
if (sock_writeable(sk) ||
@@ -1426,22 +1442,61 @@ static ssize_t tun_put_user(struct tun_struct *tun,
14261442
return total;
14271443
}
14281444

1445+
static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
1446+
int *err)
1447+
{
1448+
DECLARE_WAITQUEUE(wait, current);
1449+
struct sk_buff *skb = NULL;
1450+
1451+
skb = skb_array_consume(&tfile->tx_array);
1452+
if (skb)
1453+
goto out;
1454+
if (noblock) {
1455+
*err = -EAGAIN;
1456+
goto out;
1457+
}
1458+
1459+
add_wait_queue(&tfile->wq.wait, &wait);
1460+
current->state = TASK_INTERRUPTIBLE;
1461+
1462+
while (1) {
1463+
skb = skb_array_consume(&tfile->tx_array);
1464+
if (skb)
1465+
break;
1466+
if (signal_pending(current)) {
1467+
*err = -ERESTARTSYS;
1468+
break;
1469+
}
1470+
if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
1471+
*err = -EFAULT;
1472+
break;
1473+
}
1474+
1475+
schedule();
1476+
}
1477+
1478+
current->state = TASK_RUNNING;
1479+
remove_wait_queue(&tfile->wq.wait, &wait);
1480+
1481+
out:
1482+
return skb;
1483+
}
1484+
14291485
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
14301486
struct iov_iter *to,
14311487
int noblock)
14321488
{
14331489
struct sk_buff *skb;
14341490
ssize_t ret;
1435-
int peeked, err, off = 0;
1491+
int err;
14361492

14371493
tun_debug(KERN_INFO, tun, "tun_do_read\n");
14381494

14391495
if (!iov_iter_count(to))
14401496
return 0;
14411497

1442-
/* Read frames from queue */
1443-
skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
1444-
&peeked, &off, &err);
1498+
/* Read frames from ring */
1499+
skb = tun_ring_recv(tfile, noblock, &err);
14451500
if (!skb)
14461501
return err;
14471502

@@ -1574,8 +1629,25 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
15741629
return ret;
15751630
}
15761631

1632+
static int tun_peek_len(struct socket *sock)
1633+
{
1634+
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
1635+
struct tun_struct *tun;
1636+
int ret = 0;
1637+
1638+
tun = __tun_get(tfile);
1639+
if (!tun)
1640+
return 0;
1641+
1642+
ret = skb_array_peek_len(&tfile->tx_array);
1643+
tun_put(tun);
1644+
1645+
return ret;
1646+
}
1647+
15771648
/* Ops structure to mimic raw sockets with tun */
15781649
static const struct proto_ops tun_socket_ops = {
1650+
.peek_len = tun_peek_len,
15791651
.sendmsg = tun_sendmsg,
15801652
.recvmsg = tun_recvmsg,
15811653
};
@@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = {
23972469
.get_ts_info = ethtool_op_get_ts_info,
23982470
};
23992471

2472+
static int tun_queue_resize(struct tun_struct *tun)
2473+
{
2474+
struct net_device *dev = tun->dev;
2475+
struct tun_file *tfile;
2476+
struct skb_array **arrays;
2477+
int n = tun->numqueues + tun->numdisabled;
2478+
int ret, i;
2479+
2480+
arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
2481+
if (!arrays)
2482+
return -ENOMEM;
2483+
2484+
for (i = 0; i < tun->numqueues; i++) {
2485+
tfile = rtnl_dereference(tun->tfiles[i]);
2486+
arrays[i] = &tfile->tx_array;
2487+
}
2488+
list_for_each_entry(tfile, &tun->disabled, next)
2489+
arrays[i++] = &tfile->tx_array;
2490+
2491+
ret = skb_array_resize_multiple(arrays, n,
2492+
dev->tx_queue_len, GFP_KERNEL);
2493+
2494+
kfree(arrays);
2495+
return ret;
2496+
}
2497+
2498+
static int tun_device_event(struct notifier_block *unused,
2499+
unsigned long event, void *ptr)
2500+
{
2501+
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
2502+
struct tun_struct *tun = netdev_priv(dev);
2503+
2504+
switch (event) {
2505+
case NETDEV_CHANGE_TX_QUEUE_LEN:
2506+
if (tun_queue_resize(tun))
2507+
return NOTIFY_BAD;
2508+
break;
2509+
default:
2510+
break;
2511+
}
2512+
2513+
return NOTIFY_DONE;
2514+
}
2515+
2516+
static struct notifier_block tun_notifier_block __read_mostly = {
2517+
.notifier_call = tun_device_event,
2518+
};
24002519

24012520
static int __init tun_init(void)
24022521
{
@@ -2416,6 +2535,8 @@ static int __init tun_init(void)
24162535
pr_err("Can't register misc device %d\n", TUN_MINOR);
24172536
goto err_misc;
24182537
}
2538+
2539+
register_netdevice_notifier(&tun_notifier_block);
24192540
return 0;
24202541
err_misc:
24212542
rtnl_link_unregister(&tun_link_ops);
@@ -2427,6 +2548,7 @@ static void tun_cleanup(void)
24272548
{
24282549
misc_deregister(&tun_miscdev);
24292550
rtnl_link_unregister(&tun_link_ops);
2551+
unregister_netdevice_notifier(&tun_notifier_block);
24302552
}
24312553

24322554
/* Get an underlying socket object from tun file. Returns error unless file is

drivers/vhost/net.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,10 +481,14 @@ static void handle_tx(struct vhost_net *net)
481481

482482
static int peek_head_len(struct sock *sk)
483483
{
484+
struct socket *sock = sk->sk_socket;
484485
struct sk_buff *head;
485486
int len = 0;
486487
unsigned long flags;
487488

489+
if (sock->ops->peek_len)
490+
return sock->ops->peek_len(sock);
491+
488492
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
489493
head = skb_peek(&sk->sk_receive_queue);
490494
if (likely(head)) {
@@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk)
497501
return len;
498502
}
499503

504+
static int sk_has_rx_data(struct sock *sk)
505+
{
506+
struct socket *sock = sk->sk_socket;
507+
508+
if (sock->ops->peek_len)
509+
return sock->ops->peek_len(sock);
510+
511+
return skb_queue_empty(&sk->sk_receive_queue);
512+
}
513+
500514
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
501515
{
502516
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
513527
endtime = busy_clock() + vq->busyloop_timeout;
514528

515529
while (vhost_can_busy_poll(&net->dev, endtime) &&
516-
skb_queue_empty(&sk->sk_receive_queue) &&
530+
!sk_has_rx_data(sk) &&
517531
vhost_vq_avail_empty(&net->dev, vq))
518532
cpu_relax_lowlatency();
519533

include/linux/net.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ struct proto_ops {
185185
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
186186
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
187187
int (*set_peek_off)(struct sock *sk, int val);
188+
int (*peek_len)(struct socket *sock);
188189
};
189190

190191
#define DECLARE_SOCKADDR(type, dst, src) \

0 commit comments

Comments
 (0)