
Commit f1a456f

Talal Ahmad authored and kuba-moo committed
net: avoid double accounting for pure zerocopy skbs
Track skbs with only zerocopy data and avoid charging them to kernel memory, to correctly account memory utilization for msg_zerocopy. All of the data in such skbs is held in user pages, which are already accounted to the user. Before this change, they were charged again in the kernel in __zerocopy_sg_from_iter. This kernel charge is excessive because the data is not copied into skb frags, and it can push the kernel into a memory pressure state that adversely impacts all sockets in the system.

Mark pure zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove the charge/uncharge for data in such skbs.

An skb is initially marked pure zerocopy when it is empty and on the zerocopy path. The skb can then change from pure zerocopy to mixed data (zerocopy and copy data) if it is at the tail of the write queue, has room available, and non-zerocopy data is sent in the next sendmsg call. At that point, sk_mem_charge is done for the pure zerocopied data and the pure zerocopy flag is cleared. We found that this happens very rarely on workloads that pass MSG_ZEROCOPY.

A pure zerocopy skb could later be coalesced into a normal skb if the two are adjacent in the queue, but this patch prevents such coalescing. This avoids the complexity of charging when an skb downgrades from pure zerocopy to mixed; this case is also rare.

In tcp_wmem_free_skb, if the skb is pure zerocopy, sk_mem_uncharge is done for SKB_TRUESIZE(MAX_TCP_HEADER), matching the sk_mem_charge in tcp_skb_entail for an skb without data.

Testing with the msg_zerocopy.c benchmark between two hosts (100G NICs) with zerocopy showed that before this patch the 'sock' variable in memory.stat for cgroup2 (which tracks the sum of sk_forward_alloc, sk_rmem_alloc and sk_wmem_queued) is around 1822720, and with this change it is 0. This is because zerocopy data is no longer charged to sk_forward_alloc, lowering kernel memory utilization.

Signed-off-by: Talal Ahmad <[email protected]>
Acked-by: Arjun Roy <[email protected]>
Acked-by: Soheil Hassas Yeganeh <[email protected]>
Signed-off-by: Willem de Bruijn <[email protected]>
Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Jakub Kicinski <[email protected]>
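For context, here is a minimal userspace sketch of the send path whose accounting this commit fixes, loosely following the msg_zerocopy.c selftest referenced above. It is illustrative only: error handling is trimmed and the completion parser skips the usual cmsg_level/cmsg_type checks.

#include <linux/errqueue.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY 60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0x4000000
#endif

/* Send one buffer with MSG_ZEROCOPY: the payload stays in user pages
 * (already accounted to the user) until the kernel is done with them.
 */
static int send_zerocopy(int fd, const void *buf, size_t len)
{
	int one = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
		return -1;
	return send(fd, buf, len, MSG_ZEROCOPY) == (ssize_t)len ? 0 : -1;
}

/* Reap one completion notification from the socket error queue. */
static void reap_completion(int fd)
{
	char control[128];
	struct msghdr msg = {
		.msg_control = control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		struct sock_extended_err *serr =
			(struct sock_extended_err *)CMSG_DATA(cm);

		if (serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
			printf("sends %u..%u completed\n",
			       serr->ee_info, serr->ee_data);
	}
}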
1 parent 03271f3 commit f1a456f

File tree

6 files changed (+53, -9 lines)


include/linux/skbuff.h

Lines changed: 18 additions & 1 deletion
@@ -454,9 +454,15 @@ enum {
 	 * all frags to avoid possible bad checksum
 	 */
 	SKBFL_SHARED_FRAG = BIT(1),
+
+	/* segment contains only zerocopy data and should not be
+	 * charged to the kernel memory.
+	 */
+	SKBFL_PURE_ZEROCOPY = BIT(2),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
+#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
+static inline bool skb_zcopy_pure(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
+}
+
+static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
+				       const struct sk_buff *skb2)
+{
+	return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2);
+}
+
 static inline void net_zcopy_get(struct ubuf_info *uarg)
 {
 	refcount_inc(&uarg->refcnt);
@@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 		if (!skb_zcopy_is_nouarg(skb))
 			uarg->callback(skb, uarg, zerocopy_success);
 
-		skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
 	}
 }
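To see why the widened SKBFL_ALL_ZEROCOPY mask matters, here is a small standalone sketch that mirrors the flag layout above (the constants are copied from this hunk; the rest is a toy, not kernel code). Clearing only the old SKBFL_ZEROCOPY_FRAG mask in skb_zcopy_clear() would leave the pure flag behind.

#include <stdio.h>

#define BIT(n) (1u << (n))

/* Flag layout from include/linux/skbuff.h after this commit. */
#define SKBFL_ZEROCOPY_ENABLE	BIT(0)
#define SKBFL_SHARED_FRAG	BIT(1)
#define SKBFL_PURE_ZEROCOPY	BIT(2)

#define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)

int main(void)
{
	unsigned int flags = SKBFL_ZEROCOPY_ENABLE | SKBFL_PURE_ZEROCOPY;

	/* Old mask: bit 2 survives, so the skb would still look pure. */
	printf("old mask leaves: %#x\n", flags & ~SKBFL_ZEROCOPY_FRAG);
	/* New mask: all zerocopy state is dropped, as intended. */
	printf("new mask leaves: %#x\n", flags & ~SKBFL_ALL_ZEROCOPY);
	return 0;
}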

include/net/tcp.h

Lines changed: 6 additions & 2 deletions
@@ -293,7 +293,10 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sk_wmem_queued_add(sk, -skb->truesize);
-	sk_mem_uncharge(sk, skb->truesize);
+	if (!skb_zcopy_pure(skb))
+		sk_mem_uncharge(sk, skb->truesize);
+	else
+		sk_mem_uncharge(sk, SKB_TRUESIZE(MAX_TCP_HEADER));
 	__kfree_skb(skb);
 }
 
@@ -974,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
 					const struct sk_buff *from)
 {
 	return likely(tcp_skb_can_collapse_to(to) &&
-		      mptcp_skb_can_collapse(to, from));
+		      mptcp_skb_can_collapse(to, from) &&
+		      skb_pure_zcopy_same(to, from));
 }
 
 /* Events passed to congestion control interface */
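The asymmetric uncharge above balances against the charges made elsewhere in this patch: a pure zerocopy skb is only ever charged the fixed header-side cost SKB_TRUESIZE(MAX_TCP_HEADER) (in tcp_skb_entail, via the truesize set in tcp_stream_alloc_skb below), never its payload. A toy ledger, with an invented HDR_TRUESIZE standing in for the real architecture-dependent value, shows the invariant:

#include <assert.h>
#include <stdio.h>

#define HDR_TRUESIZE 320	/* stand-in for SKB_TRUESIZE(MAX_TCP_HEADER) */

int main(void)
{
	long charged = 0;		/* kernel memory charged to the socket */
	long zc_payload = 64 * 1024;	/* user pages, already user-accounted */

	charged += HDR_TRUESIZE;	/* tcp_skb_entail: header cost only */
	(void)zc_payload;		/* __zerocopy_sg_from_iter: no charge */
	charged -= HDR_TRUESIZE;	/* tcp_wmem_free_skb: matching uncharge */

	/* Net kernel charge is zero, matching the memory.stat result
	 * reported in the commit message.
	 */
	assert(charged == 0);
	printf("net charge: %ld\n", charged);
	return 0;
}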

net/core/datagram.c

Lines changed: 2 additions & 1 deletion
@@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 	skb->truesize += truesize;
 	if (sk && sk->sk_type == SOCK_STREAM) {
 		sk_wmem_queued_add(sk, truesize);
-		sk_mem_charge(sk, truesize);
+		if (!skb_zcopy_pure(skb))
+			sk_mem_charge(sk, truesize);
 	} else {
 		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
 	}

net/core/skbuff.c

Lines changed: 2 additions & 1 deletion
@@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb,
 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 {
 	int pos = skb_headlen(skb);
+	const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
 
-	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
+	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
 	skb_zerocopy_clone(skb1, skb, 0);
 	if (len < pos)	/* Split line is inside header. */
 		skb_split_inside_header(skb, skb1, len, pos);

net/ipv4/tcp.c

Lines changed: 20 additions & 2 deletions
@@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 	if (likely(skb)) {
 		bool mem_scheduled;
 
+		skb->truesize = SKB_TRUESIZE(size + MAX_TCP_HEADER);
 		if (force_schedule) {
 			mem_scheduled = true;
 			sk_forced_mem_schedule(sk, skb->truesize);
@@ -1319,6 +1320,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 
 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
+			/* skb changing from pure zc to mixed, must charge zc */
+			if (unlikely(skb_zcopy_pure(skb))) {
+				if (!sk_wmem_schedule(sk, skb->data_len))
+					goto wait_for_space;
+
+				sk_mem_charge(sk, skb->data_len);
+				skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+			}
+
 			if (!sk_wmem_schedule(sk, copy))
 				goto wait_for_space;
 
@@ -1339,8 +1349,16 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			}
 			pfrag->offset += copy;
 		} else {
-			if (!sk_wmem_schedule(sk, copy))
-				goto wait_for_space;
+			/* First append to a fragless skb builds initial
+			 * pure zerocopy skb
+			 */
+			if (!skb->len)
+				skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
+
+			if (!skb_zcopy_pure(skb)) {
+				if (!sk_wmem_schedule(sk, copy))
+					goto wait_for_space;
+			}
 
 			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
 			if (err == -EMSGSIZE || err == -EEXIST) {
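The two tcp_sendmsg_locked() hunks above form a small state machine: a first zerocopy append to an empty skb marks it pure, and a later copy append downgrades it to mixed, charging the zerocopy payload at that point. A toy model of those transitions (all names invented, not kernel API):

#include <stdbool.h>
#include <stdio.h>

struct toy_skb {
	long len;	/* total payload bytes */
	long data_len;	/* paged (zerocopy) payload bytes */
	bool pure_zc;	/* models SKBFL_PURE_ZEROCOPY */
	long charged;	/* kernel memory charged for payload */
};

/* Zerocopy append: a first append to an empty skb makes it pure. */
static void append_zerocopy(struct toy_skb *skb, long bytes)
{
	if (!skb->len)
		skb->pure_zc = true;
	skb->len += bytes;
	skb->data_len += bytes;
	if (!skb->pure_zc)
		skb->charged += bytes;	/* mixed skbs still charge */
}

/* Copy append: a pure skb must first charge its zerocopy payload. */
static void append_copy(struct toy_skb *skb, long bytes)
{
	if (skb->pure_zc) {
		skb->charged += skb->data_len;	/* sk_mem_charge(data_len) */
		skb->pure_zc = false;		/* clear the pure flag */
	}
	skb->len += bytes;
	skb->charged += bytes;
}

int main(void)
{
	struct toy_skb skb = { 0 };

	append_zerocopy(&skb, 4096);	/* pure: nothing charged */
	printf("pure=%d charged=%ld\n", skb.pure_zc, skb.charged);
	append_copy(&skb, 512);		/* mixed: 4096 + 512 now charged */
	printf("pure=%d charged=%ld\n", skb.pure_zc, skb.charged);
	return 0;
}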

net/ipv4/tcp_output.c

Lines changed: 5 additions & 2 deletions
@@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	if (delta_truesize) {
 		skb->truesize -= delta_truesize;
 		sk_wmem_queued_add(sk, -delta_truesize);
-		sk_mem_uncharge(sk, delta_truesize);
+		if (!skb_zcopy_pure(skb))
+			sk_mem_uncharge(sk, delta_truesize);
 	}
 
 	/* Any change of skb->len requires recalculation of tso factor. */
@@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
 		if (len <= skb->len)
 			break;
 
-		if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
+		if (unlikely(TCP_SKB_CB(skb)->eor) ||
+		    tcp_has_tx_tstamp(skb) ||
+		    !skb_pure_zcopy_same(skb, next))
 			return false;
 
 		len -= skb->len;
