Skip to content

Commit 8edf086

Browse files
Florian Westphal authored and kuba-moo committed
mptcp: rework poll+nospace handling
MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at least one subflow and the mptcp socket itself are writeable. mptcp_poll returns EPOLLOUT if the bit is set. mptcp_sendmsg makes sure MPTCP_SEND_SPACE gets cleared when the last write has used up all subflows or the mptcp socket wmem. This reworks nospace handling as follows: MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE, i.e. inverted meaning. This bit is set when the mptcp socket is not writeable. The mptcp-level ack path will then schedule the mptcp worker to allow it to free already-acked data (and reduce wmem usage). This will then wake userspace processes that wait for a POLLOUT event. sendmsg will set MPTCP_NOSPACE only when it has to wait for more wmem (blocking I/O case). The poll path will set MPTCP_NOSPACE in case the mptcp socket is not writeable. Normal tcp-level notification (SOCK_NOSPACE) is only enabled in case the subflow socket has no available wmem. Signed-off-by: Florian Westphal <[email protected]> Signed-off-by: Paolo Abeni <[email protected]> Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 813e0a6 commit 8edf086

File tree

3 files changed

+54
-51
lines changed

3 files changed

+54
-51
lines changed

net/mptcp/protocol.c

Lines changed: 48 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -724,7 +724,7 @@ void mptcp_data_acked(struct sock *sk)
724724
{
725725
mptcp_reset_timer(sk);
726726

727-
if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
727+
if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
728728
mptcp_send_head(sk) ||
729729
(inet_sk_state_load(sk) != TCP_ESTABLISHED)))
730730
mptcp_schedule_work(sk);
@@ -835,20 +835,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
835835
put_page(dfrag->page);
836836
}
837837

838-
static bool mptcp_is_writeable(struct mptcp_sock *msk)
839-
{
840-
struct mptcp_subflow_context *subflow;
841-
842-
if (!sk_stream_is_writeable((struct sock *)msk))
843-
return false;
844-
845-
mptcp_for_each_subflow(msk, subflow) {
846-
if (sk_stream_is_writeable(subflow->tcp_sock))
847-
return true;
848-
}
849-
return false;
850-
}
851-
852838
static void mptcp_clean_una(struct sock *sk)
853839
{
854840
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -901,13 +887,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk)
901887
mptcp_clean_una(sk);
902888

903889
/* Only wake up writers if a subflow is ready */
904-
if (mptcp_is_writeable(msk)) {
905-
set_bit(MPTCP_SEND_SPACE, &msk->flags);
906-
smp_mb__after_atomic();
907-
908-
/* set SEND_SPACE before sk_stream_write_space clears
909-
* NOSPACE
910-
*/
890+
if (sk_stream_is_writeable(sk)) {
891+
clear_bit(MPTCP_NOSPACE, &msk->flags);
911892
sk_stream_write_space(sk);
912893
}
913894
}
@@ -1041,17 +1022,25 @@ static void mptcp_nospace(struct mptcp_sock *msk)
10411022
{
10421023
struct mptcp_subflow_context *subflow;
10431024

1044-
clear_bit(MPTCP_SEND_SPACE, &msk->flags);
1025+
set_bit(MPTCP_NOSPACE, &msk->flags);
10451026
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
10461027

10471028
mptcp_for_each_subflow(msk, subflow) {
10481029
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1030+
bool ssk_writeable = sk_stream_is_writeable(ssk);
10491031
struct socket *sock = READ_ONCE(ssk->sk_socket);
10501032

1033+
if (ssk_writeable || !sock)
1034+
continue;
1035+
10511036
/* enables ssk->write_space() callbacks */
1052-
if (sock)
1053-
set_bit(SOCK_NOSPACE, &sock->flags);
1037+
set_bit(SOCK_NOSPACE, &sock->flags);
10541038
}
1039+
1040+
/* mptcp_data_acked() could run just before we set the NOSPACE bit,
1041+
* so explicitly check for snd_una value
1042+
*/
1043+
mptcp_clean_una((struct sock *)msk);
10551044
}
10561045

10571046
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
@@ -1155,12 +1144,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
11551144
return NULL;
11561145
}
11571146

1158-
static void ssk_check_wmem(struct mptcp_sock *msk)
1159-
{
1160-
if (unlikely(!mptcp_is_writeable(msk)))
1161-
mptcp_nospace(msk);
1162-
}
1163-
11641147
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
11651148
struct mptcp_sendmsg_info *info)
11661149
{
@@ -1332,7 +1315,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
13321315

13331316
wait_for_memory:
13341317
mptcp_nospace(msk);
1335-
mptcp_clean_una(sk);
13361318
if (mptcp_timer_pending(sk))
13371319
mptcp_reset_timer(sk);
13381320
ret = sk_stream_wait_memory(sk, &timeo);
@@ -1344,7 +1326,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
13441326
mptcp_push_pending(sk, msg->msg_flags);
13451327

13461328
out:
1347-
ssk_check_wmem(msk);
13481329
release_sock(sk);
13491330
return copied ? : ret;
13501331
}
@@ -1921,7 +1902,6 @@ static int __mptcp_init_sock(struct sock *sk)
19211902
INIT_LIST_HEAD(&msk->conn_list);
19221903
INIT_LIST_HEAD(&msk->join_list);
19231904
INIT_LIST_HEAD(&msk->rtx_queue);
1924-
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
19251905
INIT_WORK(&msk->work, mptcp_worker);
19261906
msk->out_of_order_queue = RB_ROOT;
19271907
msk->first_pending = NULL;
@@ -2619,13 +2599,6 @@ bool mptcp_finish_join(struct sock *ssk)
26192599
return true;
26202600
}
26212601

2622-
static bool mptcp_memory_free(const struct sock *sk, int wake)
2623-
{
2624-
struct mptcp_sock *msk = mptcp_sk(sk);
2625-
2626-
return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
2627-
}
2628-
26292602
static struct proto mptcp_prot = {
26302603
.name = "MPTCP",
26312604
.owner = THIS_MODULE,
@@ -2646,7 +2619,6 @@ static struct proto mptcp_prot = {
26462619
.sockets_allocated = &mptcp_sockets_allocated,
26472620
.memory_allocated = &tcp_memory_allocated,
26482621
.memory_pressure = &tcp_memory_pressure,
2649-
.stream_memory_free = mptcp_memory_free,
26502622
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
26512623
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
26522624
.sysctl_mem = sysctl_tcp_mem,
@@ -2820,6 +2792,39 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
28202792
0;
28212793
}
28222794

2795+
static bool __mptcp_check_writeable(struct mptcp_sock *msk)
2796+
{
2797+
struct sock *sk = (struct sock *)msk;
2798+
bool mptcp_writable;
2799+
2800+
mptcp_clean_una(sk);
2801+
mptcp_writable = sk_stream_is_writeable(sk);
2802+
if (!mptcp_writable)
2803+
mptcp_nospace(msk);
2804+
2805+
return mptcp_writable;
2806+
}
2807+
2808+
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
2809+
{
2810+
struct sock *sk = (struct sock *)msk;
2811+
__poll_t ret = 0;
2812+
bool slow;
2813+
2814+
if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
2815+
return 0;
2816+
2817+
if (sk_stream_is_writeable(sk))
2818+
return EPOLLOUT | EPOLLWRNORM;
2819+
2820+
slow = lock_sock_fast(sk);
2821+
if (__mptcp_check_writeable(msk))
2822+
ret = EPOLLOUT | EPOLLWRNORM;
2823+
2824+
unlock_sock_fast(sk, slow);
2825+
return ret;
2826+
}
2827+
28232828
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
28242829
struct poll_table_struct *wait)
28252830
{
@@ -2838,8 +2843,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
28382843

28392844
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
28402845
mask |= mptcp_check_readable(msk);
2841-
if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
2842-
mask |= EPOLLOUT | EPOLLWRNORM;
2846+
mask |= mptcp_check_writeable(msk);
28432847
}
28442848
if (sk->sk_shutdown & RCV_SHUTDOWN)
28452849
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

net/mptcp/protocol.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@
8686

8787
/* MPTCP socket flags */
8888
#define MPTCP_DATA_READY 0
89-
#define MPTCP_SEND_SPACE 1
89+
#define MPTCP_NOSPACE 1
9090
#define MPTCP_WORK_RTX 2
9191
#define MPTCP_WORK_EOF 3
9292
#define MPTCP_FALLBACK_DONE 4

net/mptcp/subflow.c

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -997,17 +997,16 @@ static void subflow_data_ready(struct sock *sk)
997997
static void subflow_write_space(struct sock *sk)
998998
{
999999
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1000+
struct socket *sock = READ_ONCE(sk->sk_socket);
10001001
struct sock *parent = subflow->conn;
10011002

10021003
if (!sk_stream_is_writeable(sk))
10031004
return;
10041005

1005-
if (sk_stream_is_writeable(parent)) {
1006-
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
1007-
smp_mb__after_atomic();
1008-
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
1009-
sk_stream_write_space(parent);
1010-
}
1006+
if (sock && sk_stream_is_writeable(parent))
1007+
clear_bit(SOCK_NOSPACE, &sock->flags);
1008+
1009+
sk_stream_write_space(parent);
10111010
}
10121011

10131012
static struct inet_connection_sock_af_ops *

0 commit comments

Comments
 (0)