Skip to content

Commit b0983d3

Browse files
Eric Dumazet authored and davem330 committed
tcp: fix dynamic right sizing
Dynamic Right Sizing (DRS) is supposed to open the TCP receive window automatically, but suffers from two bugs, presented in order of importance.

1) tcp_rcv_space_adjust() fix:

Using twice the last received amount is very pessimistic, because it doesn't allow fast recovery or proper slow start ramp up if the sender wants to increase cwin by 100% every RTT.

copied = bytes received in previous RTT
2*copied = bytes we expect to receive in next RTT
4*copied = bytes we need to advertise in rwin at end of next RTT

DRS is one RTT late, so it needs a 4x factor.

If the sender is not using ABC, and increases cwin by 50% every RTT, then we needed a 1.5*1.5 = 2.25 factor. This is probably why this bug was not really noticed.

2) There is no window adjustment after the first RTT. DRS triggers only after the second RTT. DRS needs two RTTs to initialize, so tcp_fixup_rcvbuf() should set up sk_rcvbuf to allow proper window growth for the first two RTTs.

This patch increases TCP efficiency, particularly for large RTT flows when autotuning is used at the receiver, and more particularly in the presence of packet losses.

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Neal Cardwell <[email protected]>
Signed-off-by: Yuchung Cheng <[email protected]>
Cc: Van Jacobson <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 0862935 commit b0983d3

File tree

1 file changed

+53
-31
lines changed

1 file changed

+53
-31
lines changed

net/ipv4/tcp_input.c

Lines changed: 53 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
355355
rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
356356
tcp_default_init_rwnd(mss);
357357

358+
/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
359+
* Allow enough cushion so that sender is not limited by our window
360+
*/
361+
if (sysctl_tcp_moderate_rcvbuf)
362+
rcvmem <<= 2;
363+
358364
if (sk->sk_rcvbuf < rcvmem)
359365
sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
360366
}
@@ -373,6 +379,8 @@ void tcp_init_buffer_space(struct sock *sk)
373379
tcp_fixup_sndbuf(sk);
374380

375381
tp->rcvq_space.space = tp->rcv_wnd;
382+
tp->rcvq_space.time = tcp_time_stamp;
383+
tp->rcvq_space.seq = tp->copied_seq;
376384

377385
maxwin = tcp_full_space(sk);
378386

@@ -512,48 +520,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
512520
{
513521
struct tcp_sock *tp = tcp_sk(sk);
514522
int time;
515-
int space;
516-
517-
if (tp->rcvq_space.time == 0)
518-
goto new_measure;
523+
int copied;
519524

520525
time = tcp_time_stamp - tp->rcvq_space.time;
521526
if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
522527
return;
523528

524-
space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
529+
/* Number of bytes copied to user in last RTT */
530+
copied = tp->copied_seq - tp->rcvq_space.seq;
531+
if (copied <= tp->rcvq_space.space)
532+
goto new_measure;
533+
534+
/* A bit of theory :
535+
* copied = bytes received in previous RTT, our base window
536+
* To cope with packet losses, we need a 2x factor
537+
* To cope with slow start, and sender growing its cwin by 100 %
538+
* every RTT, we need a 4x factor, because the ACK we are sending
539+
* now is for the next RTT, not the current one :
540+
* <prev RTT . ><current RTT .. ><next RTT .... >
541+
*/
542+
543+
if (sysctl_tcp_moderate_rcvbuf &&
544+
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
545+
int rcvwin, rcvmem, rcvbuf;
525546

526-
space = max(tp->rcvq_space.space, space);
547+
/* minimal window to cope with packet losses, assuming
548+
* steady state. Add some cushion because of small variations.
549+
*/
550+
rcvwin = (copied << 1) + 16 * tp->advmss;
527551

528-
if (tp->rcvq_space.space != space) {
529-
int rcvmem;
552+
/* If rate increased by 25%,
553+
* assume slow start, rcvwin = 3 * copied
554+
* If rate increased by 50%,
555+
* assume sender can use 2x growth, rcvwin = 4 * copied
556+
*/
557+
if (copied >=
558+
tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
559+
if (copied >=
560+
tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
561+
rcvwin <<= 1;
562+
else
563+
rcvwin += (rcvwin >> 1);
564+
}
530565

531-
tp->rcvq_space.space = space;
566+
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
567+
while (tcp_win_from_space(rcvmem) < tp->advmss)
568+
rcvmem += 128;
532569

533-
if (sysctl_tcp_moderate_rcvbuf &&
534-
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
535-
int new_clamp = space;
570+
rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
571+
if (rcvbuf > sk->sk_rcvbuf) {
572+
sk->sk_rcvbuf = rcvbuf;
536573

537-
/* Receive space grows, normalize in order to
538-
* take into account packet headers and sk_buff
539-
* structure overhead.
540-
*/
541-
space /= tp->advmss;
542-
if (!space)
543-
space = 1;
544-
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
545-
while (tcp_win_from_space(rcvmem) < tp->advmss)
546-
rcvmem += 128;
547-
space *= rcvmem;
548-
space = min(space, sysctl_tcp_rmem[2]);
549-
if (space > sk->sk_rcvbuf) {
550-
sk->sk_rcvbuf = space;
551-
552-
/* Make the window clamp follow along. */
553-
tp->window_clamp = new_clamp;
554-
}
574+
/* Make the window clamp follow along. */
575+
tp->window_clamp = rcvwin;
555576
}
556577
}
578+
tp->rcvq_space.space = copied;
557579

558580
new_measure:
559581
tp->rcvq_space.seq = tp->copied_seq;
@@ -5674,8 +5696,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
56745696
tcp_init_congestion_control(sk);
56755697

56765698
tcp_mtup_init(sk);
5677-
tcp_init_buffer_space(sk);
56785699
tp->copied_seq = tp->rcv_nxt;
5700+
tcp_init_buffer_space(sk);
56795701
}
56805702
smp_mb();
56815703
tcp_set_state(sk, TCP_ESTABLISHED);

0 commit comments

Comments
 (0)