From: Yuchung Cheng ycheng@google.com
mainline inclusion from mainline-v5.1-rc1 commit 7ae189759cc48cf8b54beebff566e9fd2d4e7d7c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4AFRJ?from=project-issue CVE: NA
------------------------------------------------------------
Previously TCP socket's retrans_stamp is not set if the retransmission has failed to send. As a result if a socket is experiencing local issues to retransmit packets, determining when to abort a socket is complicated w/o knowning the starting time of the recovery since retrans_stamp may remain zero.
This complication causes sub-optimal behavior that TCP may use the latest, instead of the first, retransmission time to compute the elapsed time of a stalling connection due to local issues. Then TCP may disrecard TCP retries settings and keep retrying until it finally succeed: not a good idea when the local host is already strained.
The simple fix is to always timestamp the start of a recovery. It's worth noting that retrans_stamp is also used to compare echo timestamp values to detect spurious recovery. This patch does not break that because retrans_stamp is still later than when the original packet was sent.
Signed-off-by: Yuchung Cheng ycheng@google.com Signed-off-by: Eric Dumazet edumazet@google.com Reviewed-by: Neal Cardwell ncardwell@google.com Reviewed-by: Soheil Hassas Yeganeh soheil@google.com Signed-off-by: David S. Miller davem@davemloft.net Conflicts: net/ipv4/tcp_timer.c Signed-off-by: Jiazhenyuan jiazhenyuan@uniontech #openEuler_contributor Signed-off-by: Laibin Qiu qiulaibin@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/tcp_output.c | 9 ++++----- net/ipv4/tcp_timer.c | 23 +++-------------------- 2 files changed, 7 insertions(+), 25 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 97b9d671a83c9..6710056fd1b23 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3011,13 +3011,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); - - /* Save stamp of the first retransmit. */ - if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_skb_timestamp(skb); - }
+ /* Save stamp of the first (attempted) retransmit. */ + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_skb_timestamp(skb); + if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 681882a409686..8435cbad337d8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,27 +22,13 @@ #include <linux/gfp.h> #include <net/tcp.h>
-static u32 tcp_retransmit_stamp(const struct sock *sk) -{ - u32 start_ts = tcp_sk(sk)->retrans_stamp; - - if (unlikely(!start_ts)) { - struct sk_buff *head = tcp_rtx_queue_head(sk); - - if (!head) - return 0; - start_ts = tcp_skb_timestamp(head); - } - return start_ts; -} - static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts;
- start_ts = tcp_retransmit_stamp(sk); - if (!icsk->icsk_user_timeout || !start_ts) + start_ts = tcp_sk(sk)->retrans_stamp; + if (!icsk->icsk_user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; if (elapsed >= icsk->icsk_user_timeout) @@ -196,10 +182,7 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false;
- start_ts = tcp_retransmit_stamp(sk); - if (!start_ts) - return false; - + start_ts = tcp_sk(sk)->retrans_stamp; if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
From: Yuchung Cheng ycheng@google.com
mainline inclusion from mainline-v5.1-rc1 commit 01a523b071618abbc634d1958229fe3bd2dfa5fa category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4AFRJ?from=project-issue CVE: NA
------------------------------------------------------------
Create a helper to model TCP exponential backoff for the next patch. This is pure refactor w no behavior change.
Signed-off-by: Yuchung Cheng ycheng@google.com Signed-off-by: Eric Dumazet edumazet@google.com Reviewed-by: Neal Cardwell ncardwell@google.com Reviewed-by: Soheil Hassas Yeganeh soheil@google.com Signed-off-by: David S. Miller davem@davemloft.net Conflicts: net/ipv4/tcp_timer.c Signed-off-by: Jiazhenyuan jiazhenyuan@uniontech.com #openEuler_contributor Signed-off-by: Laibin Qiu qiulaibin@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/tcp_timer.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8435cbad337d8..02337d633beb4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -158,7 +158,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); }
+static unsigned int tcp_model_timeout(struct sock *sk, + unsigned int boundary, + unsigned int rto_base) +{ + unsigned int linear_backoff_thresh, timeout;
+ linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * rto_base; + else + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + return jiffies_to_msecs(timeout); +} /** * retransmits_timed_out() - returns true if this connection has timed out * @sk: The current socket @@ -176,23 +189,15 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, unsigned int timeout) { - const unsigned int rto_base = TCP_RTO_MIN; - unsigned int linear_backoff_thresh, start_ts; + unsigned int start_ts;
if (!inet_csk(sk)->icsk_retransmits) return false;
start_ts = tcp_sk(sk)->retrans_stamp; - if (likely(timeout == 0)) { - linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); - - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * rto_base; - else - timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; - timeout = jiffies_to_msecs(timeout); - } + if (likely(timeout == 0)) + timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN); + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; }
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.4-rc2 commit 3256a2d6ab1f71f9a1bd2d7f6f18eb8108c48d17 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4AFRJ?from=project-issue CVE: NA
------------------------------------------------------------
The cited commit exposed an old retransmits_timed_out() bug which assumed it could call tcp_model_timeout() with TCP_RTO_MIN as rto_base for all states.
But flows in SYN_SENT or SYN_RECV state uses a different RTO base (1 sec instead of 200 ms, unless BPF choses another value)
This caused a reduction of SYN retransmits from 6 to 4 with the default /proc/sys/net/ipv4/tcp_syn_retries value.
Fixes: a41e8a88b06e ("tcp: better handle TCP_USER_TIMEOUT in SYN_SENT state") Signed-off-by: Eric Dumazet edumazet@google.com Cc: Yuchung Cheng ycheng@google.com Cc: Marek Majkowski marek@cloudflare.com Signed-off-by: David S. Miller davem@davemloft.net Conflicts: net/ipv4/tcp_timer.c Signed-off-by: Jiazhenyuan jiazhenyuan@uniontech.com #openEuler_contributor Signed-off-by: Laibin Qiu qiulaibin@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/tcp_timer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 02337d633beb4..f4e2d01c058d4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -195,8 +195,13 @@ static bool retransmits_timed_out(struct sock *sk, return false;
start_ts = tcp_sk(sk)->retrans_stamp; - if (likely(timeout == 0)) - timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN); + if (likely(timeout == 0)) { + unsigned int rto_base = TCP_RTO_MIN; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + rto_base = tcp_timeout_init(sk); + timeout = tcp_model_timeout(sk, boundary, rto_base); + }
return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; }