LTS patches round: four TCP fixes backported from stable-v4.19.318.
Neal Cardwell (1): UPSTREAM: tcp: fix DSACK undo in fast recovery to call tcp_try_to_open()
Yousuk Seung (1): tcp: add ece_ack flag to reno sack functions
Yuchung Cheng (1): net: tcp better handling of reordering then loss cases
zhang kai (1): tcp: tcp_mark_head_lost is only valid for sack-tcp
 net/ipv4/tcp_input.c | 97 ++++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 53 deletions(-)
From: Yousuk Seung <ysseung@google.com>
stable inclusion
from stable-v4.19.318
commit 57a672a35020884512ae64fbbcb5c70149424781
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAMPH5
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
---------------------------
[ Upstream commit c634e34f6ebfb75259e6ce467523fd3adf30d3d2 ]
Pass a boolean flag that tells the ECE state of the current ack to the reno sack functions. This is a pure refactor for future patches to improve tracking of delivered counts.
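Concretely, call sites that used to test FLAG_ECE themselves now compute the bit once and hand it down as a bool. A minimal sketch of the pattern (distilled from the diff below, not a compilable excerpt of tcp_input.c):

	/* callee: ECE state arrives as an explicit argument */
	static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
	{
		/* existing dupack accounting stays as-is; ece_ack is not yet
		 * used here and only matters for later delivered-count patches
		 */
	}

	/* caller, e.g. in tcp_fastretrans_alert() */
	bool ece_ack = flag & FLAG_ECE;

	tcp_add_reno_sack(sk, num_dupack, ece_ack);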
Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Stable-dep-of: a6458ab7fd4f ("UPSTREAM: tcp: fix DSACK undo in fast recovery to call tcp_try_to_open()")
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 net/ipv4/tcp_input.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f41df46a54df..935e39bc5d30 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1884,7 +1884,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 
 /* Emulate SACKs for SACKless connection: account for a new dupack. */
 
-static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
 {
 	if (num_dupack) {
 		struct tcp_sock *tp = tcp_sk(sk);
@@ -1902,7 +1902,7 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
 
 /* Account for ACK, ACKing some data in Reno Recovery phase. */
 
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2733,7 +2733,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
 		 * delivered. Lower inflight to clock out (re)tranmissions.
 		 */
 		if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
-			tcp_add_reno_sack(sk, num_dupack);
+			tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
 		else if (flag & FLAG_SND_UNA_ADVANCED)
 			tcp_reset_reno_sack(tp);
 	}
@@ -2816,6 +2816,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fast_rexmit = 0, flag = *ack_flag;
+	bool ece_ack = flag & FLAG_ECE;
 	bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
 				      tcp_force_fast_retransmit(sk));
 
@@ -2824,7 +2825,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 
 	/* Now state machine starts.
 	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
-	if (flag & FLAG_ECE)
+	if (ece_ack)
 		tp->prior_ssthresh = 0;
 
 	/* B. In all the states check for reneging SACKs. */
@@ -2865,7 +2866,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 	case TCP_CA_Recovery:
 		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
 			if (tcp_is_reno(tp))
-				tcp_add_reno_sack(sk, num_dupack);
+				tcp_add_reno_sack(sk, num_dupack, ece_ack);
 		} else {
 			if (tcp_try_undo_partial(sk, prior_snd_una))
 				return;
@@ -2891,7 +2892,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		if (tcp_is_reno(tp)) {
 			if (flag & FLAG_SND_UNA_ADVANCED)
 				tcp_reset_reno_sack(tp);
-			tcp_add_reno_sack(sk, num_dupack);
+			tcp_add_reno_sack(sk, num_dupack, ece_ack);
 		}
 
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
@@ -2915,7 +2916,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		}
 
 		/* Otherwise enter Recovery state */
-		tcp_enter_recovery(sk, (flag & FLAG_ECE));
+		tcp_enter_recovery(sk, ece_ack);
 		fast_rexmit = 1;
 	}
 
@@ -3091,7 +3092,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 			       u32 prior_snd_una,
-			       struct tcp_sacktag_state *sack)
+			       struct tcp_sacktag_state *sack, bool ece_ack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u64 first_ackt, last_ackt;
@@ -3229,7 +3230,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 	}
 
 	if (tcp_is_reno(tp)) {
-		tcp_remove_reno_sacks(sk, pkts_acked);
+		tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
 
 		/* If any of the cumulatively ACKed segments was
 		 * retransmitted, non-SACK case cannot confirm that
@@ -3734,7 +3735,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		goto no_queue;
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
+	flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
+				    flag & FLAG_ECE);
 
 	tcp_rack_update_reo_wnd(sk, &rs);
From: zhang kai <zhangkaiheb@126.com>
stable inclusion
from stable-v4.19.318
commit 552970cb728120695820ef9decb0933b3f9ce774
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAMPH5
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
---------------------------
[ Upstream commit 636ef28d6e4d174e424102466caf572b0406fb0e ]
tcp_mark_head_lost() is only used for SACK TCP now, so the tcp_is_sack()/tcp_is_reno() checks are removed from it.
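With the Reno branches gone, the loss-marking loop only counts SACKed segments and never needs to fragment an skb. Roughly, the resulting loop looks like this (a simplified sketch; the rbtree walk helper is assumed from the surrounding code and is not part of this diff):

	skb_rbtree_walk_from(skb) {
		/* new sequences sent during recovery are judged by SACK */
		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
			break;

		/* only SACKed segments advance the counter now */
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
			cnt += tcp_skb_pcount(skb);

		if (cnt > packets)
			break;

		tcp_skb_mark_lost(tp, skb);
	}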
Signed-off-by: zhang kai <zhangkaiheb@126.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Stable-dep-of: a6458ab7fd4f ("UPSTREAM: tcp: fix DSACK undo in fast recovery to call tcp_try_to_open()")
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 net/ipv4/tcp_input.c | 32 +++++++-------------------------
 1 file changed, 7 insertions(+), 25 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 935e39bc5d30..85e5690bd553 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2189,8 +2189,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 }
 
 /* Detect loss in event "A" above by marking head of queue up as lost.
- * For non-SACK(Reno) senders, the first "packets" number of segments
- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * For RFC3517 SACK, a segment is considered lost if it
  * has at least tp->reordering SACKed seqments above it; "packets" refers to
  * the maximum SACKed segments to pass before reaching this limit.
  */
@@ -2198,10 +2197,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	int cnt, oldcnt, lost;
-	unsigned int mss;
+	int cnt;
 	/* Use SACK to deduce losses of new sequences sent during recovery */
-	const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
+	const u32 loss_high = tp->snd_nxt;
 
 	WARN_ON(packets > tp->packets_out);
 	skb = tp->lost_skb_hint;
@@ -2224,26 +2222,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
 			break;
 
-		oldcnt = cnt;
-		if (tcp_is_reno(tp) ||
-		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 			cnt += tcp_skb_pcount(skb);
 
-		if (cnt > packets) {
-			if (tcp_is_sack(tp) ||
-			    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
-			    (oldcnt >= packets))
-				break;
-
-			mss = tcp_skb_mss(skb);
-			/* If needed, chop off the prefix to mark as lost. */
-			lost = (packets - oldcnt) * mss;
-			if (lost < skb->len &&
-			    tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
-					 lost, mss, GFP_ATOMIC) < 0)
-				break;
-			cnt = packets;
-		}
+		if (cnt > packets)
+			break;
 
 		tcp_skb_mark_lost(tp, skb);
 
@@ -2871,8 +2854,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 			if (tcp_try_undo_partial(sk, prior_snd_una))
 				return;
 			/* Partial ACK arrived. Force fast retransmit. */
-			do_lost = tcp_is_reno(tp) ||
-				  tcp_force_fast_retransmit(sk);
+			do_lost = tcp_force_fast_retransmit(sk);
 		}
 		if (tcp_try_undo_dsack(sk)) {
 			tcp_try_keep_open(sk);
From: Yuchung Cheng <ycheng@google.com>
stable inclusion
from stable-v4.19.318
commit 550968385e2f0df8e29c6431ab2dc0961499acb0
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAMPH5
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
---------------------------
[ Upstream commit a29cb6914681a55667436a9eb7a42e28da8cf387 ]
This patch aims to improve the situation when reordering and loss are occurring in the same flight of packets.
Previously the reordering would first induce a spurious recovery, then the subsequent ACK may undo the cwnd (e.g. based on the timestamps). However, the current loss recovery does not proceed to invoke RACK to install a reordering timer. If some packets are also lost, this may lead to a long RTO-based recovery. An example is https://groups.google.com/g/bbr-dev/c/OFHADvJbTEI
The solution is, after reverting the recovery, to always invoke RACK to either arm the RACK timer to fast retransmit after the reordering window, or restart the recovery if new loss is identified. Hence it is possible the sender may go from Recovery to Disorder/Open to Recovery again in one ACK.
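Put together, the TCP_CA_Recovery branch of tcp_fastretrans_alert() ends up with roughly the following shape (a condensed view of the result of the diff below, with the -/+ noise removed):

	case TCP_CA_Recovery:
		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
			if (tcp_is_reno(tp))
				tcp_add_reno_sack(sk, num_dupack, ece_ack);
		} else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
			return;

		if (tcp_try_undo_dsack(sk))
			tcp_try_keep_open(sk);

		/* always re-run RACK loss detection, even after an undo */
		tcp_identify_packet_loss(sk, ack_flag);
		if (icsk->icsk_ca_state != TCP_CA_Recovery) {
			/* the undo left Recovery; restart it if loss is evident */
			if (!tcp_time_to_recover(sk, flag))
				return;
			tcp_enter_recovery(sk, ece_ack);
		}
		break;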
Reported-by: mingkun bian <bianmingkun@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Stable-dep-of: a6458ab7fd4f ("UPSTREAM: tcp: fix DSACK undo in fast recovery to call tcp_try_to_open()")
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 net/ipv4/tcp_input.c | 45 +++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 85e5690bd553..4531d8381d24 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2723,8 +2723,17 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
 	*rexmit = REXMIT_LOST;
 }
 
+static bool tcp_force_fast_retransmit(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return after(tcp_highest_sack_seq(tp),
+		     tp->snd_una + tp->reordering * tp->mss_cache);
+}
+
 /* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
+				 bool *do_lost)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2749,7 +2758,9 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
 		tcp_undo_cwnd_reduction(sk, true);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
 		tcp_try_keep_open(sk);
-		return true;
+	} else {
+		/* Partial ACK arrived. Force fast retransmit. */
+		*do_lost = tcp_force_fast_retransmit(sk);
 	}
 	return false;
 }
@@ -2773,14 +2784,6 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
 	}
 }
 
-static bool tcp_force_fast_retransmit(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	return after(tcp_highest_sack_seq(tp),
-		     tp->snd_una + tp->reordering * tp->mss_cache);
-}
-
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2850,17 +2853,21 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
 			if (tcp_is_reno(tp))
 				tcp_add_reno_sack(sk, num_dupack, ece_ack);
-		} else {
-			if (tcp_try_undo_partial(sk, prior_snd_una))
-				return;
-			/* Partial ACK arrived. Force fast retransmit. */
-			do_lost = tcp_force_fast_retransmit(sk);
-		}
-		if (tcp_try_undo_dsack(sk)) {
-			tcp_try_keep_open(sk);
+		} else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
 			return;
-		}
+
+		if (tcp_try_undo_dsack(sk))
+			tcp_try_keep_open(sk);
+
 		tcp_identify_packet_loss(sk, ack_flag);
+		if (icsk->icsk_ca_state != TCP_CA_Recovery) {
+			if (!tcp_time_to_recover(sk, flag))
+				return;
+			/* Undo reverts the recovery state. If loss is evident,
+			 * starts a new recovery (e.g. reordering then loss);
+			 */
+			tcp_enter_recovery(sk, ece_ack);
+		}
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, num_dupack, rexmit);
From: Neal Cardwell <ncardwell@google.com>
stable inclusion
from stable-v4.19.318
commit 8b5fd51b3040ce2596d22a72767c66d7435853b6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAMPH5
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
---------------------------
[ Upstream commit a6458ab7fd4f427d4f6f54380453ad255b7fde83 ]
In some production workloads we noticed that connections could sometimes close extremely prematurely with ETIMEDOUT after transmitting only 1 TLP and RTO retransmission (when we would normally expect roughly tcp_retries2 = TCP_RETR2 = 15 RTOs before a connection closes with ETIMEDOUT).
From tracing we determined that these workloads can suffer from a scenario where in fast recovery, after some retransmits, a DSACK undo can happen at a point where the scoreboard is totally clear (we have retrans_out == sacked_out == lost_out == 0). In such cases, calling tcp_try_keep_open() means that we do not execute any code path that clears tp->retrans_stamp to 0. That means that tp->retrans_stamp can remain erroneously set to the start time of the undone fast recovery, even after the fast recovery is undone. If minutes or hours elapse, and then a TLP/RTO/RTO sequence occurs, then the start_ts value in retransmits_timed_out() (which is from tp->retrans_stamp) will be erroneously ancient (left over from the fast recovery undone via DSACKs). Thus this ancient tp->retrans_stamp value can cause the connection to die very prematurely with ETIMEDOUT via tcp_write_err().
The fix: we change DSACK undo in fast recovery (TCP_CA_Recovery) to call tcp_try_to_open() instead of tcp_try_keep_open(). This ensures that if no retransmits are in flight at the time of DSACK undo in fast recovery then we properly zero retrans_stamp. Note that calling tcp_try_to_open() is more consistent with other loss recovery behavior, since normal fast recovery (CA_Recovery) and RTO recovery (CA_Loss) both normally end when tp->snd_una meets or exceeds tp->high_seq and then in tcp_fastretrans_alert() the "default" switch case executes tcp_try_to_open(). Also note that by inspection this change to call tcp_try_to_open() implies at least one other nice bug fix, where now an ECE-marked DSACK that causes an undo will properly invoke tcp_enter_cwr() rather than ignoring the ECE mark.
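The behavioural difference hinges on that single call in the TCP_CA_Recovery branch; in sketch form (a paraphrase of the intent, not the literal kernel code):

	if (tcp_try_undo_dsack(sk))
		tcp_try_to_open(sk, flag);	/* was: tcp_try_keep_open(sk) */

	/* tcp_try_to_open() sees the ACK's flag word, so when no retransmits
	 * remain in flight it can zero tp->retrans_stamp, and an ECE-marked
	 * ACK now leads into tcp_enter_cwr() instead of being ignored.
	 */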
Fixes: c7d9d6a185a7 ("tcp: undo on DSACK during recovery")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4531d8381d24..a577d68b80cb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2857,7 +2857,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 			return;
 
 		if (tcp_try_undo_dsack(sk))
-			tcp_try_keep_open(sk);
+			tcp_try_to_open(sk, flag);
 
 		tcp_identify_packet_loss(sk, ack_flag);
 		if (icsk->icsk_ca_state != TCP_CA_Recovery) {
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/11209 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/W...