From: George Wilkie <gwilkie@vyatta.att-mail.com>
[ Upstream commit 2f3f7d1fa0d1039b24a55d127ed190f196fc3e79 ]
If you configure a route with multiple labels, e.g.

  ip route add 10.10.3.0/24 encap mpls 16/100 via 10.10.2.2 dev ens4

a warning is logged:

  kernel: [  130.561819] netlink: 'ip': attribute type 1 has an invalid length.

This happens because mpls_iptunnel_policy sets the type of MPLS_IPTUNNEL_DST to the fixed-size NLA_U32. Change it to a minimum size instead; nla_get_labels() does the remaining validation.
Fixes: e3e4712ec096 ("mpls: ip tunnel support")
Signed-off-by: George Wilkie <gwilkie@vyatta.att-mail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/mpls/mpls_iptunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 8141eb10752f..b556acebdc86 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -28,7 +28,7 @@
 #include "internal.h"
 
 static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
-	[MPLS_IPTUNNEL_DST]	= { .type = NLA_U32 },
+	[MPLS_IPTUNNEL_DST]	= { .len = sizeof(u32) },
 	[MPLS_IPTUNNEL_TTL]	= { .type = NLA_U8 },
 };
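A quick way to see why the fixed-size policy rejects a label stack is a small userspace model (hypothetical snippet, not part of the patch): each MPLS label stack entry is a 4-byte word, so two labels make an 8-byte attribute payload, which can never equal sizeof(u32):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* "encap mpls 16/100": two 4-byte label stack entries */
	const uint32_t labels[] = { 16, 100 };
	size_t payload = sizeof(labels);

	/* NLA_U32 demands exactly sizeof(u32) bytes; a minimum length of
	 * sizeof(u32) accepts any whole number of labels and lets
	 * nla_get_labels() finish the validation. */
	printf("payload = %zu bytes, NLA_U32 expects exactly %zu\n",
	       payload, sizeof(uint32_t));
	return 0;
}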
From: Mark Zhang <markz@mellanox.com>
[ Upstream commit ea77388b02270b0af8dc57f668f311235ea068f0 ]
Remove the "reserved_at_40" field to match the device specification.
Fixes: 84df61ebc69b ("net/mlx5: Add HW interfaces used by LAG")
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 177f11c96187..76b76b6aa83d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9053,8 +9053,6 @@ struct mlx5_ifc_query_lag_out_bits {
 
 	u8         syndrome[0x20];
 
-	u8         reserved_at_40[0x40];
-
 	struct mlx5_ifc_lagc_bits ctx;
 };
From: Jakub Kicinski <kuba@kernel.org>
[ Upstream commit 4c16d64ea04056f1b1b324ab6916019f6a064114 ]
Add missing netlink policy entry for FRA_TUN_ID.
Fixes: e7030878fc84 ("fib: Add fib rule match on tunnel id")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/net/fib_rules.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index a5aaf9a0c1c1..f5907336106a 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -117,6 +117,7 @@ struct fib_rule_notifier_info {
 	[FRA_OIFNAME]	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
 	[FRA_PRIORITY]	= { .type = NLA_U32 }, \
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
+	[FRA_TUN_ID]	= { .type = NLA_U64 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]	= { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
From: Alex Vesker <valex@mellanox.com>
stable inclusion from linux-4.19.116 commit dce1622d540119b9643c19ccb8b3953c37107582
--------------------------------
[ Upstream commit 41e684ef3f37ce6e5eac3fb5b9c7c1853f4b0447 ]
Until now the flex parser capability was used in ib_query_device() to indicate tunnel_offloads_caps support for mpls_over_gre/mpls_over_udp.
Newer devices and firmware will have configurations with the flex parser but without MPLS support.

Testing for the flex parser capability was a mistake; the tunnel_stateless capability was intended for detecting MPLS and was introduced at the same time as the flex parser capability.
Otherwise userspace will be incorrectly informed that a future device supports MPLS when it does not.
Link: https://lore.kernel.org/r/20200305123841.196086-1-leon@kernel.org
Cc: stable@vger.kernel.org # 4.17
Fixes: e818e255a58d ("IB/mlx5: Expose MPLS related tunneling offloads")
Signed-off-by: Alex Vesker <valex@mellanox.com>
Reviewed-by: Ariel Levkovich <lariel@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/infiniband/hw/mlx5/main.c | 6 ++----
 include/linux/mlx5/mlx5_ifc.h     | 9 ++++++++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 2db34f7b5ced..f41f3ff689c5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1070,12 +1070,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
 			resp.tunnel_offloads_caps |=
 				MLX5_IB_TUNNELED_OFFLOADS_GRE;
-		if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
-		    MLX5_FLEX_PROTO_CW_MPLS_GRE)
+		if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre))
 			resp.tunnel_offloads_caps |=
 				MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
-		if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
-		    MLX5_FLEX_PROTO_CW_MPLS_UDP)
+		if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp))
 			resp.tunnel_offloads_caps |=
 				MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
 	}
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 76b76b6aa83d..b87b1569d15b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -672,7 +672,14 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         swp[0x1];
 	u8         swp_csum[0x1];
 	u8         swp_lso[0x1];
-	u8         reserved_at_23[0xd];
+	u8         cqe_checksum_full[0x1];
+	u8         tunnel_stateless_geneve_tx[0x1];
+	u8         tunnel_stateless_mpls_over_udp[0x1];
+	u8         tunnel_stateless_mpls_over_gre[0x1];
+	u8         tunnel_stateless_vxlan_gpe[0x1];
+	u8         tunnel_stateless_ipv4_over_vxlan[0x1];
+	u8         tunnel_stateless_ip_over_ip[0x1];
+	u8         reserved_at_2a[0x6];
 	u8         max_vxlan_udp_ports[0x8];
 	u8         reserved_at_38[0x6];
 	u8         max_geneve_opt_len[0x1];
From: Roman Mashak <mrv@mojatatu.com>
stable inclusion from linux-4.19.126 commit 5d878dd4bac33a1ac08b720b5227a42b991c1968
--------------------------------
[ Upstream commit b15e62631c5f19fea9895f7632dae9c1b27fe0cd ]
When a new action is installed, the firstuse field of 'tcf_t' is explicitly set to 0. A value of zero means "new action, not yet used"; when a packet hits the action, 'firstuse' is stamped with the current jiffies value.

tcf_tm_dump() should return 0 for firstuse if the action has not yet been hit.
Fixes: 48d8ee1694dd ("net sched actions: aggregate dumping of actions timeinfo")
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/net/act_api.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 0c82d7ea6ee1..c48b750de2fc 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -67,7 +67,8 @@ static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm)
 {
 	dtm->install = jiffies_to_clock_t(jiffies - stm->install);
 	dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse);
-	dtm->firstuse = jiffies_to_clock_t(jiffies - stm->firstuse);
+	dtm->firstuse = stm->firstuse ?
+		jiffies_to_clock_t(jiffies - stm->firstuse) : 0;
 	dtm->expires = jiffies_to_clock_t(stm->expires);
 }
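The fixed logic is easy to model in userspace (hypothetical snippet mirroring the tcf_tm_dump() change above, with jiffies_to_clock_t() omitted): a zero firstuse means the action was never hit, so the dump must not compute an age from it:

#include <stdio.h>

/* firstuse == 0 is the "never hit" sentinel, so report 0 instead of
 * now - 0, which would look like an enormous age. */
static unsigned long dump_firstuse(unsigned long now, unsigned long firstuse)
{
	return firstuse ? now - firstuse : 0;
}

int main(void)
{
	unsigned long now = 100000;	/* stand-in for jiffies */

	printf("never hit:    %lu\n", dump_firstuse(now, 0));	 /* 0 */
	printf("hit at 99000: %lu\n", dump_firstuse(now, 99000)); /* 1000 */
	return 0;
}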
From: Steffen Klassert <steffen.klassert@secunet.com>
stable inclusion from linux-4.19.137 commit 8b2a6581c88f277d99ff6a530d590603c68426b2
--------------------------------
[ Upstream commit 101dde4207f1daa1fda57d714814a03835dccc3f ]
The commits "xfrm: Move dst->path into struct xfrm_dst" and "net: Create and use new helper xfrm_dst_child()." changed xfrm bundle handling under the assumption that xdst->path and dst->child are not a NULL pointer only if dst->xfrm is not a NULL pointer. That is true with one exception. If the xfrm hold queue is used to wait until a SA is installed by the key manager, we create a dummy bundle without a valid dst->xfrm pointer. The current xfrm bundle handling crashes in that case. Fix this by extending the NULL check of dst->xfrm with a test of the DST_XFRM_QUEUE flag.
Fixes: 0f6c480f23f4 ("xfrm: Move dst->path into struct xfrm_dst")
Fixes: b92cf4aab8e6 ("net: Create and use new helper xfrm_dst_child().")
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/net/xfrm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index b9c71cf57905..2ea1dc1e06b9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1024,7 +1024,7 @@ struct xfrm_dst {
 static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
 {
 #ifdef CONFIG_XFRM
-	if (dst->xfrm) {
+	if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE)) {
 		const struct xfrm_dst *xdst = (const struct xfrm_dst *) dst;
 
 		return xdst->path;
@@ -1036,7 +1036,7 @@ static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
 static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
 {
 #ifdef CONFIG_XFRM
-	if (dst->xfrm) {
+	if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE)) {
 		struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
 		return xdst->child;
 	}
From: Julian Anastasov <ja@ssi.bg>
stable inclusion from linux-4.19.140 commit 7106f943302247ed9fcde84afdca06cbe9e19dce
--------------------------------
[ Upstream commit f0a5e4d7a594e0fe237d3dfafb069bb82f80f42f ]
YangYuxi is reporting that connection reuse is causing a one-second delay when a SYN hits an existing connection in TIME_WAIT state. Such a delay was added to give time to expire both the IPVS connection and the corresponding conntrack. This was considered a rare case at that time, but it is causing problems for some environments such as Kubernetes.

As nf_conntrack_tcp_packet() can decide to release the conntrack in TIME_WAIT state and to replace it with a fresh NEW conntrack, we can use this to allow rescheduling just by tuning our check: if the conntrack is confirmed, we can not schedule the connection to a different real server and the one-second delay still applies, but if a new conntrack was created, we are free to select a new real server without any delay.
YangYuxi lists some of the problem reports:
- One second connection delay in masquerading mode: https://marc.info/?t=151683118100004&r=1&w=2
- IPVS low throughput #70747 https://github.com/kubernetes/kubernetes/issues/70747
- Apache Bench can fill up ipvs service proxy in seconds #544 https://github.com/cloudnativelabs/kube-router/issues/544
- Additional 1s latency in `host -> service IP -> pod` https://github.com/kubernetes/kubernetes/issues/90854
Fixes: f719e3754ee2 ("ipvs: drop first packet to redirect conntrack")
Co-developed-by: YangYuxi <yx.atom1@gmail.com>
Signed-off-by: YangYuxi <yx.atom1@gmail.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Reviewed-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/net/ip_vs.h             | 10 ++++------
 net/netfilter/ipvs/ip_vs_core.c | 12 +++++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index af0ede9ad4d0..c31e54a41b5c 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1614,18 +1614,16 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
 }
 #endif /* CONFIG_IP_VS_NFCT */
 
-/* Really using conntrack? */
-static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp,
-					     struct sk_buff *skb)
+/* Using old conntrack that can not be redirected to another real server? */
+static inline bool ip_vs_conn_uses_old_conntrack(struct ip_vs_conn *cp,
+						 struct sk_buff *skb)
 {
 #ifdef CONFIG_IP_VS_NFCT
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
-	if (!(cp->flags & IP_VS_CONN_F_NFCT))
-		return false;
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct)
+	if (ct && nf_ct_is_confirmed(ct))
 		return true;
 #endif
 	return false;
 }
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index a71f777d1353..d5e4329579e2 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1928,14 +1928,14 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
 
 	conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
 	if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
-		bool uses_ct = false, resched = false;
+		bool old_ct = false, resched = false;
 
 		if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
 		    unlikely(!atomic_read(&cp->dest->weight))) {
 			resched = true;
-			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+			old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
 		} else if (is_new_conn_expected(cp, conn_reuse_mode)) {
-			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+			old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
 			if (!atomic_read(&cp->n_control)) {
 				resched = true;
 			} else {
@@ -1943,15 +1945,17 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
 				 * that uses conntrack while it is still
 				 * referenced by controlled connection(s).
 				 */
-				resched = !uses_ct;
+				resched = !old_ct;
 			}
 		}
 
 		if (resched) {
+			if (!old_ct)
+				cp->flags &= ~IP_VS_CONN_F_NFCT;
 			if (!atomic_read(&cp->n_control))
 				ip_vs_conn_expire_now(cp);
 			__ip_vs_conn_put(cp);
-			if (uses_ct)
+			if (old_ct)
 				return NF_DROP;
 			cp = NULL;
 		}
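The resulting decision can be modeled in a few lines of userspace C (illustrative names, not kernel code): a confirmed conntrack keeps the old drop-and-retransmit path with its one-second delay, while a fresh NEW conntrack is rescheduled immediately:

#include <stdbool.h>
#include <stdio.h>

/* A confirmed conntrack pins the connection to its current real
 * server; an unconfirmed, freshly created one lets IPVS pick a new
 * real server right away. */
static const char *handle_reused_conn(bool ct_confirmed)
{
	if (ct_confirmed)
		return "NF_DROP: wait for the old conntrack to expire";
	return "reschedule: select a new real server now";
}

int main(void)
{
	printf("confirmed ct: %s\n", handle_reused_conn(true));
	printf("fresh NEW ct: %s\n", handle_reused_conn(false));
	return 0;
}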
From: John Ogness <john.ogness@linutronix.de>
stable inclusion from linux-4.19.140 commit ff00ef9aac0a8e17f52169c25f70e5c7e88c87c9
--------------------------------
[ Upstream commit 88fd1cb80daa20af063bce81e1fad14e945a8dc4 ]
After @blk_fill_in_prog_lock is acquired, there is an early-out vnet situation that can occur. In that case, the rwlock needs to be released.
Also, since @blk_fill_in_prog_lock is only acquired when @tp_version is exactly TPACKET_V3, only release it on that exact condition as well.
And finally, add sparse annotation so that it is clearer that prb_fill_curr_block() and prb_clear_blk_fill_status() are acquiring and releasing @blk_fill_in_prog_lock, respectively. sparse is still unable to understand the balance, but the warnings are now on a higher level that make more sense.
Fixes: 632ca50f2cbd ("af_packet: TPACKET_V3: replace busy-wait loop")
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/packet/af_packet.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4c2f431e367b..c5a325697642 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -949,6 +949,7 @@ static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
 }
 
 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
+	__releases(&pkc->blk_fill_in_prog_lock)
 {
 	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
 
 	atomic_dec(&pkc->blk_fill_in_prog);
@@ -996,6 +997,7 @@ static void prb_fill_curr_block(char *curr,
 				struct tpacket_kbdq_core *pkc,
 				struct tpacket_block_desc *pbd,
 				unsigned int len)
+	__acquires(&pkc->blk_fill_in_prog_lock)
 {
 	struct tpacket3_hdr *ppd;
 
@@ -2275,8 +2277,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (do_vnet &&
 	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
 				    sizeof(struct virtio_net_hdr),
-				    vio_le(), true, 0))
+				    vio_le(), true, 0)) {
+		if (po->tp_version == TPACKET_V3)
+			prb_clear_blk_fill_status(&po->rx_ring);
 		goto drop_n_account;
+	}
 
 	if (po->tp_version <= TPACKET_V2) {
 		packet_increment_rx_head(po, &po->rx_ring);
@@ -2382,7 +2387,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		__clear_bit(slot_id, po->rx_ring.rx_owner_map);
 		spin_unlock(&sk->sk_receive_queue.lock);
 		sk->sk_data_ready(sk);
-	} else {
+	} else if (po->tp_version == TPACKET_V3) {
 		prb_clear_blk_fill_status(&po->rx_ring);
 	}
From: Miaohe Lin <linmiaohe@huawei.com>
stable inclusion from linux-4.19.140 commit 520f0e37ba9ea2ca70135d99924652a106f85072
--------------------------------
[ Upstream commit ce787a5a074a86f76f5d3fd804fa78e01bfb9e89 ]
We should fput() file iff FDPUT_FPUT is set. So we should set fput_needed accordingly.
Fixes: 00e188ef6a7e ("sockfd_lookup_light(): switch to fdget^W^Waway from fget_light")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/socket.c b/net/socket.c
index 1030a612423b..29169045dcfe 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -474,7 +474,7 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
 	if (f.file) {
 		sock = sock_from_file(f.file, err);
 		if (likely(sock)) {
-			*fput_needed = f.flags;
+			*fput_needed = f.flags & FDPUT_FPUT;
 			return sock;
 		}
 		fdput(f);
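A userspace model of the masking fix (the FDPUT_* values mirror include/linux/file.h): storing the raw flags made a later sockfd_put() treat FDPUT_POS_UNLOCK as "reference taken":

#include <stdio.h>

#define FDPUT_FPUT       1	/* a file reference was taken */
#define FDPUT_POS_UNLOCK 2	/* only the f_pos lock is held */

int main(void)
{
	unsigned int flags = FDPUT_POS_UNLOCK;	/* no extra reference */

	/* Nonzero fput_needed means "call fput()", so the unmasked
	 * value caused a spurious fput() and a use-after-free risk. */
	printf("buggy fput_needed: %u\n", flags);		/* 2 */
	printf("fixed fput_needed: %u\n", flags & FDPUT_FPUT);	/* 0 */
	return 0;
}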
From: Ira Weiny <ira.weiny@intel.com>
stable inclusion from linux-4.19.140 commit bc8ce7de36b00ad3d416a03555cb4f4bdd4519ef
--------------------------------
[ Upstream commit b06c19d9f827f6743122795570bfc0c72db482b0 ]
When MSG_OOB is specified to tls_device_sendpage() the mapped page is never unmapped.
Hold off mapping the page until after the flags are checked and the page is actually needed.
Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure")
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/tls/tls_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 8f40bbfd60ea..575d62130578 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -476,7 +476,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
 			int offset, size_t size, int flags)
 {
 	struct iov_iter	msg_iter;
-	char *kaddr = kmap(page);
+	char *kaddr;
 	struct kvec iov;
 	int rc;
 
@@ -490,6 +490,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
 		goto out;
 	}
 
+	kaddr = kmap(page);
 	iov.iov_base = kaddr + offset;
 	iov.iov_len = size;
 	iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size);
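The pattern generalizes: acquire the resource only after the early-return checks. A minimal userspace sketch (malloc/free standing in for kmap/kunmap, names hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Acquiring before the flag check would leak on every rejected
 * call; acquiring after it makes the early-out path resource-free. */
static int send_fixed(int reject, char **out)
{
	if (reject)
		return -1;	/* early out: nothing to release */

	*out = malloc(16);	/* stands in for kmap(page) */
	if (!*out)
		return -1;
	strcpy(*out, "payload");
	return 0;
}

int main(void)
{
	char *buf = NULL;

	if (send_fixed(0, &buf) == 0) {
		printf("%s\n", buf);
		free(buf);	/* stands in for kunmap(page) */
	}
	return 0;
}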
From: Tim Froidcoeur <tim.froidcoeur@tessares.net>
stable inclusion from linux-4.19.140 commit 42f4480a37d682da5d144488f4c7443fd41d5067
--------------------------------
[ Upstream commit 62ffc589abb176821662efc4525ee4ac0b9c3894 ]
Refactor the fastreuse update code in inet_csk_get_port into a small helper function that can be called from other places.
Acked-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Tim Froidcoeur <tim.froidcoeur@tessares.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Conflicts:
	net/ipv4/inet_connection_sock.c
[yyl: resolve the conflicts introduced by l3mdev code]
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/net/inet_connection_sock.h |  4 ++
 net/ipv4/inet_connection_sock.c    | 97 ++++++++++++++++--------------
 2 files changed, 57 insertions(+), 44 deletions(-)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 371b3b45fd5c..2d5220ab0600 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -313,5 +313,9 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
 int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 			       char __user *optval, unsigned int optlen);
 
+/* update the fast reuse flag when adding a socket */
+void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
+			       struct sock *sk);
+
 struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
 
 #endif /* _INET_CONNECTION_SOCK_H */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 9023b2df54c0..6da30fd1dbef 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -288,6 +288,57 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
 				     ipv6_only_sock(sk), true, false);
 }
 
+void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
+			       struct sock *sk)
+{
+	kuid_t uid = sock_i_uid(sk);
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+
+	if (hlist_empty(&tb->owners)) {
+		tb->fastreuse = reuse;
+		if (sk->sk_reuseport) {
+			tb->fastreuseport = FASTREUSEPORT_ANY;
+			tb->fastuid = uid;
+			tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+			tb->fast_ipv6_only = ipv6_only_sock(sk);
+			tb->fast_sk_family = sk->sk_family;
+#if IS_ENABLED(CONFIG_IPV6)
+			tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+		} else {
+			tb->fastreuseport = 0;
+		}
+	} else {
+		if (!reuse)
+			tb->fastreuse = 0;
+		if (sk->sk_reuseport) {
+			/* We didn't match or we don't have fastreuseport set on
+			 * the tb, but we have sk_reuseport set on this socket
+			 * and we know that there are no bind conflicts with
+			 * this socket in this tb, so reset our tb's reuseport
+			 * settings so that any subsequent sockets that match
+			 * our current socket will be put on the fast path.
+			 *
+			 * If we reset we need to set FASTREUSEPORT_STRICT so we
+			 * do extra checking for all subsequent sk_reuseport
+			 * socks.
+			 */
+			if (!sk_reuseport_match(tb, sk)) {
+				tb->fastreuseport = FASTREUSEPORT_STRICT;
+				tb->fastuid = uid;
+				tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+				tb->fast_ipv6_only = ipv6_only_sock(sk);
+				tb->fast_sk_family = sk->sk_family;
+#if IS_ENABLED(CONFIG_IPV6)
+				tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+			}
+		} else {
+			tb->fastreuseport = 0;
+		}
+	}
+}
+
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
  * We try to allocate an odd port (and leave even ports for connect())
@@ -300,7 +351,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 	struct inet_bind_hashbucket *head;
 	struct net *net = sock_net(sk);
 	struct inet_bind_bucket *tb = NULL;
-	kuid_t uid = sock_i_uid(sk);
 	int l3mdev;
 
 	l3mdev = inet_sk_bound_l3mdev(sk);
@@ -337,49 +387,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 			goto fail_unlock;
 	}
 success:
-	if (hlist_empty(&tb->owners)) {
-		tb->fastreuse = reuse;
-		if (sk->sk_reuseport) {
-			tb->fastreuseport = FASTREUSEPORT_ANY;
-			tb->fastuid = uid;
-			tb->fast_rcv_saddr = sk->sk_rcv_saddr;
-			tb->fast_ipv6_only = ipv6_only_sock(sk);
-			tb->fast_sk_family = sk->sk_family;
-#if IS_ENABLED(CONFIG_IPV6)
-			tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
-#endif
-		} else {
-			tb->fastreuseport = 0;
-		}
-	} else {
-		if (!reuse)
-			tb->fastreuse = 0;
-		if (sk->sk_reuseport) {
-			/* We didn't match or we don't have fastreuseport set on
-			 * the tb, but we have sk_reuseport set on this socket
-			 * and we know that there are no bind conflicts with
-			 * this socket in this tb, so reset our tb's reuseport
-			 * settings so that any subsequent sockets that match
-			 * our current socket will be put on the fast path.
-			 *
-			 * If we reset we need to set FASTREUSEPORT_STRICT so we
-			 * do extra checking for all subsequent sk_reuseport
-			 * socks.
-			 */
-			if (!sk_reuseport_match(tb, sk)) {
-				tb->fastreuseport = FASTREUSEPORT_STRICT;
-				tb->fastuid = uid;
-				tb->fast_rcv_saddr = sk->sk_rcv_saddr;
-				tb->fast_ipv6_only = ipv6_only_sock(sk);
-				tb->fast_sk_family = sk->sk_family;
-#if IS_ENABLED(CONFIG_IPV6)
-				tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
-#endif
-			}
-		} else {
-			tb->fastreuseport = 0;
-		}
-	}
+	inet_csk_update_fastreuse(tb, sk);
+
 	if (!inet_csk(sk)->icsk_bind_hash)
 		inet_bind_hash(sk, tb, port);
 	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
From: Tim Froidcoeur <tim.froidcoeur@tessares.net>
stable inclusion from linux-4.19.140 commit 0e866a435676b6c6fa546bdc1f6709be6367685e
--------------------------------
[ Upstream commit d76f3351cea2d927fdf70dd7c06898235035e84e ]
In the case of TPROXY, bind_conflict optimizations for SO_REUSEADDR or SO_REUSEPORT are broken, possibly resulting in O(n) instead of O(1) bind behaviour or in the incorrect reuse of a bind.
The kernel keeps track, for each bind_bucket, of whether all sockets in the bind_bucket support SO_REUSEADDR or SO_REUSEPORT, in two fastreuse flags. These flags allow skipping the costly bind_conflict check when possible (meaning when all sockets have the proper SO_REUSE option).
For every socket added to a bind_bucket, these flags need to be updated. As soon as a socket that does not support reuse is added, the flag is set to false and will never go back to true, unless the bind_bucket is deleted.
Note that there is no mechanism to re-evaluate these flags when a socket is removed (this might make sense when removing a socket that would not allow reuse; this leaves room for a future patch).
For this optimization to work, it is mandatory that these flags are properly initialized and updated.
When a child socket is created from a listen socket in __inet_inherit_port, the TPROXY case could create a new bind bucket without properly initializing these flags, thus preventing the optimization from working. Alternatively, a socket not allowing reuse could be added to an existing bind bucket without updating the flags, causing bind_conflict to never be called when it should be.
Call inet_csk_update_fastreuse when __inet_inherit_port decides to create a new bind_bucket or use a different bind_bucket than the one of the listen socket.
Fixes: 093d282321da ("tproxy: fix hash locking issue when using port redirection in __inet_inherit_port()")
Acked-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Tim Froidcoeur <tim.froidcoeur@tessares.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/ipv4/inet_hashtables.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index d3b4b06fea4e..941477baa8d2 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -167,6 +167,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
 				return -ENOMEM;
 			}
 		}
+		inet_csk_update_fastreuse(tb, child);
 	}
 	inet_bind_hash(child, tb, port);
 	spin_unlock(&head->lock);
From: Kees Cook <keescook@chromium.org>
stable inclusion from linux-4.19.141 commit f90339a4eccf1768f044ac98ec6d2a8afeeab58d
--------------------------------
commit d9539752d23283db4692384a634034f451261e29 upstream.
Add missed sock updates to compat path via a new helper, which will be used more in coming patches. (The net/core/scm.c code is left as-is here to assist with -stable backports for the compat path.)
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: stable@vger.kernel.org
Fixes: 48a87cc26c13 ("net: netprio: fd passed in SCM_RIGHTS datagram not set correctly")
Fixes: d84295067fc7 ("net: net_cls: fd passed in SCM_RIGHTS datagram not set correctly")
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/net/sock.h |  4 ++++
 net/compat.c       |  1 +
 net/core/sock.c    | 21 +++++++++++++++++++++
 3 files changed, 26 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index c7d91efe98e5..b5fb4d367406 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -863,6 +863,8 @@ static inline int sk_memalloc_socks(void)
 {
 	return static_branch_unlikely(&memalloc_socks_key);
 }
+
+void __receive_sock(struct file *file);
 #else
 
 static inline int sk_memalloc_socks(void)
@@ -870,6 +872,8 @@ static inline int sk_memalloc_socks(void)
 	return 0;
 }
 
+static inline void __receive_sock(struct file *file)
+{ }
 #endif
 
 static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
diff --git a/net/compat.c b/net/compat.c
index b17436eee701..981424bd707d 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -289,6 +289,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
 			break;
 		}
 		/* Bump the usage count and install the file. */
+		__receive_sock(fp[i]);
 		fd_install(new_fd, get_file(fp[i]));
 	}
 
diff --git a/net/core/sock.c b/net/core/sock.c
index d5d8402e02bd..cb27f9cc7ab9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2658,6 +2658,27 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *
 }
 EXPORT_SYMBOL(sock_no_mmap);
 
+/*
+ * When a file is received (via SCM_RIGHTS, etc), we must bump the
+ * various sock-based usage counts.
+ */
+void __receive_sock(struct file *file)
+{
+	struct socket *sock;
+	int error;
+
+	/*
+	 * The resulting value of "error" is ignored here since we only
+	 * need to take action when the file is a socket and testing
+	 * "sock" for NULL is sufficient.
+	 */
+	sock = sock_from_file(file, &error);
+	if (sock) {
+		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+		sock_update_classid(&sock->sk->sk_cgrp_data);
+	}
+}
+
 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset,
 			 size_t size, int flags)
 {
 	ssize_t res;
From: Chuck Lever <chuck.lever@oracle.com>
stable inclusion from linux-4.19.142 commit 88f7857c90e457e44ff45872d012e37cb3e3a41e
--------------------------------
[ Upstream commit 64d26422516b2e347b32e6d9b1d40b3c19a62aae ]
During a connection tear down, the Receive queue is flushed before the device resources are freed. Typically, all the Receives flush with IB_WR_FLUSH_ERR.
However, any pending successful Receives flush with IB_WR_SUCCESS, and the server automatically posts a fresh Receive to replace the completing one. This happens even after the connection has closed and the RQ is drained. Receives that are posted after the RQ is drained appear never to complete, causing a Receive resource leak. The leaked Receive buffer is left DMA-mapped.
To prevent these late-posted recv_ctxt's from leaking, block new Receive posting after XPT_CLOSE is set.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 16c8174658fd..252495ff9010 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -268,6 +268,8 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
 
+	if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+		return 0;
 	ctxt = svc_rdma_recv_ctxt_get(rdma);
 	if (!ctxt)
 		return -ENOMEM;
From: Przemyslaw Patynowski <przemyslawx.patynowski@intel.com>
stable inclusion from linux-4.19.142 commit 04efb368bc0bd90cbb07b8e9fa79dcea11f5b8ce
--------------------------------
[ Upstream commit 4bd5e02a2ed1575c2f65bd3c557a077dd399f0e8 ]
A trusted VF with unicast promiscuous mode set could listen to the TX traffic of other VFs. Set unicast promiscuous mode to RX traffic only, if the VSI has a port VLAN configured. Rename the misleading I40E_AQC_SET_VSI_PROMISC_TX bit to I40E_AQC_SET_VSI_PROMISC_RX_ONLY. Align unicast promiscuous with VLAN to the one without VLAN.
Fixes: 6c41a7606967 ("i40e: Add promiscuous on VLAN support")
Fixes: 3b1200891b7f ("i40e: When in promisc mode apply promisc mode to Tx Traffic as well")
Signed-off-by: Przemyslaw Patynowski <przemyslawx.patynowski@intel.com>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 .../net/ethernet/intel/i40e/i40e_adminq_cmd.h |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_common.c | 35 ++++++++++++++-----
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 80e3eec6134e..a5e5e7e14e6c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1206,7 +1206,7 @@ struct i40e_aqc_set_vsi_promiscuous_modes {
 #define I40E_AQC_SET_VSI_PROMISC_BROADCAST	0x04
 #define I40E_AQC_SET_VSI_DEFAULT		0x08
 #define I40E_AQC_SET_VSI_PROMISC_VLAN		0x10
-#define I40E_AQC_SET_VSI_PROMISC_TX		0x8000
+#define I40E_AQC_SET_VSI_PROMISC_RX_ONLY	0x8000
 	__le16	seid;
 #define I40E_AQC_VSI_PROM_CMD_SEID_MASK		0x3FF
 	__le16	vlan_tag;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index eb0ae6ab01e2..e75b4c4872c0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1970,6 +1970,21 @@ i40e_status i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags,
 	return status;
 }
 
+/**
+ * i40e_is_aq_api_ver_ge
+ * @aq: pointer to AdminQ info containing HW API version to compare
+ * @maj: API major value
+ * @min: API minor value
+ *
+ * Assert whether current HW API version is greater/equal than provided.
+ **/
+static bool i40e_is_aq_api_ver_ge(struct i40e_adminq_info *aq, u16 maj,
+				  u16 min)
+{
+	return (aq->api_maj_ver > maj ||
+		(aq->api_maj_ver == maj && aq->api_min_ver >= min));
+}
+
 /**
  * i40e_aq_add_vsi
  * @hw: pointer to the hw struct
@@ -2095,18 +2110,16 @@ i40e_status i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw,
 
 	if (set) {
 		flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST;
-		if (rx_only_promisc &&
-		    (((hw->aq.api_maj_ver == 1) && (hw->aq.api_min_ver >= 5)) ||
-		     (hw->aq.api_maj_ver > 1)))
-			flags |= I40E_AQC_SET_VSI_PROMISC_TX;
+		if (rx_only_promisc && i40e_is_aq_api_ver_ge(&hw->aq, 1, 5))
+			flags |= I40E_AQC_SET_VSI_PROMISC_RX_ONLY;
 	}
 
 	cmd->promiscuous_flags = cpu_to_le16(flags);
 
 	cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_UNICAST);
-	if (((hw->aq.api_maj_ver >= 1) && (hw->aq.api_min_ver >= 5)) ||
-	    (hw->aq.api_maj_ver > 1))
-		cmd->valid_flags |= cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_TX);
+	if (i40e_is_aq_api_ver_ge(&hw->aq, 1, 5))
+		cmd->valid_flags |=
+			cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_RX_ONLY);
 
 	cmd->seid = cpu_to_le16(seid);
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -2203,11 +2216,17 @@ enum i40e_status_code i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw,
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_vsi_promiscuous_modes);
 
-	if (enable)
+	if (enable) {
 		flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST;
+		if (i40e_is_aq_api_ver_ge(&hw->aq, 1, 5))
+			flags |= I40E_AQC_SET_VSI_PROMISC_RX_ONLY;
+	}
 
 	cmd->promiscuous_flags = cpu_to_le16(flags);
 	cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_UNICAST);
+	if (i40e_is_aq_api_ver_ge(&hw->aq, 1, 5))
+		cmd->valid_flags |=
+			cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_RX_ONLY);
 	cmd->seid = cpu_to_le16(seid);
 	cmd->vlan_tag = cpu_to_le16(vid | I40E_AQC_SET_VSI_VLAN_VALID);
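The helper's comparison is worth spelling out, since the old open-coded checks were duplicated and easy to get wrong. A userspace model with the same logic as i40e_is_aq_api_ver_ge() above (illustrative, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* The firmware API version (maj.min) is "new enough" when the major
 * is greater, or the major matches and the minor is at least min. */
static bool api_ver_ge(unsigned maj, unsigned min,
		       unsigned want_maj, unsigned want_min)
{
	return maj > want_maj || (maj == want_maj && min >= want_min);
}

int main(void)
{
	printf("1.4 >= 1.5? %d\n", api_ver_ge(1, 4, 1, 5));	/* 0 */
	printf("1.5 >= 1.5? %d\n", api_ver_ge(1, 5, 1, 5));	/* 1 */
	printf("2.0 >= 1.5? %d\n", api_ver_ge(2, 0, 1, 5));	/* 1 */
	return 0;
}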
From: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
stable inclusion from linux-4.19.142 commit 46858b2f609ffd81d6f437a6099d4782fa2e33a8
--------------------------------
[ Upstream commit 5b6d4a7f20b09c47ca598760f6dafd554af8b6d5 ]
Fix a system crash by waiting for the reset recovery process to finish before starting the driver remove procedure. Now the VSI is released only if it is not in reset recovery mode. Without this fix, it was possible to begin driver removal while another command still needed the reset recovery procedure, which resulted in a NULL pointer dereference: the VSI used by the ethtool process had already been cleared by the remove process.
[ 6731.508665] BUG: kernel NULL pointer dereference, address: 0000000000000000
[ 6731.508668] #PF: supervisor read access in kernel mode
[ 6731.508670] #PF: error_code(0x0000) - not-present page
[ 6731.508671] PGD 0 P4D 0
[ 6731.508674] Oops: 0000 [#1] SMP PTI
[ 6731.508679] Hardware name: Intel Corporation S2600WT2R/S2600WT2R, BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017
[ 6731.508694] RIP: 0010:i40e_down+0x252/0x310 [i40e]
[ 6731.508696] Code: c7 78 de fa c0 e8 61 02 3a c1 66 83 bb f6 0c 00 00 00 0f 84 bf 00 00 00 45 31 e4 45 31 ff eb 03 41 89 c7 48 8b 83 98 0c 00 00 <4a> 8b 3c 20 e8 a5 79 02 00 48 83 bb d0 0c 00 00 00 74 10 48 8b 83
[ 6731.508698] RSP: 0018:ffffb75ac7b3faf0 EFLAGS: 00010246
[ 6731.508700] RAX: 0000000000000000 RBX: ffff9c9874bd5000 RCX: 0000000000000007
[ 6731.508701] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffff9c987f4d9780
[ 6731.508703] RBP: ffffb75ac7b3fb30 R08: 0000000000005b60 R09: 0000000000000004
[ 6731.508704] R10: ffffb75ac64fbd90 R11: 0000000000000001 R12: 0000000000000000
[ 6731.508706] R13: ffff9c97a08e0000 R14: ffff9c97a08e0a68 R15: 0000000000000000
[ 6731.508708] FS:  00007f2617cd2740(0000) GS:ffff9c987f4c0000(0000) knlGS:0000000000000000
[ 6731.508710] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6731.508711] CR2: 0000000000000000 CR3: 0000001e765c4006 CR4: 00000000003606e0
[ 6731.508713] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6731.508714] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 6731.508715] Call Trace:
[ 6731.508734]  i40e_vsi_close+0x84/0x90 [i40e]
[ 6731.508742]  i40e_quiesce_vsi.part.98+0x3c/0x40 [i40e]
[ 6731.508749]  i40e_pf_quiesce_all_vsi+0x55/0x60 [i40e]
[ 6731.508757]  i40e_prep_for_reset+0x59/0x130 [i40e]
[ 6731.508765]  i40e_reconfig_rss_queues+0x5a/0x120 [i40e]
[ 6731.508774]  i40e_set_channels+0xda/0x170 [i40e]
[ 6731.508778]  ethtool_set_channels+0xe9/0x150
[ 6731.508781]  dev_ethtool+0x1b94/0x2920
[ 6731.508805]  dev_ioctl+0xc2/0x590
[ 6731.508811]  sock_do_ioctl+0xae/0x150
[ 6731.508813]  sock_ioctl+0x34f/0x3c0
[ 6731.508821]  ksys_ioctl+0x98/0xb0
[ 6731.508828]  __x64_sys_ioctl+0x1a/0x20
[ 6731.508831]  do_syscall_64+0x57/0x1c0
[ 6731.508835]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: 4b8164467b85 ("i40e: Add common function for finding VSI by type")
Signed-off-by: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index a74b01bf581e..3200c75b9ed2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -14152,6 +14152,9 @@ static void i40e_remove(struct pci_dev *pdev)
 	i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), 0);
 	i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), 0);
 
+	while (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state))
+		usleep_range(1000, 2000);
+
 	/* no more scheduling of any task */
 	set_bit(__I40E_SUSPENDED, pf->state);
 	set_bit(__I40E_DOWN, pf->state);
From: Jarod Wilson <jarod@redhat.com>
stable inclusion from linux-4.19.142 commit a74506f2d91c7395e353202e616aa8712a606bff
--------------------------------
[ Upstream commit 4ca0d9ac3fd8f9f90b72a15d8da2aca3ffb58418 ]
Broadcast mode bonds transmit a copy of all traffic simultaneously out of all interfaces, so the "speed" of the bond isn't really the aggregate of all interfaces, but rather, the speed of the slowest active interface.
Also, the type of the speed field is u32, not unsigned long, so adjust that accordingly, as required to make min() function here without complaining about mismatching types.
Fixes: bb5b052f751b ("bond: add support to read speed and duplex via ethtool")
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: "David S. Miller" <davem@davemloft.net>
CC: netdev@vger.kernel.org
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/net/bonding/bond_main.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 11429df74306..76fd5fc437eb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4200,13 +4200,23 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }
 
+static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
+{
+	if (speed == 0 || speed == SPEED_UNKNOWN)
+		speed = slave->speed;
+	else
+		speed = min(speed, slave->speed);
+
+	return speed;
+}
+
 static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
 					   struct ethtool_link_ksettings *cmd)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
-	unsigned long speed = 0;
 	struct list_head *iter;
 	struct slave *slave;
+	u32 speed = 0;
 
 	cmd->base.duplex = DUPLEX_UNKNOWN;
 	cmd->base.port = PORT_OTHER;
@@ -4218,8 +4228,13 @@ static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
 	 */
 	bond_for_each_slave(bond, slave, iter) {
 		if (bond_slave_can_tx(slave)) {
-			if (slave->speed != SPEED_UNKNOWN)
-				speed += slave->speed;
+			if (slave->speed != SPEED_UNKNOWN) {
+				if (BOND_MODE(bond) == BOND_MODE_BROADCAST)
+					speed = bond_mode_bcast_speed(slave,
+								      speed);
+				else
+					speed += slave->speed;
+			}
 			if (cmd->base.duplex == DUPLEX_UNKNOWN &&
 			    slave->duplex != DUPLEX_UNKNOWN)
 				cmd->base.duplex = slave->duplex;
From: Cong Wang <xiyou.wangcong@gmail.com>
stable inclusion from linux-4.19.142 commit 8dcf188973549e89ed751a10911eab5ff91a98e8
--------------------------------
[ Upstream commit 832707021666411d04795c564a4adea5d6b94f17 ]
When we tear down a network namespace, we unregister all the netdevices within it. So we may queue a slave device and a bonding device together in the same unregister queue.
If the only slave device is non-ethernet, it would automatically unregister the bonding device as well. Thus, we may end up unregistering the bonding device twice.
Work around this special case by checking reg_state.
Fixes: 9b5e383c11b0 ("net: Introduce unregister_netdevice_many()")
Reported-by: syzbot+af23e7f3e0a7e10c8b67@syzkaller.appspotmail.com
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/net/bonding/bond_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 76fd5fc437eb..ee7138a92d5e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2029,7 +2029,8 @@ static int bond_release_and_destroy(struct net_device *bond_dev,
 	int ret;
 
 	ret = __bond_release_one(bond_dev, slave_dev, false, true);
-	if (ret == 0 && !bond_has_slaves(bond)) {
+	if (ret == 0 && !bond_has_slaves(bond) &&
+	    bond_dev->reg_state != NETREG_UNREGISTERING) {
 		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
 		netdev_info(bond_dev, "Destroying bond %s\n",
 			    bond_dev->name);
From: Jiri Wiesner <jwiesner@suse.com>
stable inclusion from linux-4.19.142 commit ae405ea98693f3991192d172cbbcea04c53b0aba
--------------------------------
[ Upstream commit 0410d07190961ac526f05085765a8d04d926545b ]
When the ARP monitor is used for link detection, ARP replies are validated for all slaves (arp_validate=3), and fail_over_mac is set to active, two slaves of an active-backup bond may get stuck in a state where both of them are active and pass packets that they receive to the bond. This state makes IPv6 duplicate address detection fail. The state is reached thus:

1. The current active slave goes down because the ARP target is not reachable.
2. The current ARP slave is chosen and made active.
3. A new slave is enslaved. This new slave becomes the current active slave and can reach the ARP target.

As a result, the current ARP slave stays active after the enslave action has finished and the log is littered with "PROBE BAD" messages:
bond0: PROBE: c_arp ens10 && cas ens11 BAD
The workaround is to remove the slave with "going back" status from the bond and re-enslave it. This issue was encountered when DPDK PMD interfaces were being enslaved to an active-backup bond.
It would be possible to fix the issue in bond_enslave() or bond_change_active_slave(), but the ARP monitor was fixed instead to keep most of the actions changing the current ARP slave in the ARP monitor code. The current ARP slave is set as inactive and backup during the commit phase. A new state, BOND_LINK_FAIL, has been introduced for slaves in the context of the ARP monitor. This allows administrators to see how slaves are rotated for sending ARP requests and attempts are made to find a new active slave.
Fixes: b2220cad583c9 ("bonding: refactor ARP active-backup monitor")
Signed-off-by: Jiri Wiesner <jwiesner@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/net/bonding/bond_main.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ee7138a92d5e..d32e32e79174 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2773,6 +2773,9 @@ static int bond_ab_arp_inspect(struct bonding *bond)
 			if (bond_time_in_interval(bond, last_rx, 1)) {
 				bond_propose_link_state(slave, BOND_LINK_UP);
 				commit++;
+			} else if (slave->link == BOND_LINK_BACK) {
+				bond_propose_link_state(slave, BOND_LINK_FAIL);
+				commit++;
 			}
 			continue;
 		}
@@ -2883,6 +2886,19 @@ static void bond_ab_arp_commit(struct bonding *bond)
 
 			continue;
 
+		case BOND_LINK_FAIL:
+			bond_set_slave_link_state(slave, BOND_LINK_FAIL,
+						  BOND_SLAVE_NOTIFY_NOW);
+			bond_set_slave_inactive_flags(slave,
+						      BOND_SLAVE_NOTIFY_NOW);
+
+			/* A slave has just been enslaved and has become
+			 * the current active slave.
+			 */
+			if (rtnl_dereference(bond->curr_active_slave))
+				RCU_INIT_POINTER(bond->current_arp_slave, NULL);
+			continue;
+
 		default:
 			netdev_err(bond->dev, "impossible: new_link %d on slave %s\n",
 				   slave->link_new_state, slave->dev->name);
@@ -2932,8 +2948,6 @@ static bool bond_ab_arp_probe(struct bonding *bond)
 			return should_notify_rtnl;
 	}
 
-	bond_set_slave_inactive_flags(curr_arp_slave, BOND_SLAVE_NOTIFY_LATER);
-
 	bond_for_each_slave_rcu(bond, slave, iter) {
 		if (!found && !before && bond_slave_is_up(slave))
 			before = slave;
From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
stable inclusion from linux-4.19.143 commit c3f242a608040074ff54fc67d74b168408c92899
--------------------------------
[ Upstream commit 272502fcb7cda01ab07fc2fcff82d1d2f73d43cc ]
When receiving an IPv4 packet inside an IPv6 GRE packet, and the IP6_TNL_F_RCV_DSCP_COPY flag is set on the tunnel, the IPv4 header would get corrupted. This is due to the common ip6_tnl_rcv() function assuming that the inner header is always IPv6. This patch checks the tunnel protocol for IPv4 inner packets, but still defaults to IPv6.
Fixes: 308edfdf1563 ("gre6: Cleanup GREv6 receive path, call common GRE functions")
Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/ipv6/ip6_tunnel.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 8e70a015c792..b825ac025d5b 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -865,7 +865,15 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
 		struct metadata_dst *tun_dst,
 		bool log_ecn_err)
 {
-	return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate,
+	int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+				    const struct ipv6hdr *ipv6h,
+				    struct sk_buff *skb);
+
+	dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate;
+	if (tpi->proto == htons(ETH_P_IP))
+		dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate;
+
+	return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
 			     log_ecn_err);
 }
 EXPORT_SYMBOL(ip6_tnl_rcv);
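The fix is a classic protocol dispatch: choose the handler from the inner protocol rather than assuming one. A self-contained userspace model (names illustrative, not kernel code):

#include <stdio.h>

typedef int (*decap_fn)(const char *pkt);

static int decap_ipv4(const char *pkt) { printf("ipv4: %s\n", pkt); return 0; }
static int decap_ipv6(const char *pkt) { printf("ipv6: %s\n", pkt); return 0; }

#define ETH_P_IP   0x0800	/* inner IPv4, as in if_ether.h */
#define ETH_P_IPV6 0x86DD	/* inner IPv6 */

static int tnl_rcv(unsigned proto, const char *pkt)
{
	decap_fn decap = decap_ipv6;	/* IPv6 stays the default */

	if (proto == ETH_P_IP)
		decap = decap_ipv4;	/* don't corrupt an IPv4 header */
	return decap(pkt);
}

int main(void)
{
	tnl_rcv(ETH_P_IP, "inner v4 header left intact");
	tnl_rcv(ETH_P_IPV6, "inner v6 header");
	return 0;
}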
From: Miaohe Lin <linmiaohe@huawei.com>
stable inclusion from linux-4.19.143 commit ad270a5a9a04923da92238eaabfe04ad3cb06c16
--------------------------------
[ Upstream commit 55eff0eb7460c3d50716ed9eccf22257b046ca92 ]
We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). So we should pull VLAN_HLEN + sizeof(unsigned short) in skb_vlan_untag() or we may access the wrong data.
Fixes: 0d5501c1c828 ("net: Always untag vlan-tagged traffic on input.")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/core/skbuff.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0629ca89ab74..af6e9028716d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5128,8 +5128,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (unlikely(!skb))
 		goto err_free;
-
-	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+	/* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
+	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
 		goto err_free;
 
 	vhdr = (struct vlan_hdr *)skb->data;
From: Cong Wang <xiyou.wangcong@gmail.com>
stable inclusion from linux-4.19.143 commit 0d43753b0273731997a180f11d426d22f61bbb7d
--------------------------------
[ Upstream commit 47733f9daf4fe4f7e0eb9e273f21ad3a19130487 ]
__tipc_nl_compat_dumpit() has two callers, and it expects them to pass a valid nlmsghdr via arg->data. This header is artificial and crafted just for __tipc_nl_compat_dumpit().
tipc_nl_compat_publ_dump() does so by putting a genlmsghdr as well as some nested attribute, TIPC_NLA_SOCK. But the other caller tipc_nl_compat_dumpit() does not, this leaves arg->data uninitialized on this call path.
Fix this by just adding a similar nlmsghdr without any payload in tipc_nl_compat_dumpit().
This bug has existed since day 1, but the recent commit 6ea67769ff33 ("net: tipc: prepare attrs in __tipc_nl_compat_dumpit()") makes it easier to trigger.
Reported-and-tested-by: syzbot+0e7181deafa7e0b79923@syzkaller.appspotmail.com
Fixes: d0796d1ef63d ("tipc: convert legacy nl bearer dump to nl compat")
Cc: Jon Maloy <jmaloy@redhat.com>
Cc: Ying Xue <ying.xue@windriver.com>
Cc: Richard Alpe <richard.alpe@ericsson.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/tipc/netlink_compat.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 29e684054abe..f8e111218a0e 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -255,8 +255,9 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
 static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
 				 struct tipc_nl_compat_msg *msg)
 {
-	int err;
+	struct nlmsghdr *nlh;
 	struct sk_buff *arg;
+	int err;
 
 	if (msg->req_type && (!msg->req_size ||
 			      !TLV_CHECK_TYPE(msg->req, msg->req_type)))
@@ -285,6 +286,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
 		return -ENOMEM;
 	}
 
+	nlh = nlmsg_put(arg, 0, 0, tipc_genl_family.id, 0, NLM_F_MULTI);
+	if (!nlh) {
+		kfree_skb(arg);
+		kfree_skb(msg->rep);
+		msg->rep = NULL;
+		return -EMSGSIZE;
+	}
+	nlmsg_end(arg, nlh);
+
 	err = __tipc_nl_compat_dumpit(cmd, msg, arg);
 	if (err) {
 		kfree_skb(msg->rep);
From: Mahesh Bandewar <maheshb@google.com>
stable inclusion from linux-4.19.143 commit 76791cccd967c382384c6164df4c0e9bf9b9c61f
--------------------------------
[ Upstream commit d0f5c7076e01fef6fcb86988d9508bf3ce258bd4 ]
Processing NETDEV_FEAT_CHANGE causes IPvlan links to lose NETIF_F_LLTX feature because of the incorrect handling of features in ipvlan_fix_features().
--before--
lpaa10:~# ethtool -k ipvl0 | grep tx-lockless
tx-lockless: on [fixed]
lpaa10:~# ethtool -K ipvl0 tso off
Cannot change tcp-segmentation-offload
Actual changes:
vlan-challenged: off [fixed]
tx-lockless: off [fixed]
lpaa10:~# ethtool -k ipvl0 | grep tx-lockless
tx-lockless: off [fixed]
lpaa10:~#

--after--
lpaa10:~# ethtool -k ipvl0 | grep tx-lockless
tx-lockless: on [fixed]
lpaa10:~# ethtool -K ipvl0 tso off
Cannot change tcp-segmentation-offload
Could not change any device features
lpaa10:~# ethtool -k ipvl0 | grep tx-lockless
tx-lockless: on [fixed]
lpaa10:~#
Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.")
Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Aichun Li <liaichun@huawei.com>
Reviewed-by: wangxiaopeng <wangxiaopeng7@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 8609b58b05cb..403852adaff5 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -224,12 +224,21 @@ static void ipvlan_port_destroy(struct net_device *dev)
 	kfree(port);
 }
 
+#define IPVLAN_ALWAYS_ON_OFLOADS \
+	(NETIF_F_SG | NETIF_F_HW_CSUM | \
+	 NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL)
+
+#define IPVLAN_ALWAYS_ON \
+	(IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED)
+
 #define IPVLAN_FEATURES \
-	(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
+	(NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
 	 NETIF_F_GSO | NETIF_F_TSO | NETIF_F_GSO_ROBUST | \
 	 NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \
 	 NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)
 
+	/* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */
+
 #define IPVLAN_STATE_MASK \
 	((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))
 
@@ -243,7 +252,9 @@ static int ipvlan_init(struct net_device *dev)
 	dev->state = (dev->state & ~IPVLAN_STATE_MASK) |
 		     (phy_dev->state & IPVLAN_STATE_MASK);
 	dev->features = phy_dev->features & IPVLAN_FEATURES;
-	dev->features |= NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED;
+	dev->features |= IPVLAN_ALWAYS_ON;
+	dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES;
+	dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS;
 	dev->gso_max_size = phy_dev->gso_max_size;
 	dev->gso_max_segs = phy_dev->gso_max_segs;
 	dev->hard_header_len = phy_dev->hard_header_len;
@@ -372,7 +383,14 @@ static netdev_features_t ipvlan_fix_features(struct net_device *dev,
 {
 	struct ipvl_dev *ipvlan = netdev_priv(dev);
 
-	return features & (ipvlan->sfeatures | ~IPVLAN_FEATURES);
+	features |= NETIF_F_ALL_FOR_ALL;
+	features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES);
+	features = netdev_increment_features(ipvlan->phy_dev->features,
+					     features, features);
+	features |= IPVLAN_ALWAYS_ON;
+	features &= (IPVLAN_FEATURES | IPVLAN_ALWAYS_ON);
+
+	return features;
 }
 
 static void ipvlan_change_rx_flags(struct net_device *dev, int change)
@@ -877,10 +895,9 @@ static int ipvlan_device_event(struct notifier_block *unused,
 
 	case NETDEV_FEAT_CHANGE:
 		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
-			ipvlan->dev->features = dev->features & IPVLAN_FEATURES;
 			ipvlan->dev->gso_max_size = dev->gso_max_size;
 			ipvlan->dev->gso_max_segs = dev->gso_max_segs;
-			netdev_features_change(ipvlan->dev);
+			netdev_update_features(ipvlan->dev);
 		}
 		break;
From: Mike Christie michael.christie@oracle.com
stable inclusion from linux-4.19.143 commit 2229e50f25a5d9e6fa9aa263530978a8c83ff944
--------------------------------
[ Upstream commit fa39ab5184d64563cd36f2fb5f0d3fbad83a432c ]
ixgbe_fcoe_ddp_setup() can be called from the main I/O path and is called with a spin_lock held, so we have to use GFP_ATOMIC allocation instead of GFP_KERNEL.
Link: https://lore.kernel.org/r/1596831813-9839-1-git-send-email-michael.christie@... cc: Hannes Reinecke hare@suse.de Reviewed-by: Lee Duncan lduncan@suse.com Signed-off-by: Mike Christie michael.christie@oracle.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c index ccd852ad62a4..d50c5b55da18 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c @@ -192,7 +192,7 @@ static int ixgbe_fcoe_ddp_setup(struct net_device *netdev, u16 xid, }
/* alloc the udl from per cpu ddp pool */ - ddp->udl = dma_pool_alloc(ddp_pool->pool, GFP_KERNEL, &ddp->udp); + ddp->udl = dma_pool_alloc(ddp_pool->pool, GFP_ATOMIC, &ddp->udp); if (!ddp->udl) { e_err(drv, "failed allocated ddp context\n"); goto out_noddp_unmap;
From: Pablo Neira Ayuso pablo@netfilter.org
stable inclusion from linux-4.19.144 commit 88c161599f17249deed573befc5f85f4cd919e7b
--------------------------------
[ Upstream commit 6f03bf43ee05b31d3822def2a80f11b3591c55b3 ]
Kernel sends an empty NFTA_SET_USERDATA attribute with no value if userspace adds a set with no NFTA_SET_USERDATA attribute.
Fixes: e6d8ecac9e68 ("netfilter: nf_tables: Add new attributes into nft_set to store user data.") Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1b8a53081632..159ec1533c98 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3204,7 +3204,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; }
- if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) + if (set->udata && + nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure;
desc = nla_nest_start(skb, NFTA_SET_DESC);
From: Florian Westphal fw@strlen.de
stable inclusion from linux-4.19.144 commit 86c459915577b0c87287b11f4dde44f908885461
--------------------------------
[ Upstream commit 1e105e6afa6c3d32bfb52c00ffa393894a525c27 ]
The following bug was reported via irc:

nft list ruleset
set knock_candidates_ipv4 {
        type ipv4_addr . inet_service
        size 65535
        elements = { 127.0.0.1 . 123,
                     127.0.0.1 . 123 }
}
..
udp dport 123 add @knock_candidates_ipv4 { ip saddr . 123 }
udp dport 123 add @knock_candidates_ipv4 { ip saddr . udp dport }
It should not have been possible to add a duplicate set entry.
After some debugging it turned out that the problem is the immediate value (123) in the second-to-last rule.
Concatenations use 32-bit registers, i.e. the elements are 8 bytes each, not 6, and it turns out the kernel inserted:
inet firewall @knock_candidates_ipv4
        element 0100007f ffff7b00 : 0 [end]
        element 0100007f 00007b00 : 0 [end]
Note the non-zero upper bits of the first element. It turns out that nft_immediate doesn't zero the destination register, but this is needed when the length isn't a multiple of 4.
Furthermore, the zeroing in nft_payload is broken: we can't use [len / 4] = 0 -- if len is a multiple of 4, the index is off by one.
Skip the zeroing in that case, and use a conditional instead of (len - 1) / 4.
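To make the arithmetic concrete, here is a minimal userspace sketch of the corrected padding rule (the standalone form and function name are illustrative; the in-tree change is in the diff below):

----------
#include <stdint.h>
#include <string.h>

#define NFT_REG32_SIZE 4

/* For len == 6 (ipv4_addr . inet_service), len / 4 == 1: dst[1]
 * holds a partial word and must be zeroed before the memcpy().
 * For len == 8 the copy fills dst[0] and dst[1] exactly, and
 * dst[len / 4] == dst[2] is the *next* register -- writing it
 * unconditionally is the off-by-one described above.
 */
static void data_copy(uint32_t *dst, const void *src, unsigned int len)
{
        if (len % NFT_REG32_SIZE)
                dst[len / NFT_REG32_SIZE] = 0;
        memcpy(dst, src, len);
}
----------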
Fixes: 49499c3e6e18 ("netfilter: nf_tables: switch registers to 32 bit addressing") Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_payload.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 024636c31adc..93253ba1eeac 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -130,6 +130,8 @@ static inline u8 nft_reg_load8(u32 *sreg) static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { + if (len % NFT_REG32_SIZE) + dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); }
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 19446a89a2a8..b1a9f330a51f 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -79,7 +79,9 @@ static void nft_payload_eval(const struct nft_expr *expr, u32 *dest = ®s->data[priv->dreg]; int offset;
- dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb))
From: Pablo Neira Ayuso pablo@netfilter.org
stable inclusion from linux-4.19.144 commit 9ad2f018636c6741c41867f14d49d9441b50930d
--------------------------------
[ Upstream commit ee921183557af39c1a0475f982d43b0fcac25e2e ]
The frontend callback reports EAGAIN to nfnetlink to retry a command; this is used to signal that module autoloading is required. Unfortunately, nlmsg_unicast() also reports EAGAIN in case the receiver socket buffer gets full, so it enters a busy loop.
This patch updates nfnetlink_unicast() to turn EAGAIN into ENOBUFS and to use nlmsg_unicast(). Remove the flags argument of nfnetlink_unicast(), since it is always MSG_DONTWAIT in the existing code, which is exactly what nlmsg_unicast() passes to netlink_unicast().
Fixes: 96518518cc41 ("netfilter: add nftables") Reported-by: Phil Sutter phil@nwl.cc Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/netfilter/nfnetlink.h | 3 +- net/netfilter/nf_tables_api.c | 61 ++++++++++++++--------------- net/netfilter/nfnetlink.c | 11 ++++-- net/netfilter/nfnetlink_log.c | 3 +- net/netfilter/nfnetlink_queue.c | 2 +- 5 files changed, 40 insertions(+), 40 deletions(-)
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index cf09ab37b45b..e713476ff29d 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -43,8 +43,7 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags); +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid);
static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 159ec1533c98..5b4632826dc6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -718,11 +718,11 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) - goto err; + goto err_fill_table_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err: +err_fill_table_info: kfree_skb(skb2); return err; } @@ -1383,11 +1383,11 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) - goto err; + goto err_fill_chain_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err: +err_fill_chain_info: kfree_skb(skb2); return err; } @@ -2488,11 +2488,11 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule); if (err < 0) - goto err; + goto err_fill_rule_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err: +err_fill_rule_info: kfree_skb(skb2); return err; } @@ -3377,11 +3377,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); if (err < 0) - goto err; + goto err_fill_set_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err: +err_fill_set_info: kfree_skb(skb2); return err; } @@ -4157,24 +4157,18 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) - goto err1; + return err;
err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, NFT_MSG_NEWSETELEM, 0, set, &elem); if (err < 0) - goto err2; + goto err_fill_setelem;
- err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT); - /* This avoids a loop in nfnetlink. */ - if (err < 0) - goto err1; + return nfnetlink_unicast(skb, ctx->net, ctx->portid);
- return 0; -err2: +err_fill_setelem: kfree_skb(skb); -err1: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return err; }
/* called with rcu_read_lock held */ @@ -5273,10 +5267,11 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) - goto err; + goto err_fill_obj_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_obj_info: kfree_skb(skb2); return err; } @@ -5933,10 +5928,11 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, NFT_MSG_NEWFLOWTABLE, 0, family, flowtable); if (err < 0) - goto err; + goto err_fill_flowtable_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_flowtable_info: kfree_skb(skb2); return err; } @@ -6097,10 +6093,11 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq); if (err < 0) - goto err; + goto err_fill_gen_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_gen_info: kfree_skb(skb2); return err; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 7f2c1915763f..9bacddc761ba 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -148,10 +148,15 @@ int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) } EXPORT_SYMBOL_GPL(nfnetlink_set_err);
-int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) { - return netlink_unicast(net->nfnl, skb, portid, flags); + int err; + + err = nlmsg_unicast(net->nfnl, skb, portid); + if (err == -EAGAIN) + err = -ENOBUFS; + + return err; } EXPORT_SYMBOL_GPL(nfnetlink_unicast);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 332c69d27b47..25298b3eb854 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -359,8 +359,7 @@ __nfulnl_send(struct nfulnl_instance *inst) goto out; } } - nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, - MSG_DONTWAIT); + nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid); out: inst->qlen = 0; inst->skb = NULL; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d33094f4ec41..f81a3ce0fe48 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -685,7 +685,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, *packet_id_ptr = htonl(entry->id);
/* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1;
From: Shung-Hsi Yu shung-hsi.yu@suse.com
stable inclusion from linux-4.19.144 commit f00d82c3fb4368afb41cba89b287801a7888627c
--------------------------------
[ Upstream commit cbedcb044e9cc4e14bbe6658111224bb923094f4 ]
On machines with much memory (> 2 TByte) and log_mtts_per_seg == 0, a max_order of 31 will be passed to mlx4_buddy_init(), which results in s = BITS_TO_LONGS(1 << 31) becoming a negative value, leading to kvmalloc_array() failure when it is converted to size_t.
mlx4_core 0000:b1:00.0: Failed to initialize memory region table, aborting
mlx4_core: probe of 0000:b1:00.0 failed with error -12
Fix this issue by changing the left shifting operand from a signed literal to an unsigned one.
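A standalone illustration of the overflow, assuming the usual two's-complement wraparound on a 64-bit system and a simplified BITS_TO_LONGS():

----------
#include <stdio.h>

#define BITS_PER_LONG 64
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
        int max_order = 31;

        /* 1 is a signed int, so 1 << 31 wraps to INT_MIN on typical
         * systems and BITS_TO_LONGS() goes negative ...
         */
        long bad = BITS_TO_LONGS(1 << max_order);
        /* ... while an unsigned literal keeps the arithmetic
         * well-defined and positive.
         */
        unsigned long good = BITS_TO_LONGS(1UL << max_order);

        printf("signed: %ld, unsigned: %lu\n", bad, good);
        return 0;
}
----------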
Fixes: 225c7b1feef1 ("IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters") Signed-off-by: Shung-Hsi Yu shung-hsi.yu@suse.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/ethernet/mellanox/mlx4/mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 1a11bc0e1612..cfa0bba3940f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -114,7 +114,7 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) goto err_out;
for (i = 0; i <= buddy->max_order; ++i) { - s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + s = BITS_TO_LONGS(1UL << (buddy->max_order - i)); buddy->bits[i] = kvmalloc_array(s, sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!buddy->bits[i]) goto err_out_free;
From: Xin Long lucien.xin@gmail.com
stable inclusion from linux-4.19.145 commit 84cfc87866b7e01b591999cc4cdd176d78a3a69c
--------------------------------
[ Upstream commit 3106ecb43a05dc3e009779764b9da245a5d082de ]
With bh disabled for the whole of sctp_get_port_local(), when snum == 0 and too many ports have been used, the do-while loop can hold the CPU for a long time and cause a soft lockup:
[ ] watchdog: BUG: soft lockup - CPU#11 stuck for 22s!
[ ] RIP: 0010:native_queued_spin_lock_slowpath+0x4de/0x940
[ ] Call Trace:
[ ]  _raw_spin_lock+0xc1/0xd0
[ ]  sctp_get_port_local+0x527/0x650 [sctp]
[ ]  sctp_do_bind+0x208/0x5e0 [sctp]
[ ]  sctp_autobind+0x165/0x1e0 [sctp]
[ ]  sctp_connect_new_asoc+0x355/0x480 [sctp]
[ ]  __sctp_connect+0x360/0xb10 [sctp]
There's no need to disable bh for the whole of sctp_get_port_local(). So fix this CPU stall by removing the local_bh_disable() call at the beginning and using spin_lock_bh() instead.
The same thing was actually done for inet_csk_get_port() in commit ea8add2b1903 ("tcp/dccp: better use of ephemeral ports in bind()").
Thanks to Marcelo for pointing the buggy code out.
v1->v2:
 - use cond_resched() to yield cpu to other tasks if needed, as Eric noticed.
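Schematically, the search loop then looks like this (a sketch of the pattern, not the full function; see the diff below for the real code):

----------
/* Per-bucket locking with BHs disabled only across one chain walk,
 * plus a reschedule point between buckets, so a long search over
 * the ephemeral port range can no longer stall the CPU.
 */
do {
        rover++;
        /* ... wrap rover and skip reserved ports ... */
        head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), rover)];
        spin_lock_bh(&head->lock);
        /* ... scan the chain for a conflicting pp->port ... */
        spin_unlock_bh(&head->lock);
        cond_resched();
} while (--remaining > 0);
----------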
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Ying Xu yinxu@redhat.com Signed-off-by: Xin Long lucien.xin@gmail.com Acked-by: Marcelo Ricardo Leitner marcelo.leitner@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/sctp/socket.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bff075c26130..733c3f448008 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7646,8 +7646,6 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
pr_debug("%s: begins, snum:%d\n", __func__, snum);
- local_bh_disable(); - if (snum == 0) { /* Search for an available port. */ int low, high, remaining, index; @@ -7666,20 +7664,21 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) continue; index = sctp_phashfn(sock_net(sk), rover); head = &sctp_port_hashtable[index]; - spin_lock(&head->lock); + spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) if ((pp->port == rover) && net_eq(sock_net(sk), pp->net)) goto next; break; next: - spin_unlock(&head->lock); + spin_unlock_bh(&head->lock); + cond_resched(); } while (--remaining > 0);
/* Exhausted local port range during search? */ ret = 1; if (remaining <= 0) - goto fail; + return ret;
/* OK, here is the one we will use. HEAD (the port * hash table list entry) is non-NULL and we hold it's @@ -7694,7 +7693,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) * port iterator, pp being NULL. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), snum)]; - spin_lock(&head->lock); + spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) { if ((pp->port == snum) && net_eq(pp->net, sock_net(sk))) goto pp_found; @@ -7776,10 +7775,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) ret = 0;
fail_unlock: - spin_unlock(&head->lock); - -fail: - local_bh_enable(); + spin_unlock_bh(&head->lock); return ret; }
From: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp
stable inclusion from linux-4.19.145 commit 553d1bb7f4d1d2b86b7527c10be31b2f5f78bd8c
--------------------------------
[ Upstream commit 2a63866c8b51a3f72cea388dfac259d0e14c4ba6 ]
syzbot is reporting a hung task at nbd_ioctl() [1], for there are two problems regarding TIPC's connectionless socket's shutdown() operation.
----------
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/nbd.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
        const int fd = open("/dev/nbd0", 3);

        alarm(5);
        ioctl(fd, NBD_SET_SOCK, socket(PF_TIPC, SOCK_DGRAM, 0));
        ioctl(fd, NBD_DO_IT, 0); /* To be interrupted by SIGALRM. */
        return 0;
}
----------
One problem is that wait_for_completion() (reached from flush_workqueue() from nbd_start_device_ioctl() from nbd_ioctl()) cannot complete when nbd_start_device_ioctl() receives a signal at wait_event_interruptible(): tipc_shutdown() (reached from kernel_sock_shutdown(SHUT_RDWR) from nbd_mark_nsock_dead() from sock_shutdown() from nbd_start_device_ioctl()) fails to wake up the WQ thread sleeping at wait_woken() (reached from tipc_wait_for_rcvmsg() from sock_recvmsg() from sock_xmit() from nbd_read_stat() from recv_work(), scheduled by nbd_start_device() from nbd_start_device_ioctl()). Fix this problem by always invoking sk->sk_state_change() (like inet_shutdown() does) when tipc_shutdown() is called.
The other problem is that tipc_wait_for_rcvmsg() cannot return when tipc_shutdown() is called, for tipc_shutdown() sets sk->sk_shutdown to SEND_SHUTDOWN (despite "how" being SHUT_RDWR) while tipc_wait_for_rcvmsg() needs sk->sk_shutdown set to RCV_SHUTDOWN or SHUTDOWN_MASK. Fix this problem by setting sk->sk_shutdown to SHUTDOWN_MASK (like inet_shutdown() does) when the socket is connectionless.
[1] https://syzkaller.appspot.com/bug?id=3fe51d307c1f0a845485cf1798aa059d12bf18b...
Reported-by: syzbot syzbot+e36f41d207137b5d12f7@syzkaller.appspotmail.com Signed-off-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/tipc/socket.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 45d28dc74b9c..78edb4926506 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2565,18 +2565,21 @@ static int tipc_shutdown(struct socket *sock, int how) lock_sock(sk);
__tipc_shutdown(sock, TIPC_CONN_SHUTDOWN); - sk->sk_shutdown = SEND_SHUTDOWN; + if (tipc_sk_type_connectionless(sk)) + sk->sk_shutdown = SHUTDOWN_MASK; + else + sk->sk_shutdown = SEND_SHUTDOWN;
if (sk->sk_state == TIPC_DISCONNECTING) { /* Discard any unreceived messages */ __skb_queue_purge(&sk->sk_receive_queue);
- /* Wake up anyone sleeping in poll */ - sk->sk_state_change(sk); res = 0; } else { res = -ENOTCONN; } + /* Wake up anyone sleeping in poll. */ + sk->sk_state_change(sk);
release_sock(sk); return res;
From: Jakub Kicinski kuba@kernel.org
stable inclusion from linux-4.19.145 commit 9f313bcb3b3d021f9b2f04f6d8464f8e9e309452
--------------------------------
[ Upstream commit 96e97bc07e90f175a8980a22827faf702ca4cb30 ]
napi_disable() makes sure to set the NAPI_STATE_NPSVC bit to prevent netpoll from accessing rings before init is complete. However, the same is not done for fresh napi instances in netif_napi_add(), even though we expect NAPI instances to be added as disabled.
This causes crashes during driver reconfiguration (enabling XDP, changing the channel count) - if there is any printk() after netif_napi_add() but before napi_enable().
To ensure memory ordering is correct we need to use RCU accessors.
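Schematically, the fix pairs an RCU publish on the writer side with an RCU iteration on the netpoll reader side (condensed from the diff below for clarity):

----------
/* writer: netif_napi_add() initializes everything, then publishes */
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);  /* netpoll sees it as in service */
list_add_rcu(&napi->dev_list, &dev->napi_list);

/* reader: poll_napi() walks the list with the matching accessor */
list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
        /* cmpxchg(&napi->poll_owner, -1, cpu) etc. */
}
----------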
Reported-by: Rob Sherwood rsher@fb.com Fixes: 2d8bff12699a ("netpoll: Close race condition between poll_one_napi and napi_disable") Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/dev.c | 3 ++- net/core/netpoll.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c index 42ba150fa18d..c77d12a35f92 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6196,12 +6196,13 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, pr_err_once("netif_napi_add() called with weight %d on device %s\n", weight, dev->name); napi->weight = weight; - list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + set_bit(NAPI_STATE_NPSVC, &napi->state); + list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a581cf101cd9..023ce0fbb496 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -161,7 +161,7 @@ static void poll_napi(struct net_device *dev) struct napi_struct *napi; int cpu = smp_processor_id();
- list_for_each_entry(napi, &dev->napi_list, dev_list) { + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); smp_store_release(&napi->poll_owner, -1);
From: Roi Dayan roid@mellanox.com
stable inclusion from linux-4.19.145 commit 044be307e550b4532960eadabfb6942de96751f0
--------------------------------
Support for the phys switch id ndo was added for representors, and if we do not have representors there is no need to support it. Since each port returns a different switch id, supporting it here blocks creating a bond over PFs and attaching it to a bridge in legacy mode.
This bug doesn't exist upstream as the code got refactored and the netdev api is totally different.
Fixes: cb67b832921c ("net/mlx5e: Introduce SRIOV VF representors") Signed-off-by: Roi Dayan roid@mellanox.com Signed-off-by: Saeed Mahameed saeedm@mellanox.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 701624a63d2f..1ab40d622ae1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -198,7 +198,7 @@ int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr) struct mlx5_eswitch_rep *rep = rpriv->rep; struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- if (esw->mode == SRIOV_NONE) + if (esw->mode != SRIOV_OFFLOADS) return -EOPNOTSUPP;
switch (attr->id) {
From: Miaohe Lin linmiaohe@huawei.com
stable inclusion from linux-4.19.147 commit cf1a59e1ac54d8f7e211b845100dbd778624aeda
--------------------------------
commit eabe861881a733fc84f286f4d5a1ffaddd4f526f upstream.
pskb_carve_frag_list() may return -ENOMEM in pskb_carve_inside_nonlinear(). We should handle this correctly, or we would end up with a corrupted sk_buff.
Fixes: 6fa01ccd8830 ("skbuff: Add pskb_extract() helper function") Signed-off-by: Miaohe Lin linmiaohe@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/skbuff.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index af6e9028716d..be4bc833c28a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5521,9 +5521,13 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb);
- if (k == 0) { - /* split line is in frag list */ - pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask); + /* split line is in frag list */ + if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { + /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ + if (skb_has_frag_list(skb)) + kfree_skb_list(skb_shinfo(skb)->frag_list); + kfree(data); + return -ENOMEM; } skb_release_data(skb);
From: "J. Bruce Fields" bfields@redhat.com
stable inclusion from linux-4.19.147 commit 61279a7b3e337b8c8605987591964db4a2b3eb91
--------------------------------
[ Upstream commit 8c6b6c793ed32b8f9770ebcdf1ba99af423c303b ]
Since p points at raw xdr data, there's no guarantee that it's NULL terminated, so we should give a length. And probably escape any special characters too.
Reported-by: Zhi Li yieli@redhat.com Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/sunrpc/rpcb_clnt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 08b5fa4a2852..ba8f36731228 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -981,8 +981,8 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) goto out_fail; - dprintk("RPC: %5u RPCB_%s reply: %s\n", req->rq_task->tk_pid, - req->rq_task->tk_msg.rpc_proc->p_name, (char *)p); + dprintk("RPC: %5u RPCB_%s reply: %*pE\n", req->rq_task->tk_pid, + req->rq_task->tk_msg.rpc_proc->p_name, len, (char *)p);
if (rpc_uaddr2sockaddr(req->rq_xprt->xprt_net, (char *)p, len, sap, sizeof(address)) == 0)
From: Mark Salyzyn salyzyn@android.com
stable inclusion from linux-4.19.148 commit b59a23d596807a5aa88d8dd5655a66c6843729b3
--------------------------------
commit 37bd22420f856fcd976989f1d4f1f7ad28e1fcac upstream.
In pfkey_dump(), dplen and splen can both be specified to access the xfrm_address_t structure out of bounds in __xfrm_state_filter_match() when it calls addr_match() with those indexes. Return EINVAL if either is out of range (at least 128, the width of an xfrm_address_t in bits).
Signed-off-by: Mark Salyzyn salyzyn@android.com Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: kernel-team@android.com Cc: Steffen Klassert steffen.klassert@secunet.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: "David S. Miller" davem@davemloft.net Cc: Jakub Kicinski kuba@kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Steffen Klassert steffen.klassert@secunet.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/key/af_key.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/net/key/af_key.c b/net/key/af_key.c index 1982f9f31deb..e340e97224c3 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1855,6 +1855,13 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1];
+ if ((xfilter->sadb_x_filter_splen >= + (sizeof(xfrm_address_t) << 3)) || + (xfilter->sadb_x_filter_dplen >= + (sizeof(xfrm_address_t) << 3))) { + mutex_unlock(&pfk->dump_lock); + return -EINVAL; + } filter = kmalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) { mutex_unlock(&pfk->dump_lock);
From: Wei Wang weiwan@google.com
stable inclusion from linux-4.19.148 commit 2fc322bf67594e240eb23b4e0c6c8a09c69f9918
--------------------------------
[ Upstream commit ba9e04a7ddf4f22a10e05bf9403db6b97743c7bf ]
Currently, in tcp_v4_reqsk_send_ack() and tcp_v4_send_reset(), we echo the TOS value of the received packets in the response. However, in accordance with the robustness principles of RFC 3168 section 6.1.5, we should not echo the lower 2 ECN bits.
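Concretely, INET_ECN_MASK is 0x3, so the mask keeps the DSCP part of the received TOS while never reflecting ECT/CE markings; a standalone illustration (not the kernel code):

----------
#include <stdio.h>

#define INET_ECN_MASK 3 /* the two low (ECN) bits of the TOS byte */

int main(void)
{
        unsigned char rx_tos = 0x2f; /* received TOS with CE mark in the low bits */
        unsigned char tx_tos = rx_tos & ~INET_ECN_MASK;

        printf("echoed TOS: 0x%02x\n", tx_tos); /* 0x2c: DSCP kept, ECN cleared */
        return 0;
}
----------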
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Wei Wang weiwan@google.com Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/ip_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index fbf30122e8bf..f0faf1193dd8 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -73,6 +73,7 @@ #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> +#include <net/inet_ecn.h> #include <net/lwtunnel.h> #include <linux/bpf-cgroup.h> #include <linux/igmp.h> @@ -1582,7 +1583,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (IS_ERR(rt)) return;
- inet_sk(sk)->tos = arg->tos; + inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol;
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.148 commit f2e5359dd3bffa434cba0f62179b1e72065183af
--------------------------------
[ Upstream commit 843d926b003ea692468c8cc5bea1f9f58dfa8c75 ]
syzbot twice reported a lockdep issue in fib6_del() [1], which I think is caused by net->ipv6.fib6_null_entry having a NULL fib6_table pointer.
fib6_del() already checks for fib6_null_entry special case, we only need to return earlier.
Bug seems to occur very rarely, I have thus chosen a 'bug origin' that makes backports not too complex.
[1]
WARNING: suspicious RCU usage
5.9.0-rc4-syzkaller #0 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1996 suspicious rcu_dereference_protected() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
4 locks held by syz-executor.5/8095:
 #0: ffffffff8a7ea708 (rtnl_mutex){+.+.}-{3:3}, at: ppp_release+0x178/0x240 drivers/net/ppp/ppp_generic.c:401
 #1: ffff88804c422dd8 (&net->ipv6.fib6_gc_lock){+.-.}-{2:2}, at: spin_trylock_bh include/linux/spinlock.h:414 [inline]
 #1: ffff88804c422dd8 (&net->ipv6.fib6_gc_lock){+.-.}-{2:2}, at: fib6_run_gc+0x21b/0x2d0 net/ipv6/ip6_fib.c:2312
 #2: ffffffff89bd6a40 (rcu_read_lock){....}-{1:2}, at: __fib6_clean_all+0x0/0x290 net/ipv6/ip6_fib.c:2613
 #3: ffff8880a82e6430 (&tb->tb6_lock){+.-.}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:359 [inline]
 #3: ffff8880a82e6430 (&tb->tb6_lock){+.-.}-{2:2}, at: __fib6_clean_all+0x107/0x290 net/ipv6/ip6_fib.c:2245

stack backtrace:
CPU: 1 PID: 8095 Comm: syz-executor.5 Not tainted 5.9.0-rc4-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x198/0x1fd lib/dump_stack.c:118
 fib6_del+0x12b4/0x1630 net/ipv6/ip6_fib.c:1996
 fib6_clean_node+0x39b/0x570 net/ipv6/ip6_fib.c:2180
 fib6_walk_continue+0x4aa/0x8e0 net/ipv6/ip6_fib.c:2102
 fib6_walk+0x182/0x370 net/ipv6/ip6_fib.c:2150
 fib6_clean_tree+0xdb/0x120 net/ipv6/ip6_fib.c:2230
 __fib6_clean_all+0x120/0x290 net/ipv6/ip6_fib.c:2246
 fib6_clean_all net/ipv6/ip6_fib.c:2257 [inline]
 fib6_run_gc+0x113/0x2d0 net/ipv6/ip6_fib.c:2320
 ndisc_netdev_event+0x217/0x350 net/ipv6/ndisc.c:1805
 notifier_call_chain+0xb5/0x200 kernel/notifier.c:83
 call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:2033
 call_netdevice_notifiers_extack net/core/dev.c:2045 [inline]
 call_netdevice_notifiers net/core/dev.c:2059 [inline]
 dev_close_many+0x30b/0x650 net/core/dev.c:1634
 rollback_registered_many+0x3a8/0x1210 net/core/dev.c:9261
 rollback_registered net/core/dev.c:9329 [inline]
 unregister_netdevice_queue+0x2dd/0x570 net/core/dev.c:10410
 unregister_netdevice include/linux/netdevice.h:2774 [inline]
 ppp_release+0x216/0x240 drivers/net/ppp/ppp_generic.c:403
 __fput+0x285/0x920 fs/file_table.c:281
 task_work_run+0xdd/0x190 kernel/task_work.c:141
 tracehook_notify_resume include/linux/tracehook.h:188 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:163 [inline]
 exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190
 syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fixes: 421842edeaf6 ("net/ipv6: Add fib6_null_entry") Signed-off-by: Eric Dumazet edumazet@google.com Cc: David Ahern dsahern@gmail.com Reviewed-by: David Ahern dsahern@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv6/ip6_fib.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5e8979c1f76d..05a206202e23 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1811,14 +1811,19 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { - struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, - lockdep_is_held(&rt->fib6_table->tb6_lock)); - struct fib6_table *table = rt->fib6_table; struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; + struct fib6_table *table; + struct fib6_node *fn; + + if (rt == net->ipv6.fib6_null_entry) + return -ENOENT;
- if (!fn || rt == net->ipv6.fib6_null_entry) + table = rt->fib6_table; + fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&table->tb6_lock)); + if (!fn) return -ENOENT;
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
From: Peilin Ye yepeilin.cs@gmail.com
stable inclusion from linux-4.19.148 commit d82e08de23e36c37667f67a502b0cf4a3e3f61bd
--------------------------------
[ Upstream commit bb3a420d47ab00d7e1e5083286cab15235a96680 ]
tipc_group_add_to_tree() returns silently if `key` matches `nkey` of an existing node, causing tipc_group_create_member() to leak memory. Let tipc_group_add_to_tree() return an error in such a case, so that tipc_group_create_member() can handle it properly.
Fixes: 75da2163dbb6 ("tipc: introduce communication groups") Reported-and-tested-by: syzbot+f95d90c454864b3b5bc9@syzkaller.appspotmail.com Cc: Hillf Danton hdanton@sina.com Link: https://syzkaller.appspot.com/bug?id=048390604fe1b60df34150265479202f10e13af... Signed-off-by: Peilin Ye yepeilin.cs@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/tipc/group.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/net/tipc/group.c b/net/tipc/group.c index 9a9138de4eca..b656385efad6 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -273,8 +273,8 @@ static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, return NULL; }
-static void tipc_group_add_to_tree(struct tipc_group *grp, - struct tipc_member *m) +static int tipc_group_add_to_tree(struct tipc_group *grp, + struct tipc_member *m) { u64 nkey, key = (u64)m->node << 32 | m->port; struct rb_node **n, *parent = NULL; @@ -291,10 +291,11 @@ static void tipc_group_add_to_tree(struct tipc_group *grp, else if (key > nkey) n = &(*n)->rb_right; else - return; + return -EEXIST; } rb_link_node(&m->tree_node, parent, n); rb_insert_color(&m->tree_node, &grp->members); + return 0; }
static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, @@ -302,6 +303,7 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 instance, int state) { struct tipc_member *m; + int ret;
m = kzalloc(sizeof(*m), GFP_ATOMIC); if (!m) @@ -314,8 +316,12 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, m->port = port; m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; + ret = tipc_group_add_to_tree(grp, m); + if (ret < 0) { + kfree(m); + return NULL; + } grp->member_cnt++; - tipc_group_add_to_tree(grp, m); tipc_nlist_add(&grp->dests, m->node); m->state = state; return m;
From: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp
stable inclusion from linux-4.19.148 commit 0183a74c915882509f70c2ddc05bc9e6726cfb7c
--------------------------------
[ Upstream commit a4b5cc9e10803ecba64a7d54c0f47e4564b4a980 ]
I confirmed that the problem fixed by commit 2a63866c8b51a3f7 ("tipc: fix shutdown() of connectionless socket") also applies to stream socket.
----------
#include <sys/socket.h>
#include <unistd.h>
#include <sys/wait.h>

int main(int argc, char *argv[])
{
        int fds[2] = { -1, -1 };

        socketpair(PF_TIPC, SOCK_STREAM /* or SOCK_DGRAM */, 0, fds);
        if (fork() == 0)
                _exit(read(fds[0], NULL, 1));
        shutdown(fds[0], SHUT_RDWR); /* This must make read() return. */
        wait(NULL); /* To be woken up by _exit(). */
        return 0;
}
----------
Since shutdown(SHUT_RDWR) should affect all processes sharing that socket, unconditionally setting sk->sk_shutdown to SHUTDOWN_MASK will be the right behavior.
Signed-off-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Acked-by: Ying Xue ying.xue@windriver.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/tipc/socket.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 78edb4926506..6a1e7ff667ad 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2565,10 +2565,7 @@ static int tipc_shutdown(struct socket *sock, int how) lock_sock(sk);
__tipc_shutdown(sock, TIPC_CONN_SHUTDOWN); - if (tipc_sk_type_connectionless(sk)) - sk->sk_shutdown = SHUTDOWN_MASK; - else - sk->sk_shutdown = SEND_SHUTDOWN; + sk->sk_shutdown = SHUTDOWN_MASK;
if (sk->sk_state == TIPC_DISCONNECTING) { /* Discard any unreceived messages */
From: Xin Long lucien.xin@gmail.com
stable inclusion from linux-4.19.148 commit b15fcca8eff903c4a9a50336f5bd8a208ca45df7
--------------------------------
[ Upstream commit ff48b6222e65ebdba5a403ef1deba6214e749193 ]
tipc_buf_append() may change the skb's frag_list, and that causes problems when the skb is cloned: skb_unclone() doesn't really make the skb's frag_list safe to change.
Shuang Li has reported a use-after-free issue caused by this when creating quite a few macvlan devices over the same device, where the broadcast packets are cloned and go up the stack:
[ ] BUG: KASAN: use-after-free in pskb_expand_head+0x86d/0xea0
[ ] Call Trace:
[ ]  dump_stack+0x7c/0xb0
[ ]  print_address_description.constprop.7+0x1a/0x220
[ ]  kasan_report.cold.10+0x37/0x7c
[ ]  check_memory_region+0x183/0x1e0
[ ]  pskb_expand_head+0x86d/0xea0
[ ]  process_backlog+0x1df/0x660
[ ]  net_rx_action+0x3b4/0xc90
[ ]
[ ] Allocated by task 1786:
[ ]  kmem_cache_alloc+0xbf/0x220
[ ]  skb_clone+0x10a/0x300
[ ]  macvlan_broadcast+0x2f6/0x590 [macvlan]
[ ]  macvlan_process_broadcast+0x37c/0x516 [macvlan]
[ ]  process_one_work+0x66a/0x1060
[ ]  worker_thread+0x87/0xb10
[ ]
[ ] Freed by task 3253:
[ ]  kmem_cache_free+0x82/0x2a0
[ ]  skb_release_data+0x2c3/0x6e0
[ ]  kfree_skb+0x78/0x1d0
[ ]  tipc_recvmsg+0x3be/0xa40 [tipc]
So fix it by using skb_unshare() instead, which creates a new skb for the cloned frag, making it safe to change its frag_list. A similar thing is done in sctp_make_reassembled_event(), which uses skb_copy().
Reported-by: Shuang Li shuali@redhat.com Fixes: 37e22164a8a3 ("tipc: rename and move message reassembly function") Signed-off-by: Xin Long lucien.xin@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/tipc/msg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/tipc/msg.c b/net/tipc/msg.c index cbccf1791d3c..b078b77620f1 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -140,7 +140,8 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - if (unlikely(skb_unclone(frag, GFP_ATOMIC))) + frag = skb_unshare(frag, GFP_ATOMIC); + if (unlikely(!frag)) goto err; head = *headbuf = frag; *buf = NULL;
From: David Ahern dsahern@kernel.org
stable inclusion from linux-4.19.148 commit 98776a365da509ad923083ae54b38ee521c52742
--------------------------------
[ Upstream commit 2fbc6e89b2f1403189e624cabaf73e189c5e50c6 ]
Kfir reported that pmtu exceptions are not created properly for deployments where multipath routes use the same device.
After some digging I see 2 compounding problems:

1. ip_route_output_key_hash_rcu is updating the flowi4_oif *after* the route lookup. This is the second use case where this has been a problem (the first is related to the use of vti devices with VRF). I cannot find any reason for the oif to be changed after the lookup; the code goes back to the start of git. It does not seem logical, so remove it.
2. fib_lookups for exceptions do not call fib_select_path to handle multipath route selection based on the hash.
The end result is that the fib_lookup used to add the exception always creates it based using the first leg of the route.
An example topology showing the problem:
                    | host1
                +------+
                | eth0 | .209
                +------+
                    |
                +------+
        switch  | br0  |
                +------+
                    |
          +---------+---------+
          | host2             | host3
      +------+            +------+
      | eth0 | .250       | eth0 | 192.168.252.252
      +------+            +------+
      +-----+             +-----+
      | vti | .2          | vti | 192.168.247.3
      +-----+             +-----+
           \                 /
        =================================
        tunnels 192.168.247.1/24
for h in host1 host2 host3; do
        ip netns add ${h}
        ip -netns ${h} link set lo up
        ip netns exec ${h} sysctl -wq net.ipv4.ip_forward=1
done

ip netns add switch
ip -netns switch li set lo up
ip -netns switch link add br0 type bridge stp 0
ip -netns switch link set br0 up

for n in 1 2 3; do
        ip -netns switch link add eth-sw type veth peer name eth-h${n}
        ip -netns switch li set eth-h${n} master br0 up
        ip -netns switch li set eth-sw netns host${n} name eth0
done

ip -netns host1 addr add 192.168.252.209/24 dev eth0
ip -netns host1 link set dev eth0 up
ip -netns host1 route add 192.168.247.0/24 \
        nexthop via 192.168.252.250 dev eth0 nexthop via 192.168.252.252 dev eth0

ip -netns host2 addr add 192.168.252.250/24 dev eth0
ip -netns host2 link set dev eth0 up
ip -netns host3 addr add 192.168.252.252/24 dev eth0
ip -netns host3 link set dev eth0 up
ip netns add tunnel
ip -netns tunnel li set lo up
ip -netns tunnel li add br0 type bridge
ip -netns tunnel li set br0 up
for n in $(seq 11 20); do
        ip -netns tunnel addr add dev br0 192.168.247.${n}/24
done

for n in 2 3
do
        ip -netns tunnel link add vti${n} type veth peer name eth${n}
        ip -netns tunnel link set eth${n} mtu 1360 master br0 up
        ip -netns tunnel link set vti${n} netns host${n} mtu 1360 up
        ip -netns host${n} addr add dev vti${n} 192.168.247.${n}/24
done
ip -netns tunnel ro add default nexthop via 192.168.247.2 nexthop via 192.168.247.3

ip netns exec host1 ping -M do -s 1400 -c3 -I 192.168.252.209 192.168.247.11
ip netns exec host1 ping -M do -s 1400 -c3 -I 192.168.252.209 192.168.247.15
ip -netns host1 ro ls cache
Before this patch the cache always shows exceptions against the first leg in the multipath route; 192.168.252.250 per this example. Since the hash has an initial random seed, you may need to vary the final octet more than what is listed. In my tests, using addresses between 11 and 19 usually found 1 that used both legs.
With this patch, the cache will have exceptions for both legs.
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions") Reported-by: Kfir Itzhak mastertheknife@gmail.com Signed-off-by: David Ahern dsahern@kernel.org Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com Conflicts: net/ipv4/route.c [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/route.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b939b8dc8347..b033fdd6437e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -773,8 +773,10 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { - struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh *nh;
+ fib_select_path(net, &res, fl4, skb); + nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); @@ -1000,6 +1002,7 @@ out: kfree_skb(skb); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; + struct net *net = dev_net(dst->dev); struct fib_result res; bool lock = false;
@@ -1019,9 +1022,11 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return;
rcu_read_lock(); - if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { - struct fib_nh *nh = &FIB_RES_NH(res); + if (fib_lookup(net, fl4, &res, 0) == 0) { + struct fib_nh *nh;
+ fib_select_path(net, &res, fl4, NULL); + nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } @@ -2531,8 +2536,6 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, fib_select_path(net, res, fl4, skb);
dev_out = FIB_RES_DEV(*res); - fl4->flowi4_oif = dev_out->ifindex; -
make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
From: Priyaranjan Jha priyarjha@google.com
stable inclusion from linux-4.19.148 commit a56eb38acc700684f365740993ce3ddedcbb9152
--------------------------------
commit 232aa8ec3ed979d4716891540c03a806ecab0c37 upstream.
Because bbr_target_cwnd() is really a general-purpose BBR helper for computing some volume of inflight data as a function of the estimated BDP, refactor it into the following helper functions:
- bbr_bdp()
- bbr_quantization_budget()
- bbr_inflight()
Signed-off-by: Priyaranjan Jha priyarjha@google.com Signed-off-by: Neal Cardwell ncardwell@google.com Signed-off-by: Yuchung Cheng ycheng@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/tcp_bbr.c | 60 ++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 21 deletions(-)
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index b371e66502c3..4ee6cf1235f7 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -315,30 +315,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) } }
-/* Find target cwnd. Right-size the cwnd based on min RTT and the - * estimated bottleneck bandwidth: +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: * - * cwnd = bw * min_rtt * gain = BDP * gain + * bdp = bw * min_rtt * gain * * The key factor, gain, controls the amount of queue. While a small gain * builds a smaller queue, it becomes more vulnerable to noise in RTT * measurements (e.g., delayed ACKs or other ACK compression effects). This * noise may cause BBR to under-estimate the rate. - * - * To achieve full performance in high-speed paths, we budget enough cwnd to - * fit full-sized skbs in-flight on both end hosts to fully utilize the path: - * - one skb in sending host Qdisc, - * - one skb in sending host TSO/GSO engine - * - one skb being received by receiver host LRO/GRO/delayed-ACK engine - * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because - * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, - * which allows 2 outstanding 2-packet sequences, to try to keep pipe - * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) { struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd; + u32 bdp; u64 w;
/* If we've never had a valid RTT sample, cap cwnd at the initial @@ -353,7 +342,24 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) w = (u64)bw * bbr->min_rtt_us;
/* Apply a gain to the given value, then remove the BW_SCALE shift. */ - cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk);
/* Allow enough full-sized skbs in flight to utilize end systems. */ cwnd += 3 * bbr_tso_segs_goal(sk); @@ -368,6 +374,17 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) return cwnd; }
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight, gain); + + return inflight; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -429,7 +446,8 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, goto done;
/* If we're below target cwnd, slow start cwnd toward target cwnd. */ - target_cwnd = bbr_target_cwnd(sk, bw, gain); + target_cwnd = bbr_bdp(sk, bw, gain); + target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ cwnd = min(cwnd + acked, target_cwnd); else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) @@ -470,14 +488,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, if (bbr->pacing_gain > BBR_UNIT) return is_full_length && (rs->losses || /* perhaps pacing_gain*BDP won't fit */ - inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
/* A pacing_gain < 1.0 tries to drain extra queue we added if bw * probing didn't find more bw. If inflight falls to match BDP then we * estimate queue is drained; persisting would underutilize the pipe. */ return is_full_length || - inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); + inflight <= bbr_inflight(sk, bw, BBR_UNIT); }
static void bbr_advance_cycle_phase(struct sock *sk) @@ -736,11 +754,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ tcp_sk(sk)->snd_ssthresh = - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && tcp_packets_in_flight(tcp_sk(sk)) <= - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ }
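For reference, a minimal userspace sketch of the fixed-point arithmetic that bbr_bdp() and bbr_quantization_budget() perform after this refactor. The BW_SCALE/BBR_SCALE constants mirror net/ipv4/tcp_bbr.c; the no-RTT-sample cap and the tso_segs value are simplified stand-ins, so treat this as an illustration rather than the kernel code itself:

/* Userspace sketch of the fixed-point math in bbr_bdp() and
 * bbr_quantization_budget(); constants mirror net/ipv4/tcp_bbr.c.
 */
#include <stdint.h>
#include <stdio.h>

#define BW_SCALE  24
#define BW_UNIT   (1 << BW_SCALE)   /* rate unit: packets per usec << 24 */
#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)  /* gain of 1.0 */

/* bdp = bw * min_rtt * gain, rounded up, with the scaling shifts removed. */
static uint32_t bdp_packets(uint32_t bw, uint32_t min_rtt_us, int gain)
{
    uint64_t w = (uint64_t)bw * min_rtt_us;

    return (uint32_t)((((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT);
}

/* Budget 3 extra skbs (host Qdisc, TSO/GSO engine, receiver GRO/delayed-ACK),
 * round cwnd up to an even number for ACK-every-other-packet delayed ACKs,
 * and enforce the 4-packet floor; tso_segs stands in for bbr_tso_segs_goal().
 */
static uint32_t quantization_budget(uint32_t cwnd, uint32_t tso_segs)
{
    cwnd += 3 * tso_segs;
    cwnd = (cwnd + 1) & ~1U;
    return cwnd < 4 ? 4 : cwnd;
}

int main(void)
{
    /* 10 Mbit/s of 1500-byte packets is ~0.000833 packets per usec. */
    uint32_t bw = (uint32_t)(0.000833 * BW_UNIT);
    uint32_t bdp = bdp_packets(bw, 50 * 1000 /* 50 ms min RTT */, BBR_UNIT);

    printf("bdp=%u target=%u\n", bdp, quantization_budget(bdp, 2));
    return 0;
}

At 10 Mbit/s and a 50 ms min RTT this prints bdp=42 target=48: roughly one BDP of 1500-byte packets plus the quantization headroom.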
From: Priyaranjan Jha priyarjha@google.com
stable inclusion from linux-4.19.148 commit 610058f519b579e38f9be0715ec9f73697e5d40d
--------------------------------
commit 78dc70ebaa38aa303274e333be6c98eef87619e2 upstream.
Aggregation effects are extremely common with wifi, cellular, and cable modem link technologies, ACK decimation in middleboxes, and LRO and GRO in receiving hosts. The aggregation can happen in either direction, data or ACKs, but in either case the aggregation effect is visible to the sender in the ACK stream.
Previously, BBR's sending was often cwnd-limited under severe ACK aggregation/decimation because BBR sized the cwnd at 2*BDP. If packets were acked in bursts after long delays (e.g. one ACK acking 5*BDP after 5*RTT), BBR's sending was halted after 2*BDP was sent over 2*RTT, leaving the bottleneck idle for potentially long periods. Note that loss-based congestion control does not have this issue because, when facing aggregation, it continues increasing cwnd after bursts of ACKs, growing cwnd until the buffer is full.
To achieve good throughput in the presence of aggregation effects, this algorithm allows the BBR sender to put extra data in flight to keep the bottleneck utilized during silences in the ACK stream that it has evidence to suggest were caused by aggregation.
A summary of the algorithm: when a burst of packets is acked by a stretched ACK, a burst of ACKs, or both, BBR first estimates the expected amount of data that should have been acked, based on its estimated bandwidth. The surplus ("extra_acked") is then recorded in a windowed-max filter to estimate the recent level of observed ACK aggregation, and cwnd is increased by this ACK aggregation estimate. The larger cwnd avoids BBR being cwnd-limited in the face of ACK silences that recent history suggests were caused by aggregation. As a sanity check, the ACK aggregation degree is upper-bounded by the cwnd (at the time of measurement) and by a global max of BW * 100ms. The algorithm is described further in the following presentation: https://datatracker.ietf.org/meeting/101/materials/slides-101-iccrg-an-updat...
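For illustration, a simplified plain-C sketch of the per-ACK estimator described above. The struct and function names here are illustrative (not the kernel's), and the two-bucket 5-round-trip max window and the 2^20 epoch-reset threshold from the patch are omitted for brevity:

/* Simplified sketch of the per-ACK aggregation estimator. */
#include <stdint.h>

#define BW_SCALE 24
#define BW_UNIT  (1 << BW_SCALE)

struct aggr_est {
    uint64_t epoch_start_us;   /* start of the current sampling epoch */
    uint32_t epoch_acked;      /* packets (S)ACKed since the epoch began */
    uint32_t extra_acked_max;  /* running max of the observed surplus */
};

/* bw is packets-per-usec << BW_SCALE, matching the kernel's bbr_bw(). */
static void aggr_update(struct aggr_est *a, uint64_t now_us,
                        uint32_t newly_acked, uint32_t bw, uint32_t cwnd)
{
    uint64_t epoch_us = now_us - a->epoch_start_us;
    uint32_t expected = (uint32_t)(((uint64_t)bw * epoch_us) / BW_UNIT);
    uint32_t extra;

    /* Restart the epoch when the ACK rate falls back below the expected
     * rate: any aggregation burst in this epoch has ended.
     */
    if (a->epoch_acked <= expected) {
        a->epoch_acked = 0;
        a->epoch_start_us = now_us;
        expected = 0;
    }

    /* Surplus acked beyond what the estimated bw predicts, clamped by
     * cwnd as the sanity check described above.
     */
    a->epoch_acked += newly_acked;
    extra = a->epoch_acked - expected;
    if (extra > cwnd)
        extra = cwnd;
    if (extra > a->extra_acked_max)
        a->extra_acked_max = extra;
}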
In our internal testing, we observed a significant increase in BBR throughput (measured using netperf) in a basic wifi setup:
- Host1 (sender on ethernet) -> AP -> Host2 (receiver on wifi)
- 2.4 GHz -> BBR before: ~73 Mbps; BBR after: ~102 Mbps; CUBIC: ~100 Mbps
- 5.0 GHz -> BBR before: ~362 Mbps; BBR after: ~593 Mbps; CUBIC: ~601 Mbps
Also, this code is running globally on YouTube TCP connections and has produced significant bandwidth increases for YouTube traffic.
This is based on Ian Swett's max_ack_height_ algorithm from the QUIC BBR implementation.
Signed-off-by: Priyaranjan Jha priyarjha@google.com Signed-off-by: Neal Cardwell ncardwell@google.com Signed-off-by: Yuchung Cheng ycheng@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: wangxiaopeng wangxiaopeng7@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/inet_connection_sock.h | 4 +- net/ipv4/tcp_bbr.c | 122 ++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 3 deletions(-)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 2d5220ab0600..fc9d6e37552d 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -139,8 +139,8 @@ struct inet_connection_sock { } icsk_mtup; u32 icsk_user_timeout;
- u64 icsk_ca_priv[88 / sizeof(u64)]; -#define ICSK_CA_PRIV_SIZE (11 * sizeof(u64)) + u64 icsk_ca_priv[104 / sizeof(u64)]; +#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) };
#define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 4ee6cf1235f7..93f176336297 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -115,6 +115,14 @@ struct bbr { unused_b:5; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ + + /* For tracking ACK aggregation: */ + u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ + u16 extra_acked[2]; /* max excess data ACKed in epoch */ + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ + unused_c:6; };
#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ @@ -174,6 +182,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8; /* If we estimate we're policed, use lt_bw for this many round trips: */ static const u32 bbr_lt_bw_max_rtts = 48;
+/* Gain factor for adding extra_acked to target cwnd: */ +static const int bbr_extra_acked_gain = BBR_UNIT; +/* Window length of extra_acked window. */ +static const u32 bbr_extra_acked_win_rtts = 5; +/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ +static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; +/* Time period for clamping cwnd increment due to ack aggregation */ +static const u32 bbr_extra_acked_max_us = 100 * 1000; + static void bbr_check_probe_rtt_done(struct sock *sk);
/* Do we estimate that STARTUP filled the pipe? */ @@ -200,6 +217,16 @@ static u32 bbr_bw(const struct sock *sk) return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); }
+/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + /* Return rate in bytes per second, optionally with a gain. * The order here is chosen carefully to avoid overflow of u64. This should * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. @@ -305,6 +332,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
if (event == CA_EVENT_TX_START && tp->app_limited) { bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; /* Avoid pointless buffer overflows: pace at est. bw if we don't * need more speed (we're restarting from idle and app-limited). */ @@ -385,6 +414,22 @@ static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) return inflight; }
+/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -445,9 +490,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) goto done;
- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ cwnd = min(cwnd + acked, target_cwnd); else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) @@ -717,6 +768,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) } }
+/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. */ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + /* Estimate when the pipe is full, using the change in delivery rate: BBR * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited @@ -846,6 +958,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) { bbr_update_bw(sk, rs); + bbr_update_ack_aggregation(sk, rs); bbr_update_cycle_phase(sk, rs); bbr_check_full_bw_reached(sk, rs); bbr_check_drain(sk, rs); @@ -896,6 +1009,13 @@ static void bbr_init(struct sock *sk) bbr_reset_lt_bw_sampling(sk); bbr_reset_startup_mode(sk);
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); }
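To make the 100 ms clamp concrete, a small standalone sketch of the arithmetic in bbr_ack_aggregation_cwnd() above. The constants mirror the patch; the bbr_full_bw_reached() gate is omitted and the example values are illustrative:

/* Sketch of the aggregation-cwnd clamp: the allowance is capped at
 * 100 ms worth of estimated bandwidth.
 */
#include <stdint.h>
#include <stdio.h>

#define BW_SCALE  24
#define BW_UNIT   (1 << BW_SCALE)
#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

static const uint32_t extra_acked_gain = BBR_UNIT;     /* gain of 1.0 */
static const uint32_t extra_acked_max_us = 100 * 1000; /* 100 ms clamp */

static uint32_t aggregation_cwnd(uint32_t bw, uint16_t extra_acked)
{
    uint32_t max_aggr = (uint32_t)(((uint64_t)bw * extra_acked_max_us)
                                   / BW_UNIT);
    uint32_t aggr = (extra_acked_gain * extra_acked) >> BBR_SCALE;

    return aggr < max_aggr ? aggr : max_aggr;
}

int main(void)
{
    /* ~0.0083 packets per usec, i.e. ~100 Mbit/s of 1500-byte packets. */
    uint32_t bw = (uint32_t)(0.0083 * BW_UNIT);

    printf("%u\n", aggregation_cwnd(bw, 1500));
    return 0;
}

At ~100 Mbit/s, 100 ms of bandwidth is roughly 830 packets, so a 1500-packet aggregation estimate is clamped down to ~830.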