backport some network patches
Brian Vazquez (1): net: use indirect calls helpers for sk_exit_memory_pressure()
Eric Dumazet (5): net: cache align tcp_memory_allocated, tcp_sockets_allocated tcp: small optimization in tcp recvmsg() tcp: add RETPOLINE mitigation to sk_backlog_rcv tcp: avoid indirect calls to sock_rfree tcp: check local var (timeo) before socket fields in one test
include/linux/indirect_call_wrapper.h | 6 ++++++ include/net/sock.h | 8 ++++++- net/core/sock.c | 8 +++++-- net/ipv4/tcp.c | 31 +++++++++++++++++---------- net/ipv4/udp.c | 2 +- net/ipv6/tcp_ipv6.c | 5 +++-- net/mptcp/protocol.c | 2 +- 7 files changed, 44 insertions(+), 18 deletions(-)
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.17-rc1 commit 91b6d325635617540b6a1646ddb138bb17cbd569 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I65HYE
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
tcp_memory_allocated and tcp_sockets_allocated often share a common cache line, source of false sharing.
Also take care of udp_memory_allocated and mptcp_sockets_allocated.
Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net (cherry picked from commit 91b6d325635617540b6a1646ddb138bb17cbd569) Signed-off-by: Liu Jian liujian56@huawei.com
Conflicts: net/mptcp/protocol.c --- net/ipv4/tcp.c | 4 ++-- net/ipv4/udp.c | 2 +- net/mptcp/protocol.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5d6fa168fb8d4..ca6b15148fecd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -286,7 +286,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem);
-atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ +atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated);
#if IS_ENABLED(CONFIG_SMC) @@ -301,7 +301,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_have_comp); /* * Current number of TCP sockets. */ -struct percpu_counter tcp_sockets_allocated; +struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(tcp_sockets_allocated);
/* diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index aa526322cfcba..8ab125298b8e9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -123,7 +123,7 @@ EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem);
-atomic_long_t udp_memory_allocated; +atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536 diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 72d944e6a641f..402b3da730d41 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -39,7 +39,7 @@ struct mptcp_skb_cb {
#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
-static struct percpu_counter mptcp_sockets_allocated; +static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket.
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.17-rc1 commit 93afcfd1db35882921b2521a637c78755c27b02c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I65HYE
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
When reading large chunks of data, incoming packets might be added to the backlog from BH.
tcp recvmsg() detects the backlog queue is not empty, and uses a release_sock()/lock_sock() pair to process this backlog.
We now have __sk_flush_backlog() to perform this a bit faster.
Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net (cherry picked from commit 93afcfd1db35882921b2521a637c78755c27b02c) Signed-off-by: Liu Jian liujian56@huawei.com --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ca6b15148fecd..81693b83a8515 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2246,8 +2246,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (copied >= target) { /* Do not sleep, just process backlog. */ - release_sock(sk); - lock_sock(sk); + __sk_flush_backlog(sk); } else { sk_wait_data(sk, &timeo, last); }
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.17-rc1 commit d2489c7b6d7d5ed4b32b56703c57c47bfbfe7fa5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I65HYE
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
Use INDIRECT_CALL_INET() to avoid an indirect call when/if CONFIG_RETPOLINE=y
Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net (cherry picked from commit d2489c7b6d7d5ed4b32b56703c57c47bfbfe7fa5) Signed-off-by: Liu Jian liujian56@huawei.com --- include/net/sock.h | 8 +++++++- net/core/sock.c | 5 ++++- net/ipv6/tcp_ipv6.c | 5 +++-- 3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h index 41d3953d3e5cd..32fb13dc7e5f2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1051,12 +1051,18 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); +INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); + static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb);
- return sk->sk_backlog_rcv(sk, skb); + return INDIRECT_CALL_INET(sk->sk_backlog_rcv, + tcp_v6_do_rcv, + tcp_v4_do_rcv, + sk, skb); }
static inline void sk_incoming_cpu_update(struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index 9741b4db45c8d..2459aa5c392a7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -324,7 +324,10 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
noreclaim_flag = memalloc_noreclaim_save(); - ret = sk->sk_backlog_rcv(sk, skb); + ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, + tcp_v6_do_rcv, + tcp_v4_do_rcv, + sk, skb); memalloc_noreclaim_restore(noreclaim_flag);
return ret; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 876585d335a8f..7efa3d8f16d4f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -72,7 +72,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req);
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; @@ -1445,7 +1445,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * This is because we cannot sleep with the original spinlock * held. */ -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL;
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.17-rc1 commit 3df684c1a3d08a4f649689053a3d527b3b5fda9e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I65HYE
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
TCP uses sk_eat_skb() when skbs can be removed from receive queue. However, the call to skb_orphan() from __kfree_skb() incurs an indirect call to sock_rfree(), which is more expensive than a direct call, especially for CONFIG_RETPOLINE=y.
Add tcp_eat_recv_skb() function to make the call before __kfree_skb().
Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net (cherry picked from commit 3df684c1a3d08a4f649689053a3d527b3b5fda9e) Signed-off-by: Liu Jian liujian56@huawei.com --- net/ipv4/tcp.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 81693b83a8515..e1cc494b5c78e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1596,6 +1596,16 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); }
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (likely(skb->destructor == sock_rfree)) { + sock_rfree(skb); + skb->destructor = NULL; + skb->sk = NULL; + } + sk_eat_skb(sk, skb); +} + static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1615,7 +1625,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); } return NULL; } @@ -1683,11 +1693,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); if (!desc->count) break; WRITE_ONCE(tp->copied_seq, seq); @@ -2317,14 +2327,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); continue;
found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); break; } while (len > 0);
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.17-rc1 commit 8bd172b787298124ef75c0e466101107c036d54d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I65HYE
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
Testing timeo before sk_err/sk_state/sk_shutdown makes more sense.
Modern applications use non-blocking IO, while a socket is terminated only once during its life time.
Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net (cherry picked from commit 8bd172b787298124ef75c0e466101107c036d54d) Signed-off-by: Liu Jian liujian56@huawei.com --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e1cc494b5c78e..dcbc5ec00da87 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2215,10 +2215,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break;
if (copied) { - if (sk->sk_err || + if (!timeo || + sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || - !timeo || signal_pending(current)) break; } else {
From: Brian Vazquez brianvv@google.com
mainline inclusion from mainline-v6.3-rc2 commit 5c1ebbfabcd61142a4551bfc0e51840f9bdae7af category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I65HYE
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
Florian reported a regression and sent a patch with the following changelog:
<quote> There is a noticeable tcp performance regression (loopback or cross-netns), seen with iperf3 -Z (sendfile mode) when generic retpolines are needed.
With SK_RECLAIM_THRESHOLD checks gone number of calls to enter/leave memory pressure happen much more often. For TCP indirect calls are used.
We can't remove the if-set-return short-circuit check in tcp_enter_memory_pressure because there are callers other than sk_enter_memory_pressure. Doing a check in the sk wrapper too reduces the indirect calls enough to recover some performance.
Before, 0.00-60.00 sec 322 GBytes 46.1 Gbits/sec receiver
After: 0.00-60.04 sec 359 GBytes 51.4 Gbits/sec receiver
"iperf3 -c $peer -t 60 -Z -f g", connected via veth in another netns. </quote>
It seems we forgot to upstream this indirect call mitigation we had for years, lets do this instead.
[edumazet] - It seems we forgot to upstream this indirect call mitigation we had for years, let's do this instead. - Changed to INDIRECT_CALL_INET_1() to avoid bots reports.
Fixes: 4890b686f408 ("net: keep sk->sk_forward_alloc as small as possible") Reported-by: Florian Westphal fw@strlen.de Link: https://lore.kernel.org/netdev/20230227152741.4a53634b@kernel.org/T/ Signed-off-by: Brian Vazquez brianvv@google.com Signed-off-by: Eric Dumazet edumazet@google.com Link: https://lore.kernel.org/r/20230301133247.2346111-1-edumazet@google.com Signed-off-by: Paolo Abeni pabeni@redhat.com (cherry picked from commit 5c1ebbfabcd61142a4551bfc0e51840f9bdae7af) Signed-off-by: Liu Jian liujian56@huawei.com
Conflicts: include/linux/indirect_call_wrapper.h --- include/linux/indirect_call_wrapper.h | 6 ++++++ net/core/sock.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 54c02c84906ab..cfcfef37b2f1a 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -60,4 +60,10 @@ #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) #endif
+#if IS_ENABLED(CONFIG_INET) +#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__) +#endif + #endif diff --git a/net/core/sock.c b/net/core/sock.c index 2459aa5c392a7..decd8098dd60e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2452,7 +2452,8 @@ static void sk_enter_memory_pressure(struct sock *sk) static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { - sk->sk_prot->leave_memory_pressure(sk); + INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, + tcp_leave_memory_pressure, sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/3421 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/H...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3421 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/H...