*** BLURB HERE ***
48147 (10): net/smc: add sysctl interface for SMC net/smc: fix compile warning for smc_sysctl net/smc: fix -Wmissing-prototypes warning when CONFIG_SYSCTL not set net/smc: fix a memory leak in smc_sysctl_net_exit() net/smc: Introduce a sysctl for setting SMC-R buffer type net/smc: Use sysctl-specified types of buffers in new link group net/smc: Allow SMC-D 1MB DMB allocations net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R net/smc: Unbind r/w buffer size from clcsock and make them tunable net/smc: Add size match for smc_buf_get_slot
Litao Jiao (1): net/smc: Tune the maximum size of virtually contiguous sndbufs or RMBs for SMC-R
Documentation/networking/smc-sysctl.rst | 41 ++++ include/net/net_namespace.h | 5 +- include/net/netns/smc.h | 13 ++ net/smc/Makefile | 1 + net/smc/af_smc.c | 78 ++++++- net/smc/smc_clc.c | 8 +- net/smc/smc_clc.h | 2 +- net/smc/smc_core.c | 265 ++++++++++++++++-------- net/smc/smc_core.h | 17 +- net/smc/smc_ib.c | 15 +- net/smc/smc_llc.c | 26 ++- net/smc/smc_rx.c | 90 ++++++-- net/smc/smc_sysctl.c | 93 +++++++++ net/smc/smc_sysctl.h | 32 +++ net/smc/smc_tx.c | 7 +- 15 files changed, 556 insertions(+), 137 deletions(-) create mode 100644 Documentation/networking/smc-sysctl.rst create mode 100644 include/net/netns/smc.h create mode 100644 net/smc/smc_sysctl.c create mode 100644 net/smc/smc_sysctl.h
mainline inclusion from mainline-v5.18-rc1 commit 462791bbfa350189e309a5a94541f6b63cd874e8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
This patch adds a sysctl interface to support container environments for SMC, as discussed on the mailing list.
Link: https://lore.kernel.org/netdev/20220224020253.GF5443@linux.alibaba.com Co-developed-by: Tony Lu tonylu@linux.alibaba.com Signed-off-by: Tony Lu tonylu@linux.alibaba.com Signed-off-by: Dust Li dust.li@linux.alibaba.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- include/net/net_namespace.h | 5 ++- include/net/netns/smc.h | 10 ++++++ net/smc/Makefile | 1 + net/smc/af_smc.c | 8 +++++ net/smc/smc_sysctl.c | 70 +++++++++++++++++++++++++++++++++++++ net/smc/smc_sysctl.h | 32 +++++++++++++++++ 6 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 include/net/netns/smc.h create mode 100644 net/smc/smc_sysctl.c create mode 100644 net/smc/smc_sysctl.h
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c7faca9d7447..576372924f3d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -33,6 +33,7 @@ #include <net/netns/mpls.h> #include <net/netns/can.h> #include <net/netns/xdp.h> +#include <net/netns/smc.h> #include <net/netns/bpf.h> #include <linux/ns_common.h> #include <linux/idr.h> @@ -190,7 +191,9 @@ struct net { struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; - +#if IS_ENABLED(CONFIG_SMC) + struct netns_smc smc; +#endif KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h new file mode 100644 index 000000000000..0a7d25a124e9 --- /dev/null +++ b/include/net/netns/smc.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETNS_SMC_H__ +#define __NETNS_SMC_H__ + +struct netns_smc { +#ifdef CONFIG_SYSCTL + struct ctl_table_header *smc_hdr; +#endif +}; +#endif diff --git a/net/smc/Makefile b/net/smc/Makefile index cb1254541f37..efee8fa4a14e 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o +smc-y += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 41cbc7c89c9d..1e75628c5a12 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -48,6 +48,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_sysctl.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -2577,6 +2578,12 @@ static int __init smc_init(void) goto out_sock; }
+ rc = smc_sysctl_init(); + if (rc) { + pr_err("%s: sysctl_init fails with %d\n", __func__, rc); + goto out_sock; + } + static_branch_enable(&tcp_have_smc); return 0;
@@ -2603,6 +2610,7 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_sysctl_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 000000000000..8a3a8e145976 --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. + * + * Author: Tony Lu tonylu@linux.alibaba.com + * + */ + +#include <linux/init.h> +#include <linux/sysctl.h> +#include <net/net_namespace.h> + +#include "smc_sysctl.h" + +static struct ctl_table smc_table[] = { + { } +}; + +static __net_init int smc_sysctl_init_net(struct net *net) +{ + struct ctl_table *table; + + table = smc_table; + if (!net_eq(net, &init_net)) { + int i; + + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; + } + + net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc.smc_hdr) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static __net_exit void smc_sysctl_exit_net(struct net *net) +{ + unregister_net_sysctl_table(net->smc.smc_hdr); +} + +static struct pernet_operations smc_sysctl_ops __net_initdata = { + .init = smc_sysctl_init_net, + .exit = smc_sysctl_exit_net, +}; + +int __init smc_sysctl_init(void) +{ + return register_pernet_subsys(&smc_sysctl_ops); +} + +void smc_sysctl_exit(void) +{ + unregister_pernet_subsys(&smc_sysctl_ops); +} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 000000000000..49553ac236b6 --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,32 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. + * + * Author: Tony Lu tonylu@linux.alibaba.com + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL + +int smc_sysctl_init(void); +void smc_sysctl_exit(void); + +#else + +int smc_sysctl_init(void) +{ + return 0; +} + +void smc_sysctl_exit(void) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */
mainline inclusion from mainline-v5.18-rc1 commit 7de8eb0d9039f16e1122d7aa524a1502a160c4ff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
The kernel test robot reports multiple warnings for smc_sysctl:
In file included from net/smc/smc_sysctl.c:17:
net/smc/smc_sysctl.h:23:5: warning: no previous prototype \
for function 'smc_sysctl_init' [-Wmissing-prototypes] int smc_sysctl_init(void) ^ and
WARNING: modpost: vmlinux.o(.text+0x12ced2d): Section mismatch \
in reference from the function smc_sysctl_exit() to the variable .init.data:smc_sysctl_ops The function smc_sysctl_exit() references the variable __initdata smc_sysctl_ops. This is often because smc_sysctl_exit lacks a __initdata annotation or the annotation of smc_sysctl_ops is wrong.
and net/smc/smc_sysctl.c: In function 'smc_sysctl_init_net': net/smc/smc_sysctl.c:47:17: error: 'struct netns_smc' has no member named 'smc_hdr' 47 | net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table);
Since we don't need global sysctl initialization, remove the global pernet_operations and smc_sysctl_{init|exit} to keep things clean and simple. Call smc_sysctl_net_{init|exit} directly from smc_net_{init|exit}.
Also initialize sysctl_autocorking_size when CONFIG_SYSCTL is not set; this makes sure SMC autocorking is enabled by default in that configuration.
Fixes: 462791bbfa35 ("net/smc: add sysctl interface for SMC") Reported-by: kernel test robot lkp@intel.com Signed-off-by: Dust Li dust.li@linux.alibaba.com Tested-by: Randy Dunlap rdunlap@infradead.org # build-tested Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- net/smc/Makefile | 2 +- net/smc/af_smc.c | 13 ++++++------- net/smc/smc_sysctl.c | 19 ++----------------- net/smc/smc_sysctl.h | 9 +++++---- 4 files changed, 14 insertions(+), 29 deletions(-)
diff --git a/net/smc/Makefile b/net/smc/Makefile index efee8fa4a14e..79f53cc7d8dc 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -3,4 +3,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o -smc-y += smc_sysctl.o +smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1e75628c5a12..86627dea88bd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2495,11 +2495,17 @@ unsigned int smc_net_id;
static __net_init int smc_net_init(struct net *net) { + int rc; + + rc = smc_sysctl_net_init(net); + if (rc) + return rc; return smc_pnet_net_init(net); }
static void __net_exit smc_net_exit(struct net *net) { + smc_sysctl_net_exit(net); smc_pnet_net_exit(net); }
@@ -2578,12 +2584,6 @@ static int __init smc_init(void) goto out_sock; }
- rc = smc_sysctl_init(); - if (rc) { - pr_err("%s: sysctl_init fails with %d\n", __func__, rc); - goto out_sock; - } - static_branch_enable(&tcp_have_smc); return 0;
@@ -2610,7 +2610,6 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); - smc_sysctl_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 8a3a8e145976..d2cc2f5bf089 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -20,7 +20,7 @@ static struct ctl_table smc_table[] = { { } };
-static __net_init int smc_sysctl_init_net(struct net *net) +int __net_init smc_sysctl_net_init(struct net *net) { struct ctl_table *table;
@@ -49,22 +49,7 @@ static __net_init int smc_sysctl_init_net(struct net *net) return -ENOMEM; }
-static __net_exit void smc_sysctl_exit_net(struct net *net) +void __net_exit smc_sysctl_net_exit(struct net *net) { unregister_net_sysctl_table(net->smc.smc_hdr); } - -static struct pernet_operations smc_sysctl_ops __net_initdata = { - .init = smc_sysctl_init_net, - .exit = smc_sysctl_exit_net, -}; - -int __init smc_sysctl_init(void) -{ - return register_pernet_subsys(&smc_sysctl_ops); -} - -void smc_sysctl_exit(void) -{ - unregister_pernet_subsys(&smc_sysctl_ops); -} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index 49553ac236b6..9a369bca8123 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -15,18 +15,19 @@
#ifdef CONFIG_SYSCTL
-int smc_sysctl_init(void); -void smc_sysctl_exit(void); +int __net_init smc_sysctl_net_init(struct net *net); +void __net_exit smc_sysctl_net_exit(struct net *net);
#else
-int smc_sysctl_init(void) +int __net_init smc_sysctl_net_init(struct net *net) { return 0; }
-void smc_sysctl_exit(void) { } +void __net_exit smc_sysctl_net_exit(struct net *net) { }
#endif /* CONFIG_SYSCTL */
#endif /* _SMC_SYSCTL_H */ +
mainline inclusion from mainline-v5.18-rc1 commit d9f50991592513cc7633684cbaff65022cfa6816 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
When CONFIG_SYSCTL is not set, smc_sysctl_net_init/exit need to be static inline to avoid missing-prototypes warnings when compiling with W=1.
Since __net_exit has a noinline annotation when CONFIG_NET_NS is not set, it should not be used with static inline. So remove __net_init/__net_exit from these functions when CONFIG_SYSCTL is not set.
Fixes: 7de8eb0d9039 ("net/smc: fix compile warning for smc_sysctl") Signed-off-by: Dust Li dust.li@linux.alibaba.com Link: https://lore.kernel.org/r/20220309033051.41893-1-dust.li@linux.alibaba.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- net/smc/smc_sysctl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index 9a369bca8123..f04699bc8bbc 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -20,14 +20,13 @@ void __net_exit smc_sysctl_net_exit(struct net *net);
#else
-int __net_init smc_sysctl_net_init(struct net *net) +static inline int smc_sysctl_net_init(struct net *net) { return 0; }
-void __net_exit smc_sysctl_net_exit(struct net *net) { } +static inline void smc_sysctl_net_exit(struct net *net) { }
#endif /* CONFIG_SYSCTL */
#endif /* _SMC_SYSCTL_H */ -
mainline inclusion from mainline-v5.18-rc1 commit 5ae6acf1d00be462d7b08b4a8748798ef595ae5a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
The recently added smc_sysctl_net_exit() forgot to free the memory allocated by smc_sysctl_net_init() for non-initial network namespaces.
Fixes: 462791bbfa35 ("net/smc: add sysctl interface for SMC") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Cc: Tony Lu tonylu@linux.alibaba.com Cc: Dust Li dust.li@linux.alibaba.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- net/smc/smc_sysctl.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index d2cc2f5bf089..e06ecf0e7c84 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -51,5 +51,10 @@ int __net_init smc_sysctl_net_init(struct net *net)
void __net_exit smc_sysctl_net_exit(struct net *net) { + struct ctl_table *table; + + table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); + if (!net_eq(net, &init_net)) + kfree(table); }
mainline inclusion from mainline-v6.0-rc1 commit 4bc5008e4387106215b50ae1a4ac2467455725ca category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
This patch introduces the sysctl smcr_buf_type for setting the type of SMC-R sndbufs and RMBs.
Valid values include:
- SMCR_PHYS_CONT_BUFS, which means use physically contiguous buffers for better performance and is the default value.
- SMCR_VIRT_CONT_BUFS, which means use virtually contiguous buffers in case physically contiguous memory is scarce.
- SMCR_MIXED_BUFS, which means first try to use physically contiguous buffers. If not available, then use virtually contiguous buffers.
Signed-off-by: Wen Gu guwen@linux.alibaba.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- Documentation/networking/smc-sysctl.rst | 21 +++++++++++++++++++++ include/net/netns/smc.h | 1 + net/smc/smc_core.h | 6 ++++++ net/smc/smc_sysctl.c | 14 ++++++++++++++ 4 files changed, 42 insertions(+) create mode 100644 Documentation/networking/smc-sysctl.rst
diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst new file mode 100644 index 000000000000..3f0187ffc2a5 --- /dev/null +++ b/Documentation/networking/smc-sysctl.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +SMC Sysctl +========== + +/proc/sys/net/smc/* Variables +============================= + +smcr_buf_type - INTEGER + Controls which type of sndbufs and RMBs to use in later newly created + SMC-R link group. Only for SMC-R. + + Default: 0 (physically contiguous sndbufs and RMBs) + + Possible values: + + - 0 - Use physically contiguous buffers + - 1 - Use virtually contiguous buffers + - 2 - Mixed use of the two types. Try physically contiguous buffers first. + If not available, use virtually contiguous buffers then. diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 0a7d25a124e9..38396599938c 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -6,5 +6,6 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif + unsigned int sysctl_smcr_buf_type; }; #endif diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 9364d0f35cce..5ac5d7ac833b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -198,6 +198,12 @@ enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ };
+enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ + SMCR_PHYS_CONT_BUFS = 0, + SMCR_VIRT_CONT_BUFS = 1, + SMCR_MIXED_BUFS = 2, +}; + enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index e06ecf0e7c84..81faae0d1b7b 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -15,8 +15,20 @@ #include <net/net_namespace.h>
#include "smc_sysctl.h" +#include "smc_core.h" + +static int two = 2;
static struct ctl_table smc_table[] = { + { + .procname = "smcr_buf_type", + .data = &init_net.smc.sysctl_smcr_buf_type, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, { } };
@@ -40,6 +52,8 @@ int __net_init smc_sysctl_net_init(struct net *net) if (!net->smc.smc_hdr) goto err_reg;
+ net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; + return 0;
err_reg:
mainline inclusion from mainline-v6.0-rc1 commit b984f370ed5182d180f92dbf14bdf847ff6ccc04 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
This patch introduces a new SMC-R specific element buf_type in struct smc_link_group, for recording the value of sysctl smcr_buf_type when link group is created.
A newly created link group will create and reuse buffers of the type specified by buf_type.
Signed-off-by: Wen Gu guwen@linux.alibaba.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- net/smc/smc_core.c | 1 + net/smc/smc_core.h | 1 + 2 files changed, 2 insertions(+)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index bf485a2017a4..3e4c14aebc7e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -437,6 +437,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) goto free_wq; lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; + lgr->buf_type = sock_net(&smc->sk)->smc.sysctl_smcr_buf_type; atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5ac5d7ac833b..f04a046e93b4 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -259,6 +259,7 @@ struct smc_link_group { /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; + enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */
mainline inclusion from mainline-v5.15-rc1 commit 67161779a9ea926fccee8de047ae66cbd3482b91 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
Commit a3fe3d01bd0d7 ("net/smc: introduce sg-logic for RMBs") introduced a restriction for RMB allocations as used by SMC-R. However, SMC-D does not use scatter-gather lists to back its DMBs, yet it was still limited by this restriction. This patch exempts SMC-D, but limits allocations to the maximum RMB/DMB size respectively.
Signed-off-by: Stefan Raspl raspl@linux.ibm.com Signed-off-by: Guvenc Gulce guvenc@linux.ibm.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- net/smc/smc_core.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3e4c14aebc7e..2ff3acf6ed5f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1374,21 +1374,30 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) return rc; }
-/* convert the RMB size into the compressed notation - minimum 16K. +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ + +/* convert the RMB size into the compressed notation (minimum 16K, see + * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ -static u8 smc_compress_bufsize(int size) +static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { + const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed;
if (size <= SMC_BUF_MIN_SIZE) return 0;
- size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; + size = (size - 1) >> 14; /* convert to 16K multiple */ + compressed = min_t(u8, ilog2(size) + 1, + is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb) + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + return compressed; }
@@ -1607,17 +1616,12 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, return rc; }
-#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ - static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc;
- if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) - return ERR_PTR(-EAGAIN); - /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) @@ -1665,9 +1669,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf / 2;
- for (bufsize_short = smc_compress_bufsize(sk_buf_size); + for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { - if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_short]; @@ -1676,8 +1679,6 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) buf_list = &lgr->sndbufs[bufsize_short]; } bufsize = smc_uncompress_bufsize(bufsize_short); - if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) - continue;
/* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
mainline inclusion from mainline-v6.0-rc1 commit b8d199451c99b3796b840c350eb74b830c5c869b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which can cause unexpected hangs and further stability risks.
So this patch aims to allow SMC-R link groups to use virtually contiguous sndbufs and RMBs to avoid the potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows:
- client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf
[latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in the buffer initialization and destruction path, which is brought by additional MR operations on sndbufs. But thanks to the link group buffer reuse mechanism, the impact of this kind of regression decreases as the number of buffer reuses increases.
Taking 256KB sndbuf and RMB as an example, the latencies of some key SMC-R buffer-related functions obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------ Test environment notes: 1. The above tests ran on 2 VMs within the same host. 2. The NIC is a ConnectX-4Lx, using SR-IOV and passing through 2 VFs to each VM respectively. 3. The VMs' vCPUs are bound to different physical CPUs, and the bound physical CPUs are isolated by the `isolcpus=xxx` cmdline. 4. The NICs' queue numbers are set to 1.
Signed-off-by: Wen Gu guwen@linux.alibaba.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- net/smc/af_smc.c | 66 +++++++++++++-- net/smc/smc_clc.c | 8 +- net/smc/smc_clc.h | 2 +- net/smc/smc_core.c | 205 +++++++++++++++++++++++++++++++-------------- net/smc/smc_core.h | 10 ++- net/smc/smc_ib.c | 15 ++-- net/smc/smc_llc.c | 26 +++--- net/smc/smc_rx.c | 90 ++++++++++++++++---- net/smc/smc_tx.c | 7 +- 9 files changed, 318 insertions(+), 111 deletions(-)
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 86627dea88bd..322bd6e3b932 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -356,6 +356,29 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); }
+/* register the new vzalloced sndbuf on all links */ +static int smcr_lgr_reg_sndbufs(struct smc_link *link, + struct smc_buf_desc *snd_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + if (!snd_desc->is_vm) + return -EINVAL; + + /* protect against parallel smcr_link_reg_buf() */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); + if (rc) + break; + } + mutex_unlock(&lgr->llc_conf_mutex); + return rc; +} + /* register the new rmb on all links */ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) @@ -367,13 +390,13 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, if (rc) return rc; /* protect against parallel smc_llc_cli_rkey_exchange() and - * parallel smcr_link_reg_rmb() + * parallel smcr_link_reg_buf() */ mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; - rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); + rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; } @@ -419,8 +442,15 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc)
smc_wr_remember_qp_attr(link);
- if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF;
/* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; @@ -810,8 +840,15 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + /* reg sendbufs if they were vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } + } if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_ERR_REGRMB; + reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } @@ -1267,8 +1304,15 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc;
- if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced*/ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF;
/* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); @@ -1596,8 +1640,14 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) struct smc_connection *conn = &new_smc->conn;
if (!local_first) { + /* reg sendbufs if they were vzalloced */ + if (conn->sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(conn->lnk, + conn->sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + return SMC_CLC_DECL_ERR_REGBUF; } smc_rmb_sync_sg_for_device(&new_smc->conn);
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 5ee5b2ce29a6..3f644be48d06 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -693,7 +693,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, ETH_ALEN); hton24(clc->r0.qpn, link->roce_qp->qp_num); clc->r0.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); switch (clc->hdr.type) { @@ -705,8 +705,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, break; } clc->r0.rmbe_size = conn->rmbe_size_short; - clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[link->link_idx].sgl)); + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index c579d1d5995a..096a2a57c8bb 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -52,7 +52,7 @@ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ +#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */
#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2ff3acf6ed5f..3722a1daac5b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -605,45 +605,54 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, return NULL; }
-static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, +static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { + struct mutex *lock; /* lock buffer list */ int rc;
- if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ mutex_lock(&lgr->llc_conf_mutex); - smc_llc_do_delete_rkey(lgr, rmb_desc); - rmb_desc->is_conf_rkey = false; + smc_llc_do_delete_rkey(lgr, buf_desc); + buf_desc->is_conf_rkey = false; mutex_unlock(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } }
- if (rmb_desc->is_reg_err) { + if (buf_desc->is_reg_err) { /* buf registration failed, reuse not possible */ - mutex_lock(&lgr->rmbs_lock); - list_del(&rmb_desc->list); - mutex_unlock(&lgr->rmbs_lock); + lock = is_rmb ? &lgr->rmbs_lock : + &lgr->sndbufs_lock; + mutex_lock(lock); + list_del(&buf_desc->list); + mutex_unlock(lock);
- smc_buf_free(lgr, true, rmb_desc); + smc_buf_free(lgr, is_rmb, buf_desc); } else { - rmb_desc->used = 0; + buf_desc->used = 0; } }
static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { - if (conn->sndbuf_desc) - conn->sndbuf_desc->used = 0; - if (conn->rmb_desc && lgr->is_smcd) - conn->rmb_desc->used = 0; - else if (conn->rmb_desc) - smcr_buf_unuse(conn->rmb_desc, lgr); + if (conn->sndbuf_desc) { + if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) + smcr_buf_unuse(conn->sndbuf_desc, false, lgr); + else + conn->sndbuf_desc->used = 0; + } + if (conn->rmb_desc) { + if (!lgr->is_smcd) + smcr_buf_unuse(conn->rmb_desc, true, lgr); + else + conn->rmb_desc->used = 0; + } }
/* remove a finished connection from its link group */ @@ -675,20 +684,21 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - if (is_rmb) + if (is_rmb || buf_desc->is_vm) buf_desc->is_reg_mr[lnk->link_idx] = false; if (!buf_desc->is_map_ib[lnk->link_idx]) return; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) { - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - buf_desc->mr_rx[lnk->link_idx] = NULL; - } + + if ((is_rmb || buf_desc->is_vm) && + buf_desc->mr[lnk->link_idx]) { + smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); + buf_desc->mr[lnk->link_idx] = NULL; + } + if (is_rmb) smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { + else smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); - } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); buf_desc->is_map_ib[lnk->link_idx] = false; } @@ -756,8 +766,10 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]);
- if (buf_desc->pages) + if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); + else if (buf_desc->is_vm && buf_desc->cpu_addr) + vfree(buf_desc->cpu_addr); kfree(buf_desc); }
@@ -1439,39 +1451,66 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); }
-/* map an rmb buf to a link */ +/* map an buf to a link */ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - int rc; + int rc, i, nents, offset, buf_size, size, access_flags; + struct scatterlist *sg; + void *buf;
if (buf_desc->is_map_ib[lnk->link_idx]) return 0;
- rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (buf_desc->is_vm) { + buf = buf_desc->cpu_addr; + buf_size = buf_desc->len; + offset = offset_in_page(buf_desc->cpu_addr); + nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; + } else { + nents = 1; + } + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); if (rc) return rc; - sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, - buf_desc->cpu_addr, buf_desc->len); + + if (buf_desc->is_vm) { + /* virtually contiguous buffer */ + for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { + size = min_t(int, PAGE_SIZE - offset, buf_size); + sg_set_page(sg, vmalloc_to_page(buf), size, offset); + buf += size / sizeof(*buf); + buf_size -= size; + offset = 0; + } + } else { + /* physically contiguous buffer */ + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + }
/* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { + if (rc != nents) { rc = -EAGAIN; goto free_table; }
- /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, + if (is_rmb || buf_desc->is_vm) { + /* create a new memory region for the RMB or vzalloced sndbuf */ + access_flags = is_rmb ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_LOCAL_WRITE; + + rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, buf_desc, lnk->link_idx); if (rc) goto buf_unmap; - smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + smc_ib_sync_sg_for_device(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } buf_desc->is_map_ib[lnk->link_idx] = true; return 0; @@ -1484,20 +1523,23 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, return rc; }
-/* register a new rmb on IB device, +/* register a new buf on IB device, rmb or vzalloced sndbuf * must be called under lgr->llc_conf_mutex lock */ -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) { if (list_empty(&link->lgr->list)) return -ENOLINK; - if (!rmb_desc->is_reg_mr[link->link_idx]) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->is_reg_err = true; + if (!buf_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new buf */ + if (buf_desc->is_vm) + buf_desc->mr[link->link_idx]->iova = + (uintptr_t)buf_desc->cpu_addr; + if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { + buf_desc->is_reg_err = true; return -EFAULT; } - rmb_desc->is_reg_mr[link->link_idx] = true; + buf_desc->is_reg_mr[link->link_idx] = true; } return 0; } @@ -1549,18 +1591,38 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) struct smc_buf_desc *buf_desc, *bf; int i, rc = 0;
+ /* reg all RMBs for a new link */ mutex_lock(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; - rc = smcr_link_reg_rmb(lnk, buf_desc); - if (rc) - goto out; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + mutex_unlock(&lgr->rmbs_lock); + return rc; + } } } -out: mutex_unlock(&lgr->rmbs_lock); + + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + return rc; + + /* reg all vzalloced sndbufs for a new link */ + mutex_lock(&lgr->sndbufs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { + if (!buf_desc->used || !buf_desc->is_vm) + continue; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + mutex_unlock(&lgr->sndbufs_lock); + return rc; + } + } + } + mutex_unlock(&lgr->sndbufs_lock); return rc; }
@@ -1574,18 +1636,39 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, if (!buf_desc) return ERR_PTR(-ENOMEM);
- buf_desc->order = get_order(bufsize); - buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_COMP | - __GFP_NORETRY | __GFP_ZERO, - buf_desc->order); - if (!buf_desc->pages) { - kfree(buf_desc); - return ERR_PTR(-EAGAIN); - } - buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); - buf_desc->len = bufsize; + switch (lgr->buf_type) { + case SMCR_PHYS_CONT_BUFS: + case SMCR_MIXED_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (buf_desc->pages) { + buf_desc->cpu_addr = + (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + buf_desc->is_vm = false; + break; + } + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + goto out; + fallthrough; // try virtually continguous buf + case SMCR_VIRT_CONT_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); + if (!buf_desc->cpu_addr) + goto out; + buf_desc->pages = NULL; + buf_desc->len = bufsize; + buf_desc->is_vm = true; + break; + } return buf_desc; + +out: + kfree(buf_desc); + return ERR_PTR(-EAGAIN); }
/* map buf_desc on all usable links, @@ -1709,7 +1792,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { - smcr_buf_unuse(buf_desc, lgr); + smcr_buf_unuse(buf_desc, is_rmb, lgr); return -ENOMEM; } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f04a046e93b4..6691fff5d6a6 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -150,9 +150,11 @@ struct smc_buf_desc { struct { /* SMC-R */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region + struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; + /* memory region: for rmb and + * vzalloced sndbuf * incl. rkey provided to peer + * and lkey provided to local */ u32 order; /* allocation order */
@@ -164,6 +166,8 @@ struct smc_buf_desc { /* mem region mapped to lnk */ u8 is_reg_err; /* buffer registration err */ + u8 is_vm; + /* virtually contiguous */ }; struct { /* SMC-D */ unsigned short sba_idx; @@ -419,7 +423,7 @@ int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index f1ffbd414602..8b06b41fbd0d 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -397,7 +397,7 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int sg_num;
/* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + sg_num = ib_map_mr_sg(buf_slot->mr[link_idx], buf_slot->sgt[link_idx].sgl, buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); @@ -409,20 +409,21 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[link_idx]) + if (buf_slot->mr[link_idx]) return 0; /* already done */
- buf_slot->mr_rx[link_idx] = + buf_slot->mr[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[link_idx])) { + if (IS_ERR(buf_slot->mr[link_idx])) { int rc;
- rc = PTR_ERR(buf_slot->mr_rx[link_idx]); - buf_slot->mr_rx[link_idx] = NULL; + rc = PTR_ERR(buf_slot->mr[link_idx]); + buf_slot->mr[link_idx] = NULL; return rc; }
- if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != + buf_slot->sgt[link_idx].orig_nents) return -EINVAL;
return 0; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0ef15f8fba90..96dfd27da490 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -435,19 +435,22 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, if (smc_link_active(link) && link != send_link) { rkeyllc->rtoken[rtok_ix].link_id = link->link_id; rkeyllc->rtoken[rtok_ix].rmb_key = - htonl(rmb_desc->mr_rx[link->link_idx]->rkey); - rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address( - rmb_desc->sgt[link->link_idx].sgl)); + htonl(rmb_desc->mr[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[link->link_idx].sgl)); rtok_ix++; } } /* rkey of send_link is in rtoken[0] */ rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); - rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); + htonl(rmb_desc->mr[send_link->link_idx]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); put_out: @@ -474,7 +477,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: @@ -724,9 +727,10 @@ static int smc_llc_add_link_cont(struct smc_link *link, } rmb = *buf_pos;
- addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); - addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); - addc_llc->rt[i].rmb_vaddr_new = + addc_llc->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = rmb->is_vm ? + cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl));
(*num_rkeys_todo)--; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 7f7e983e42b1..99785705183f 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -143,35 +143,93 @@ static void smc_rx_spd_release(struct splice_pipe_desc *spd, static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, struct smc_sock *smc) { + struct smc_link_group *lgr = smc->conn.lgr; + int offset = offset_in_page(src); struct splice_pipe_desc spd; - struct partial_page partial; - struct smc_spd_priv *priv; - int bytes; + struct partial_page *partial; + struct smc_spd_priv **priv; + struct page **pages; + int bytes, nr_pages; + int i;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL); + nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? + PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL); + if (!partial) + goto out_page; + priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL); if (!priv) - return -ENOMEM; - priv->len = len; - priv->smc = smc; - partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; - partial.len = len; - partial.private = (unsigned long)priv; - - spd.nr_pages_max = 1; - spd.nr_pages = 1; - spd.pages = &smc->conn.rmb_desc->pages; - spd.partial = &partial; + goto out_part; + for (i = 0; i < nr_pages; i++) { + priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL); + if (!priv[i]) + goto out_priv; + } + + if (lgr->is_smcd || + (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) { + /* smcd or smcr that uses physically contiguous RMBs */ + priv[0]->len = len; + priv[0]->smc = smc; + partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial[0].len = len; + partial[0].private = (unsigned long)priv[0]; + pages[0] = smc->conn.rmb_desc->pages; + } else { + int size, left = len; + void *buf = src; + /* smcr that uses virtually contiguous RMBs*/ + for (i = 0; i < nr_pages; i++) { + size = min_t(int, PAGE_SIZE - offset, left); + priv[i]->len = size; + priv[i]->smc = smc; + pages[i] = vmalloc_to_page(buf); + partial[i].offset = offset; + partial[i].len = size; + partial[i].private = (unsigned long)priv[i]; + buf += size / sizeof(*buf); + left -= size; + offset = 0; + } + } + spd.nr_pages_max = nr_pages; + spd.nr_pages = nr_pages; + spd.pages = pages; + spd.partial = partial; spd.ops = &smc_pipe_ops; spd.spd_release = smc_rx_spd_release;
bytes = splice_to_pipe(pipe, &spd); if (bytes > 0) { sock_hold(&smc->sk); - get_page(smc->conn.rmb_desc->pages); + if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) { + for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++) + get_page(pages[i]); + } else { + get_page(smc->conn.rmb_desc->pages); + } atomic_add(bytes, &smc->conn.splice_pending); } + kfree(priv); + kfree(partial); + kfree(pages);
return bytes; + +out_priv: + for (i = (i - 1); i >= 0; i--) + kfree(priv[i]); + kfree(priv); +out_part: + kfree(partial); +out_page: + kfree(pages); +out: + return -ENOMEM; }
static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 52ef1fca0b60..77a1fa8bf037 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -313,6 +313,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); + u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -325,8 +326,12 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sge[srcchunk].addr = dma_addr + src_off; + sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? + (virt_addr + src_off) : (dma_addr + src_off); sge[srcchunk].length = src_len; + if (conn->sndbuf_desc->is_vm) + sge[srcchunk].lkey = + conn->sndbuf_desc->mr[link->link_idx]->lkey; num_sges++;
src_off += src_len;
mainline inclusion from mainline-v6.1-rc1 commit 0227f058aa29f5ab6f6ec79c3a36ae41f1e03a13 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/ne...
--------------------------------
Currently, SMC uses smc->sk.sk_{rcv|snd}buf to size the send buffer and the RMB, and those values are taken from tcp_{w|r}mem of the clcsock.
The buffer sizes taken from the TCP socket don't fit SMC well. SMC-R/-D generally need larger buffers than TCP to achieve higher performance, because they use different underlying devices and paths.
So this patch unbinds buffer size from TCP, and introduces two sysctl knobs to tune them independently. Also, these knobs are per net namespace and work for containers.
Signed-off-by: Tony Lu tonylu@linux.alibaba.com Signed-off-by: Paolo Abeni pabeni@redhat.com Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- Documentation/networking/smc-sysctl.rst | 18 ++++++++++++++++++ include/net/netns/smc.h | 2 ++ net/smc/af_smc.c | 5 +++-- net/smc/smc_core.c | 8 ++++---- net/smc/smc_sysctl.c | 21 ++++++++++++++++++++- 5 files changed, 47 insertions(+), 7 deletions(-)
diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index 3f0187ffc2a5..e38c92ab46f3 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -19,3 +19,21 @@ smcr_buf_type - INTEGER - 1 - Use virtually contiguous buffers - 2 - Mixed use of the two types. Try physically contiguous buffers first. If not available, use virtually contiguous buffers then. + +wmem - INTEGER + Initial size of send buffer used by SMC sockets. + The default value inherits from net.ipv4.tcp_wmem[1]. + + The minimum value is 16KiB and there is no hard limit for max value, but + only allowed 512KiB for SMC-R and 1MiB for SMC-D. + + Default: 16K + +rmem - INTEGER + Initial size of receive buffer (RMB) used by SMC sockets. + The default value inherits from net.ipv4.tcp_rmem[1]. + + The minimum value is 16KiB and there is no hard limit for max value, but + only allowed 512KiB for SMC-R and 1MiB for SMC-D. + + Default: 128K diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 38396599938c..cded3f9a5081 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -7,5 +7,7 @@ struct netns_smc { struct ctl_table_header *smc_hdr; #endif unsigned int sysctl_smcr_buf_type; + int sysctl_wmem; + int sysctl_rmem; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 322bd6e3b932..074c57554f0b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -249,6 +249,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -2528,8 +2530,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, sk_common_release(sk); goto 
out; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); +
out: return rc; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3722a1daac5b..525dd7543c60 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1747,10 +1747,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf / 2; + sk_buf_size = smc->sk.sk_rcvbuf; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf / 2; + sk_buf_size = smc->sk.sk_sndbuf;
for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { @@ -1800,7 +1800,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize * 2; + smc->sk.sk_rcvbuf = bufsize; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -1808,7 +1808,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize * 2; + smc->sk.sk_sndbuf = bufsize; atomic_set(&conn->sndbuf_space, bufsize); } return 0; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 81faae0d1b7b..a7cf6411d583 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -18,6 +18,8 @@ #include "smc_core.h"
static int two = 2; +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE;
static struct ctl_table smc_table[] = { { @@ -29,6 +31,22 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "wmem", + .data = &init_net.smc.sysctl_wmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem", + .data = &init_net.smc.sysctl_rmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, { } };
@@ -53,7 +71,8 @@ int __net_init smc_sysctl_net_init(struct net *net) goto err_reg;
net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; - + WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); return 0;
err_reg:
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC
--------------------------------
This adds the ability to get an unused smc_buf_desc based on the buffer size, which ensures that the size of the obtained smc_buf_desc is the same as the size that was set.
Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- net/smc/smc_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 525dd7543c60..f3f296e206aa 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1425,7 +1425,7 @@ int smc_uncompress_bufsize(u8 compressed) /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ -static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, +static struct smc_buf_desc *smc_buf_get_slot(int bufsize, struct mutex *lock, struct list_head *buf_list) { @@ -1433,7 +1433,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
mutex_lock(lock); list_for_each_entry(buf_slot, buf_list, list) { - if (cmpxchg(&buf_slot->used, 0, 1) == 0) { + if ((buf_slot->len == bufsize) && (cmpxchg(&buf_slot->used, 0, 1) == 0)) { mutex_unlock(lock); return buf_slot; } @@ -1764,7 +1764,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) bufsize = smc_uncompress_bufsize(bufsize_short);
/* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(bufsize, lock, buf_list); if (buf_desc) { memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC
--------------------------------
If the receiving application reads data more slowly than the sender writes it, the sender may encounter send failures because its sndbuf is full, and the receiver may not process the RMB in time, which leaves the sender unable to send data immediately. Increasing the buffer size appropriately can help reduce the probability of the above problems and increase throughput. Therefore, raise the maximum size of virtually contiguous sndbufs or RMBs for SMC-R to 256M.
Signed-off-by: Litao Jiao jiaolitao@sangfor.com.cn --- Documentation/networking/smc-sysctl.rst | 6 +++-- net/smc/smc_core.c | 32 +++++++++++++++++-------- 2 files changed, 26 insertions(+), 12 deletions(-)
diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index e38c92ab46f3..5983b951077e 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -25,7 +25,8 @@ wmem - INTEGER The default value inherits from net.ipv4.tcp_wmem[1].
The minimum value is 16KiB and there is no hard limit for max value, but - only allowed 512KiB for SMC-R and 1MiB for SMC-D. + only allowed 512KiB for SMC-R using physically contiguous buffers, 256MiB + for SMC-R using other buf type and 1MiB for SMC-D.
Default: 16K
@@ -34,6 +35,7 @@ rmem - INTEGER The default value inherits from net.ipv4.tcp_rmem[1].
The minimum value is 16KiB and there is no hard limit for max value, but - only allowed 512KiB for SMC-R and 1MiB for SMC-D. + only allowed 512KiB for SMC-R using physically contiguous buffers, 256MiB + for SMC-R using other buf type and 1MiB for SMC-D.
Default: 128K diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f3f296e206aa..8ac07d2b4e9a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1386,29 +1386,41 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) return rc; }
-#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ -#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 14 /* 0 -> 16KB, 1 -> 32KB, .. 14 -> 256MB */
/* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ -static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) +static u8 smc_compress_bufsize(struct smc_link_group *lgr, int size, bool is_smcd, bool is_rmb) { const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; - u8 compressed; + u8 compressed, max_phy_compressed;
if (size <= SMC_BUF_MIN_SIZE) return 0;
size = (size - 1) >> 14; /* convert to 16K multiple */ compressed = min_t(u8, ilog2(size) + 1, - is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); - - if (!is_smcd && is_rmb) - /* RMBs are backed by & limited to max size of scatterlists */ - compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb && (lgr->buf_type != SMCR_VIRT_CONT_BUFS)) { + max_phy_compressed = ilog2(max_scat >> 14); + switch (lgr->buf_type) { + case SMCR_MIXED_BUFS: + if (compressed > max_phy_compressed) + break; + fallthrough; // try phys continguous buf + case SMCR_PHYS_CONT_BUFS: + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, max_phy_compressed); + break; + default: + break; + } + }
return compressed; } @@ -1752,7 +1764,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf;
- for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); + for (bufsize_short = smc_compress_bufsize(lgr, sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { if (is_rmb) { lock = &lgr->rmbs_lock;