Jianbo Liu (1): IB/mlx5: Allocate resources just before first QP/SRQ is created
Patrisious Haddad (1): RDMA/mlx5: Move events notifier registration to be after device registration
drivers/infiniband/hw/mlx5/main.c | 185 ++++++++++++++++++--------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +- drivers/infiniband/hw/mlx5/qp.c | 4 + drivers/infiniband/hw/mlx5/srq.c | 4 + 4 files changed, 136 insertions(+), 63 deletions(-)
From: Jianbo Liu jianbol@nvidia.com
stable inclusion from stable-v6.6.64 commit b6334d2356fc0922ed01457960f74923058a353a category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBEADY CVE: CVE-2024-53224
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit 5895e70f2e6e8dc67b551ca554d6fcde0a7f0467 ]
Previously, all IB dev resources are initialized on driver load. As they are not always used, move the initialization to the time when they are needed.
To be more specific, move PD (p0) and CQ (c0) initialization to the time when the first SRQ is created. and move SRQs(s0 and s1) initialization to the time first QP is created. To avoid concurrent creations, two new mutexes are also added.
Signed-off-by: Jianbo Liu jianbol@nvidia.com Link: https://lore.kernel.org/r/98c3e53a8cc0bdfeb6dec6e5bb8b037d78ab00d8.171740936... Signed-off-by: Leon Romanovsky leon@kernel.org Stable-dep-of: ede132a5cf55 ("RDMA/mlx5: Move events notifier registration to be after device registration") Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com --- drivers/infiniband/hw/mlx5/main.c | 149 +++++++++++++++++++-------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 + drivers/infiniband/hw/mlx5/qp.c | 4 + drivers/infiniband/hw/mlx5/srq.c | 4 + 4 files changed, 118 insertions(+), 43 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 296af7a5c279..bc38af6cda6e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2796,37 +2796,72 @@ static u8 mlx5_get_umr_fence(u8 umr_fence_cap) } }
-static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) +int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - struct ib_srq_init_attr attr; - struct ib_device *ibdev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; - int port; + struct ib_device *ibdev; + struct ib_pd *pd; + struct ib_cq *cq; int ret = 0;
- ibdev = &dev->ib_dev;
- if (!MLX5_CAP_GEN(dev->mdev, xrc)) - return -EOPNOTSUPP; + /* + * devr->c0 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (devr->c0) + return 0;
- devr->p0 = ib_alloc_pd(ibdev, 0); - if (IS_ERR(devr->p0)) - return PTR_ERR(devr->p0); + mutex_lock(&devr->cq_lock); + if (devr->c0) + goto unlock;
- devr->c0 = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); - if (IS_ERR(devr->c0)) { - ret = PTR_ERR(devr->c0); - goto error1; + ibdev = &dev->ib_dev; + pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret); + goto unlock; }
- ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); - if (ret) - goto error2; + cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret); + ib_dealloc_pd(pd); + goto unlock; + }
- ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); + devr->p0 = pd; + devr->c0 = cq; + +unlock: + mutex_unlock(&devr->cq_lock); + return ret; +} + +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_srq_init_attr attr; + struct ib_srq *s0, *s1; + int ret = 0; + + /* + * devr->s1 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (devr->s1) + return 0; + + mutex_lock(&devr->srq_lock); + if (devr->s1) + goto unlock; + + ret = mlx5_ib_dev_res_cq_init(dev); if (ret) - goto error3; + goto unlock;
memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; @@ -2834,10 +2869,11 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) attr.srq_type = IB_SRQT_XRC; attr.ext.cq = devr->c0;
- devr->s0 = ib_create_srq(devr->p0, &attr); - if (IS_ERR(devr->s0)) { - ret = PTR_ERR(devr->s0); - goto err_create; + s0 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s0)) { + ret = PTR_ERR(s0); + mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret); + goto unlock; }
memset(&attr, 0, sizeof(attr)); @@ -2845,29 +2881,48 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC;
- devr->s1 = ib_create_srq(devr->p0, &attr); - if (IS_ERR(devr->s1)) { - ret = PTR_ERR(devr->s1); - goto error6; + s1 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s1)) { + ret = PTR_ERR(s1); + mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret); + ib_destroy_srq(s0); + } + + devr->s0 = s0; + devr->s1 = s1; + +unlock: + mutex_unlock(&devr->srq_lock); + return ret; +} + +static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + int port; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return -EOPNOTSUPP; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); + if (ret) + return ret; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); + if (ret) { + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); + return ret; }
for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) INIT_WORK(&devr->ports[port].pkey_change_work, pkey_change_handler);
- return 0; + mutex_init(&devr->cq_lock); + mutex_init(&devr->srq_lock);
-error6: - ib_destroy_srq(devr->s0); -err_create: - mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); -error3: - mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); -error2: - ib_destroy_cq(devr->c0); -error1: - ib_dealloc_pd(devr->p0); - return ret; + return 0; }
static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) @@ -2884,12 +2939,20 @@ static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) cancel_work_sync(&devr->ports[port].pkey_change_work);
- ib_destroy_srq(devr->s1); - ib_destroy_srq(devr->s0); + /* After s0/s1 init, they are not unset during the device lifetime. */ + if (devr->s1) { + ib_destroy_srq(devr->s1); + ib_destroy_srq(devr->s0); + } mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); - ib_destroy_cq(devr->c0); - ib_dealloc_pd(devr->p0); + /* After p0/c0 init, they are not unset during the device lifetime. */ + if (devr->c0) { + ib_destroy_cq(devr->c0); + ib_dealloc_pd(devr->p0); + } + mutex_destroy(&devr->cq_lock); + mutex_destroy(&devr->srq_lock); }
static u32 get_core_cap_flags(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 43a963e205eb..1c83d132197f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -820,11 +820,13 @@ struct mlx5_ib_port_resources {
struct mlx5_ib_resources { struct ib_cq *c0; + struct mutex cq_lock; u32 xrcdn0; u32 xrcdn1; struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; + struct mutex srq_lock; struct mlx5_ib_port_resources ports[2]; };
@@ -1270,6 +1272,8 @@ to_mmmap(struct rdma_user_mmap_entry *rdma_entry) struct mlx5_user_mmap_entry, rdma_entry); }
+int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev); +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev); int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 93d9b15cbbb9..71a856409cee 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3247,6 +3247,10 @@ int mlx5_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, enum ib_qp_type type; int err;
+ err = mlx5_ib_dev_res_srq_init(dev); + if (err) + return err; + err = check_qp_type(dev, attr, &type); if (err) return err; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 84be0c3d5699..bcb6b324af50 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -216,6 +216,10 @@ int mlx5_ib_create_srq(struct ib_srq *ib_srq, return -EINVAL; }
+ err = mlx5_ib_dev_res_cq_init(dev); + if (err) + return err; + mutex_init(&srq->mutex); spin_lock_init(&srq->lock); srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1);
From: Patrisious Haddad phaddad@nvidia.com
stable inclusion from stable-v6.6.64 commit 921fcf2971a1e8d3b904ba2c2905b96f4ec3d4ad category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBEADY CVE: CVE-2024-53224
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit ede132a5cf559f3ab35a4c28bac4f4a6c20334d8 ]
Move pkey change work initialization and cleanup from device resources stage to notifier stage, since this is the stage which handles this work events.
Fix a race between the device deregistration and pkey change work by moving MLX5_IB_STAGE_DEVICE_NOTIFIER to be after MLX5_IB_STAGE_IB_REG in order to ensure that the notifier is deregistered before the device during cleanup. Which ensures there are no works that are being executed after the device has already unregistered which can cause the panic below.
BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 630071 Comm: kworker/1:2 Kdump: loaded Tainted: G W OE --------- --- 5.14.0-162.6.1.el9_1.x86_64 #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 02/27/2023 Workqueue: events pkey_change_handler [mlx5_ib] RIP: 0010:setup_qp+0x38/0x1f0 [mlx5_ib] Code: ee 41 54 45 31 e4 55 89 f5 53 48 89 fb 48 83 ec 20 8b 77 08 65 48 8b 04 25 28 00 00 00 48 89 44 24 18 48 8b 07 48 8d 4c 24 16 <4c> 8b 38 49 8b 87 80 0b 00 00 4c 89 ff 48 8b 80 08 05 00 00 8b 40 RSP: 0018:ffffbcc54068be20 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff954054494128 RCX: ffffbcc54068be36 RDX: ffff954004934000 RSI: 0000000000000001 RDI: ffff954054494128 RBP: 0000000000000023 R08: ffff954001be2c20 R09: 0000000000000001 R10: ffff954001be2c20 R11: ffff9540260133c0 R12: 0000000000000000 R13: 0000000000000023 R14: 0000000000000000 R15: ffff9540ffcb0905 FS: 0000000000000000(0000) GS:ffff9540ffc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000010625c001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_ib_gsi_pkey_change+0x20/0x40 [mlx5_ib] process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x380/0x380 kthread+0x149/0x170 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x22/0x30 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_fwctl(OE) fwctl(OE) ib_uverbs(OE) mlx5_core(OE) mlxdevm(OE) ib_core(OE) mlx_compat(OE) psample mlxfw(OE) tls knem(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache netfs qrtr rfkill sunrpc intel_rapl_msr intel_rapl_common rapl hv_balloon hv_utils i2c_piix4 pcspkr joydev fuse ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ata_generic pci_hyperv pci_hyperv_intf hyperv_drm drm_shmem_helper drm_kms_helper hv_storvsc syscopyarea hv_netvsc sysfillrect sysimgblt hid_hyperv fb_sys_fops scsi_transport_fc hyperv_keyboard drm ata_piix crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel hv_vmbus serio_raw [last unloaded: ib_core] CR2: 0000000000000000 ---[ end trace f6f8be4eae12f7bc ]---
Fixes: 7722f47e71e5 ("IB/mlx5: Create GSI transmission QPs when P_Key table is changed") Signed-off-by: Patrisious Haddad phaddad@nvidia.com Reviewed-by: Michael Guralnik michaelgur@nvidia.com Link: https://patch.msgid.link/d271ceeff0c08431b3cbbbb3e2d416f09b6d1621.1731496944... Signed-off-by: Leon Romanovsky leon@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com --- drivers/infiniband/hw/mlx5/main.c | 40 +++++++++++++--------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- 2 files changed, 20 insertions(+), 22 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bc38af6cda6e..c510484e024b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2899,7 +2899,6 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - int port; int ret;
if (!MLX5_CAP_GEN(dev->mdev, xrc)) @@ -2915,10 +2914,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) return ret; }
- for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - INIT_WORK(&devr->ports[port].pkey_change_work, - pkey_change_handler); - mutex_init(&devr->cq_lock); mutex_init(&devr->srq_lock);
@@ -2928,16 +2923,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - int port; - - /* - * Make sure no change P_Key work items are still executing. - * - * At this stage, the mlx5_ib_event should be unregistered - * and it ensures that no new works are added. - */ - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - cancel_work_sync(&devr->ports[port].pkey_change_work);
/* After s0/s1 init, they are not unset during the device lifetime. */ if (devr->s1) { @@ -4201,6 +4186,13 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + dev->mdev_events.notifier_call = mlx5_ib_event; mlx5_notifier_register(dev->mdev, &dev->mdev_events);
@@ -4211,8 +4203,14 @@ static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + mlx5r_macsec_event_unregister(dev); mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); }
void __mlx5_ib_remove(struct mlx5_ib_dev *dev, @@ -4286,9 +4284,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_ODP, mlx5_ib_odp_init_one, mlx5_ib_odp_cleanup_one), @@ -4313,6 +4308,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), @@ -4349,9 +4347,6 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, mlx5_ib_counters_init, mlx5_ib_counters_cleanup), @@ -4373,6 +4368,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 1c83d132197f..94678e5c59dd 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -954,7 +954,6 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_QP, MLX5_IB_STAGE_SRQ, MLX5_IB_STAGE_DEVICE_RESOURCES, - MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_ODP, MLX5_IB_STAGE_COUNTERS, MLX5_IB_STAGE_CONG_DEBUGFS, @@ -963,6 +962,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_POST_IB_REG_UMR, MLX5_IB_STAGE_DELAY_DROP, MLX5_IB_STAGE_RESTRACK,
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/14664 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/S...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/14664 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/S...