From: Israel Rukshin israelr@mellanox.com
mainline inclusion
from mainline-v5.7-rc1
commit b780d7415aacec855e2f2370cbf98f918b224903
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
If nvme_sysfs_delete() is called by the user before the transport has taken its ctrl reference, the ctrl may be freed while it is still being created, causing a use-after-free. Take the reference as soon as the controller becomes externally visible, which happens when cdev_device_add() is called in nvme_init_ctrl(). Also take the reference count at the core layer instead of taking it in each transport separately.
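For orientation, a minimal sketch of the resulting order of operations in nvme_init_ctrl() (simplified from the hunks below, not the literal patched code):

	nvme_get_ctrl(ctrl);			/* ref for the externally visible controller */
	cdev_init(&ctrl->cdev, &nvme_dev_fops);
	ctrl->cdev.owner = ops->module;
	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
	if (ret)
		goto out_free_name;		/* the error path drops the ref again */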
Signed-off-by: Israel Rukshin israelr@mellanox.com
Reviewed-by: Max Gurtovoy maxg@mellanox.com
Reviewed-by: Christoph Hellwig hch@lst.de
Signed-off-by: Keith Busch kbusch@kernel.org

Conflicts:
	drivers/nvme/host/tcp.c
[No code about TCP in current version.]

Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c   | 2 ++
 drivers/nvme/host/fc.c     | 4 +---
 drivers/nvme/host/pci.c    | 1 -
 drivers/nvme/host/rdma.c   | 3 +--
 drivers/nvme/target/loop.c | 3 +--
 5 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b8446637b3c2..b7a40ac4b637 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3779,6 +3779,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	if (ret)
 		goto out_release_instance;
 
+	nvme_get_ctrl(ctrl);
 	cdev_init(&ctrl->cdev, &nvme_dev_fops);
 	ctrl->cdev.owner = ops->module;
 	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
@@ -3795,6 +3796,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 
 	return 0;
 out_free_name:
+	nvme_put_ctrl(ctrl);
 	kfree_const(ctrl->device->kobj.name);
 out_release_instance:
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index ed88d5021772..d75ae1b201ad 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3146,10 +3146,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 		goto fail_ctrl;
 	}
 
-	nvme_get_ctrl(&ctrl->ctrl);
-
 	if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) {
-		nvme_put_ctrl(&ctrl->ctrl);
 		dev_err(ctrl->ctrl.device,
 			"NVME-FC{%d}: failed to schedule initial connect\n",
 			ctrl->cnum);
@@ -3174,6 +3171,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 	/* initiate nvme ctrl ref counting teardown */
 	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
 
 	/* Remove core ctrl ref. */
 	nvme_put_ctrl(&ctrl->ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f194bb7ccb04..1a0eec110614 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2558,7 +2558,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
 	nvme_reset_ctrl(&dev->ctrl);
-	nvme_get_ctrl(&dev->ctrl);
 	async_schedule(nvme_async_probe, dev);
 
 	return 0;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d0cf19e593b2..0e998e85e962 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2050,8 +2050,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
 		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
 
-	nvme_get_ctrl(&ctrl->ctrl);
-
 	mutex_lock(&nvme_rdma_ctrl_mutex);
 	list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
 	mutex_unlock(&nvme_rdma_ctrl_mutex);
@@ -2061,6 +2059,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
 	if (ret > 0)
 		ret = -EIO;
 	return ERR_PTR(ret);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 137a27fa369c..e30080c629fd 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -639,8 +639,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 	dev_info(ctrl->ctrl.device,
 		 "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
 
-	nvme_get_ctrl(&ctrl->ctrl);
-
 	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
 	WARN_ON_ONCE(!changed);
 
@@ -658,6 +656,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 	kfree(ctrl->queues);
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
 out_put_ctrl:
 	nvme_put_ctrl(&ctrl->ctrl);
 	if (ret > 0)
From: Israel Rukshin israelr@mellanox.com
mainline inclusion
from mainline-v5.7-rc1
commit 726612b6b8259afa41d265a2722991c87f059223
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
Put the ctrl reference count in nvme_uninit_ctrl(), pairing with nvme_init_ctrl(), which takes it. This decreases the reference count at the core layer instead of decreasing it in each transport separately. Also move the nvme_uninit_ctrl() call in the PCI driver to after nvme_release_prp_pools() and nvme_dev_unmap(), so the reference is put only after the dev is no longer used. This is safe because those functions use the nvme_dev, which is freed only later, in nvme_pci_free_ctrl().
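A minimal sketch of the resulting pairing (simplified from the hunks below):

/*
 * nvme_uninit_ctrl() now drops the reference that nvme_init_ctrl() took.
 * The PCI remove path may call it after releasing the prp pools and the
 * BAR mapping because the nvme_dev itself is only freed later, in
 * nvme_pci_free_ctrl(), once the last reference goes away.
 */
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
	dev_pm_qos_hide_latency_tolerance(ctrl->device);
	cdev_device_del(&ctrl->cdev, ctrl->device);
	nvme_put_ctrl(ctrl);	/* pairs with the get in nvme_init_ctrl() */
}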
Signed-off-by: Israel Rukshin israelr@mellanox.com
Reviewed-by: Christoph Hellwig hch@lst.de
Signed-off-by: Keith Busch kbusch@kernel.org

Conflicts:
	drivers/nvme/host/tcp.c
	drivers/nvme/host/core.c
[No code about TCP in current version and adjust context.]

Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c   | 2 +-
 drivers/nvme/host/fc.c     | 1 -
 drivers/nvme/host/pci.c    | 3 +--
 drivers/nvme/host/rdma.c   | 1 -
 drivers/nvme/target/loop.c | 2 --
 5 files changed, 2 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b7a40ac4b637..d1ee577054b4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -165,7 +165,6 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 	nvme_remove_namespaces(ctrl);
 	ctrl->ops->delete_ctrl(ctrl);
 	nvme_uninit_ctrl(ctrl);
-	nvme_put_ctrl(ctrl);
 }
 
 static void nvme_delete_ctrl_work(struct work_struct *work)
@@ -3700,6 +3699,7 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
 {
 	dev_pm_qos_hide_latency_tolerance(ctrl->device);
 	cdev_device_del(&ctrl->cdev, ctrl->device);
+	nvme_put_ctrl(ctrl);
 }
 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
 
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index d75ae1b201ad..2aac21c53aa9 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3171,7 +3171,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 	/* initiate nvme ctrl ref counting teardown */
 	nvme_uninit_ctrl(&ctrl->ctrl);
-	nvme_put_ctrl(&ctrl->ctrl);
 
 	/* Remove core ctrl ref. */
 	nvme_put_ctrl(&ctrl->ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 1a0eec110614..ce340d5cd575 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2620,10 +2620,9 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_free_host_mem(dev);
 	nvme_dev_remove_admin(dev);
 	nvme_free_queues(dev, 0);
-	nvme_uninit_ctrl(&dev->ctrl);
 	nvme_release_prp_pools(dev);
 	nvme_dev_unmap(dev);
-	nvme_put_ctrl(&dev->ctrl);
+	nvme_uninit_ctrl(&dev->ctrl);
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0e998e85e962..b91444c1fd45 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2059,7 +2059,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
-	nvme_put_ctrl(&ctrl->ctrl);
 	if (ret > 0)
 		ret = -EIO;
 	return ERR_PTR(ret);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index e30080c629fd..122fbfc31889 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -506,7 +506,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
 out_disable:
 	dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
 	nvme_uninit_ctrl(&ctrl->ctrl);
-	nvme_put_ctrl(&ctrl->ctrl);
 }
 
 static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
@@ -656,7 +655,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 	kfree(ctrl->queues);
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
-	nvme_put_ctrl(&ctrl->ctrl);
 out_put_ctrl:
 	nvme_put_ctrl(&ctrl->ctrl);
 	if (ret > 0)
From: Keith Busch kbusch@kernel.org
mainline inclusion
from mainline-v5.8-rc1
commit d567572906d986dedb78b37f111c44eba033f3ef
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
The driver had been unlinking the namespace head from the subsystem's list only after the last reference was released, and outside of the list's subsys->lock protection.
There is no reason to track an empty head, so unlink the entry from the subsystem's list when the last namespace using that head is removed and with the mutex lock protecting the list update. The next namespace to attach reusing the previous NSID will allocate a new head rather than find the old head with mismatched identifiers.
Signed-off-by: Keith Busch kbusch@kernel.org
Reviewed-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Christoph Hellwig hch@lst.de
Signed-off-by: Jens Axboe axboe@kernel.dk
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d1ee577054b4..f28b56daa846 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -397,7 +397,6 @@ static void nvme_free_ns_head(struct kref *ref)
 
 	nvme_mpath_remove_disk(head);
 	ida_simple_remove(&head->subsys->ns_ida, head->instance);
-	list_del_init(&head->entry);
 	cleanup_srcu_struct_quiesced(&head->srcu);
 	nvme_put_subsystem(head->subsys);
 	kfree(head);
@@ -3093,7 +3092,6 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys,
 
 	list_for_each_entry(h, &subsys->nsheads, entry) {
 		if (nvme_ns_ids_valid(&new->ids) &&
-		    !list_empty(&h->list) &&
 		    nvme_ns_ids_equal(&new->ids, &h->ids))
 			return -EINVAL;
 	}
@@ -3321,6 +3319,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
+	if (list_empty(&ns->head->list))
+		list_del_init(&ns->head->entry);
 	mutex_unlock(&ctrl->subsys->lock);
 	nvme_put_ns_head(ns->head);
 out_free_id:
@@ -3340,7 +3340,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 
 	mutex_lock(&ns->ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
+	if (list_empty(&ns->head->list))
+		list_del_init(&ns->head->entry);
 	mutex_unlock(&ns->ctrl->subsys->lock);
+	synchronize_rcu(); /* guarantee not available in head->list */
 	nvme_mpath_clear_current_path(ns);
 	synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
 
From: Keith Busch kbusch@kernel.org
mainline inclusion
from mainline-v5.8-rc1
commit ac262508daa88fb12c5dc53cf30bde163f9f26c9
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
If a namespace identification does not match the subsystem's head for that NSID, release the reference that was taken when the matching head was initially found.
Signed-off-by: Keith Busch kbusch@kernel.org
Reviewed-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Christoph Hellwig hch@lst.de
Signed-off-by: Jens Axboe axboe@kernel.dk
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f28b56daa846..490683b945c6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3174,6 +3174,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
 				"IDs don't match for shared namespace %d\n",
 					nsid);
 			ret = -EINVAL;
+			nvme_put_ns_head(head);
 			goto out_unlock;
 		}
 	}
From: Keith Busch kbusch@kernel.org
mainline inclusion
from mainline-v5.8-rc1
commit b2b2de7c5a0127882848f9c5e9379e84d2e02041
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
If the namespace identifiers have changed, skip updating the disk information, as that will register parameters from a mismatched namespace.
Signed-off-by: Keith Busch kbusch@kernel.org
Reviewed-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Christoph Hellwig hch@lst.de
Signed-off-by: Jens Axboe axboe@kernel.dk

Conflicts:
	drivers/nvme/host/core.c
[adjust context]

Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 490683b945c6..4a39377869fa 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1689,14 +1689,15 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 		goto free_id;
 	}
 
-	__nvme_revalidate_disk(disk, id);
 	nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
 	if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
 		dev_err(ctrl->device,
 			"identifiers changed for nsid %d\n", ns->head->ns_id);
 		ret = -ENODEV;
+		goto free_id;
 	}
 
+	__nvme_revalidate_disk(disk, id);
 free_id:
 	kfree(id);
 out:
From: Wu Bo wubo40@huawei.com
mainline inclusion
from mainline-v5.8-rc1
commit 84e4c204b6a0e81f56bd7d1254123390ef0498c8
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
Disable streams again if getting the stream params fails.
Signed-off-by: Wu Bo wubo40@huawei.com
Signed-off-by: Christoph Hellwig hch@lst.de
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4a39377869fa..a0a9042e7932 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -516,19 +516,22 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl)
 
 	ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
 	if (ret)
-		return ret;
+		goto out_disable_stream;
 
 	ctrl->nssa = le16_to_cpu(s.nssa);
 	if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
 		dev_info(ctrl->device, "too few streams (%u) available\n",
					ctrl->nssa);
-		nvme_disable_streams(ctrl);
-		return 0;
+		goto out_disable_stream;
 	}
 
 	ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
 	dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
 	return 0;
+
+out_disable_stream:
+	nvme_disable_streams(ctrl);
+	return ret;
 }
 
 /*
From: Logan Gunthorpe logang@deltatee.com
mainline inclusion
from mainline-v5.9-rc1
commit 2bf5d3bbffad06639db518c8dcf8a14c7dce770e
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
The host drivers decide whether to use SGLs or PRPs, and they currently assume the flags are cleared after the call to nvme_setup_cmd(). However, passed-through commands may erroneously set these bits, so clear them for all cases.
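A sketch of the helper this patch adds (it mirrors the hunk below); NVME_CMD_SGL_ALL masks the SGL selection bits of the command's common flags:

static void nvme_setup_passthrough(struct request *req,
		struct nvme_command *cmd)
{
	memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
	/*
	 * A passed-through command may arrive with the SGL bits already
	 * set; the transport picks PRPs or SGLs itself, so always start
	 * from a clean flags field.
	 */
	cmd->common.flags &= ~NVME_CMD_SGL_ALL;
}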
Signed-off-by: Logan Gunthorpe logang@deltatee.com
Reviewed-by: Keith Busch kbusch@kernel.org
Reviewed-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Christoph Hellwig hch@lst.de
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a0a9042e7932..3776c6ff557b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -559,6 +559,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
 }
 
+static void nvme_setup_passthrough(struct request *req,
+		struct nvme_command *cmd)
+{
+	memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+	/* passthru commands should let the driver set the SGL flags */
+	cmd->common.flags &= ~NVME_CMD_SGL_ALL;
+}
+
 static inline void nvme_setup_flush(struct nvme_ns *ns,
 		struct nvme_command *cmnd)
 {
@@ -716,7 +724,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 	switch (req_op(req)) {
 	case REQ_OP_DRV_IN:
 	case REQ_OP_DRV_OUT:
-		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+		nvme_setup_passthrough(req, cmd);
 		break;
 	case REQ_OP_FLUSH:
 		nvme_setup_flush(ns, cmd);
From: Anton Eidelman anton@lightbitslabs.com
mainline inclusion
from mainline-v5.8-rc3
commit e164471dcf19308d154adb69e7760d8ba426a77f
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
nvme: don't protect ns mutation with ns->head->lock

Right now ns->head->lock protects namespace mutation, which is wrong and unneeded. Move it to protect only against head mutations. While we're at it, remove the unnecessary ns->head dereferences, as we already have a head pointer.
The problem with this is that head->lock spans mpath disk node I/O that may block under some conditions (for example if the controller is disconnecting or the path became inaccessible). The locking scheme then does not allow any other path to enable itself, preventing the blocked I/O from completing and making forward progress from there.
This is a preparation patch for the fix in a subsequent patch where the disk I/O will also be done outside the head->lock.
Fixes: 0d0b660f214d ("nvme: add ANA support")
Signed-off-by: Anton Eidelman anton@lightbitslabs.com
Signed-off-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Christoph Hellwig hch@lst.de

Conflicts:
	drivers/nvme/host/multipath.c
[adjust context]

Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/multipath.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index bf22bbb56bda..f83286f5ec1a 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -309,11 +309,10 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
 {
 	struct nvme_ns_head *head = ns->head;
 
-	lockdep_assert_held(&ns->head->lock);
-
 	if (!head->disk)
 		return;
 
+	mutex_lock(&head->lock);
 	if (!(head->disk->flags & GENHD_FL_UP)) {
 		device_add_disk(&head->subsys->dev, head->disk);
 		if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
@@ -321,9 +320,10 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
 			dev_warn(&head->subsys->dev,
 				 "failed to create id group.\n");
 	}
+	mutex_unlock(&head->lock);
 
-	synchronize_srcu(&ns->head->srcu);
-	kblockd_schedule_work(&ns->head->requeue_work);
+	synchronize_srcu(&head->srcu);
+	kblockd_schedule_work(&head->requeue_work);
 }
 
 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
@@ -378,14 +378,12 @@ static inline bool nvme_state_is_live(enum nvme_ana_state state)
 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 		struct nvme_ns *ns)
 {
-	mutex_lock(&ns->head->lock);
 	ns->ana_grpid = le32_to_cpu(desc->grpid);
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
 
 	if (nvme_state_is_live(ns->ana_state))
 		nvme_mpath_set_live(ns);
-	mutex_unlock(&ns->head->lock);
 }
 
 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ -527,10 +525,8 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 			nvme_update_ns_ana_state(&desc, ns);
 		}
 	} else {
-		mutex_lock(&ns->head->lock);
 		ns->ana_state = NVME_ANA_OPTIMIZED;
 		nvme_mpath_set_live(ns);
-		mutex_unlock(&ns->head->lock);
 	}
 
 	if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
From: Anton Eidelman anton@lightbitslabs.com
mainline inclusion
from mainline-v5.8-rc3
commit d8a22f85609fadb46ba699e0136cc3ebdeebff79
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
nvme-multipath: fix deadlock due to head->lock

In the following scenario scan_work and ana_work will deadlock:
When scan_work calls nvme_mpath_add_disk() this holds ana_lock and invokes nvme_parse_ana_log(), which may issue IO in device_add_disk() and hang waiting for an accessible path.
While nvme_mpath_set_live() is only called when nvme_state_is_live(), a transition may cause NVME_SC_ANA_TRANSITION and requeue the I/O.

Since nvme_mpath_set_live() holds ns->head->lock, an ana_work on ANY ctrl will not be able to complete nvme_mpath_set_live() on the same ns->head, which is required in order to update the new accessible path and clear NVME_NS_ANA_PENDING. Therefore the I/O never completes: deadlock [1].
Fix: move device_add_disk() out of the head->lock and protect it with an atomic test_and_set on a new NVME_NSHEAD_DISK_LIVE bit.
[1]:
kernel: INFO: task kworker/u8:2:160 blocked for more than 120 seconds.
kernel:       Tainted: G           OE     5.3.5-050305-generic #201910071830
kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kernel: kworker/u8:2    D    0   160      2 0x80004000
kernel: Workqueue: nvme-wq nvme_ana_work [nvme_core]
kernel: Call Trace:
kernel:  __schedule+0x2b9/0x6c0
kernel:  schedule+0x42/0xb0
kernel:  schedule_preempt_disabled+0xe/0x10
kernel:  __mutex_lock.isra.0+0x182/0x4f0
kernel:  __mutex_lock_slowpath+0x13/0x20
kernel:  mutex_lock+0x2e/0x40
kernel:  nvme_update_ns_ana_state+0x22/0x60 [nvme_core]
kernel:  nvme_update_ana_state+0xca/0xe0 [nvme_core]
kernel:  nvme_parse_ana_log+0xa1/0x180 [nvme_core]
kernel:  nvme_read_ana_log+0x76/0x100 [nvme_core]
kernel:  nvme_ana_work+0x15/0x20 [nvme_core]
kernel:  process_one_work+0x1db/0x380
kernel:  worker_thread+0x4d/0x400
kernel:  kthread+0x104/0x140
kernel:  ret_from_fork+0x35/0x40
kernel: INFO: task kworker/u8:4:439 blocked for more than 120 seconds.
kernel:       Tainted: G           OE     5.3.5-050305-generic #201910071830
kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kernel: kworker/u8:4    D    0   439      2 0x80004000
kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core]
kernel: Call Trace:
kernel:  __schedule+0x2b9/0x6c0
kernel:  schedule+0x42/0xb0
kernel:  io_schedule+0x16/0x40
kernel:  do_read_cache_page+0x438/0x830
kernel:  read_cache_page+0x12/0x20
kernel:  read_dev_sector+0x27/0xc0
kernel:  read_lba+0xc1/0x220
kernel:  efi_partition+0x1e6/0x708
kernel:  check_partition+0x154/0x244
kernel:  rescan_partitions+0xae/0x280
kernel:  __blkdev_get+0x40f/0x560
kernel:  blkdev_get+0x3d/0x140
kernel:  __device_add_disk+0x388/0x480
kernel:  device_add_disk+0x13/0x20
kernel:  nvme_mpath_set_live+0x119/0x140 [nvme_core]
kernel:  nvme_update_ns_ana_state+0x5c/0x60 [nvme_core]
kernel:  nvme_mpath_add_disk+0xbe/0x100 [nvme_core]
kernel:  nvme_validate_ns+0x396/0x940 [nvme_core]
kernel:  nvme_scan_work+0x256/0x390 [nvme_core]
kernel:  process_one_work+0x1db/0x380
kernel:  worker_thread+0x4d/0x400
kernel:  kthread+0x104/0x140
kernel:  ret_from_fork+0x35/0x40
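The resulting nvme_mpath_set_live() logic, sketched for clarity (simplified from the hunk below):

	/*
	 * test_and_set_bit() guarantees device_add_disk() runs exactly once
	 * per ns_head without holding head->lock, so ana_work on another
	 * controller can still enable a path while this I/O is blocked.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
		device_add_disk(&head->subsys->dev, head->disk);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);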
Fixes: 0d0b660f214d ("nvme: add ANA support")
Signed-off-by: Anton Eidelman anton@lightbitslabs.com
Signed-off-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Christoph Hellwig hch@lst.de

Conflicts:
	drivers/nvme/host/multipath.c
	drivers/nvme/host/nvme.h
[No need to use head->lock in nvme_mpath_set_live function]

Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/multipath.c | 4 +---
 drivers/nvme/host/nvme.h      | 2 ++
 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f83286f5ec1a..ed902d3ae35b 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -312,15 +312,13 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
 	if (!head->disk)
 		return;
 
-	mutex_lock(&head->lock);
-	if (!(head->disk->flags & GENHD_FL_UP)) {
+	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
 		device_add_disk(&head->subsys->dev, head->disk);
 		if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
 				&nvme_ns_id_attr_group))
 			dev_warn(&head->subsys->dev,
 				 "failed to create id group.\n");
 	}
-	mutex_unlock(&head->lock);
 
 	synchronize_srcu(&head->srcu);
 	kblockd_schedule_work(&head->requeue_work);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 5146aa961ab2..d8399082b3d2 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -290,6 +290,8 @@ struct nvme_ns_head {
 	spinlock_t		requeue_lock;
 	struct work_struct	requeue_work;
 	struct mutex		lock;
+	unsigned long		flags;
+#define NVME_NSHEAD_DISK_LIVE	0
 #endif
 	struct list_head	list;
 	struct srcu_struct	srcu;
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion
from mainline-v5.8-rc3
commit c31244669f57963b6ce133a5555b118fc50aec95
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
The mpath disk node takes a reference on the mpath request queue when adding a live path to the mpath gendisk. However, if we connected only to an inaccessible path, device_add_disk() is never called, so if we then disconnect and remove the mpath gendisk we end up putting a reference on the request queue that was never taken [1].

Fix that by checking whether a live path was ever added (using the NVME_NSHEAD_DISK_LIVE flag) and, if not, clearing the disk->queue reference.
[1]:
------------[ cut here ]------------
refcount_t: underflow; use-after-free.
WARNING: CPU: 1 PID: 1372 at lib/refcount.c:28 refcount_warn_saturate+0xa6/0xf0
CPU: 1 PID: 1372 Comm: nvme Tainted: G           O      5.7.0-rc2+ #3
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1 04/01/2014
RIP: 0010:refcount_warn_saturate+0xa6/0xf0
RSP: 0018:ffffb29e8053bdc0 EFLAGS: 00010282
RAX: 0000000000000000 RBX: ffff8b7a2f4fc060 RCX: 0000000000000007
RDX: 0000000000000007 RSI: 0000000000000092 RDI: ffff8b7a3ec99980
RBP: ffff8b7a2f4fc000 R08: 00000000000002e1 R09: 0000000000000004
R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
R13: fffffffffffffff2 R14: ffffb29e8053bf08 R15: ffff8b7a320e2da0
FS:  00007f135d4ca800(0000) GS:ffff8b7a3ec80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005651178c0c30 CR3: 000000003b650005 CR4: 0000000000360ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 disk_release+0xa2/0xc0
 device_release+0x28/0x80
 kobject_put+0xa5/0x1b0
 nvme_put_ns_head+0x26/0x70 [nvme_core]
 nvme_put_ns+0x30/0x60 [nvme_core]
 nvme_remove_namespaces+0x9b/0xe0 [nvme_core]
 nvme_do_delete_ctrl+0x43/0x5c [nvme_core]
 nvme_sysfs_delete.cold+0x8/0xd [nvme_core]
 kernfs_fop_write+0xc1/0x1a0
 vfs_write+0xb6/0x1a0
 ksys_write+0x5f/0xe0
 do_syscall_64+0x52/0x1a0
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
Reported-by: Anton Eidelman anton@lightbitslabs.com
Tested-by: Anton Eidelman anton@lightbitslabs.com
Signed-off-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Christoph Hellwig hch@lst.de
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/multipath.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index ed902d3ae35b..8b73ec23614a 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -550,6 +550,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 	kblockd_schedule_work(&head->requeue_work);
 	flush_work(&head->requeue_work);
 	blk_cleanup_queue(head->disk->queue);
+	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+		/*
+		 * if device_add_disk wasn't called, prevent
+		 * disk release to put a bogus reference on the
+		 * request queue
+		 */
+		head->disk->queue = NULL;
+	}
 	put_disk(head->disk);
 }
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion
from mainline-v5.3-rc5
commit 0157ec8dad3c8fc9bc9790f76e0831ffdaf2e7f0
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
With multipath enabled, nvme_scan_work() can read from the device (through nvme_mpath_add_disk()) and hang [1]. However, with fabrics, once ctrl->state is set to NVME_CTRL_DELETING, the reads will hang (see nvmf_check_ready()), and the mpath stack device make_request will block if head->list is not empty. When head->list consists only of DELETING/DEAD controllers, we should actually not block, but rather fail immediately.

In addition, before we go ahead and remove the namespaces, make sure to clear the current path and kick the requeue list so that requeued requests will fail fast.
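A sketch of the path-availability check this introduces (simplified from the multipath.c hunk below): a head keeps requeueing I/O only while at least one of its controllers can still become usable.

static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* this path may still become usable */
			return true;
		default:
			break;
		}
	}
	/* only DELETING/DEAD controllers are left: fail the bio instead */
	return false;
}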
[1]:

Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c      |  7 +++++++
 drivers/nvme/host/multipath.c | 35 ++++++++++++++++++++++++++++++++---
 drivers/nvme/host/nvme.h      | 14 +++++++++++---
 3 files changed, 50 insertions(+), 6 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3776c6ff557b..60b42d778007 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3532,6 +3532,13 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(ns_list);
 
+	/*
+	 * make sure to requeue I/O to all namespaces as these
+	 * might result from the scan itself and must complete
+	 * for the scan_work to make progress
+	 */
+	nvme_mpath_clear_ctrl_paths(ctrl);
+
 	/* prevent racing with ns scanning */
 	flush_work(&ctrl->scan_work);
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8b73ec23614a..bf349af289f6 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,6 +141,17 @@ static const char *nvme_ana_state_names[] = {
 	[NVME_ANA_CHANGE]		= "change",
 };
 
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->scan_lock);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		if (nvme_mpath_clear_current_path(ns))
+			kblockd_schedule_work(&ns->head->requeue_work);
+	mutex_unlock(&ctrl->scan_lock);
+}
+
 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns, *fallback = NULL;
@@ -181,6 +192,24 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	return ns;
 }
 
+static bool nvme_available_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *ns;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		switch (ns->ctrl->state) {
+		case NVME_CTRL_LIVE:
+		case NVME_CTRL_RESETTING:
+		case NVME_CTRL_CONNECTING:
+			/* fallthru */
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
 static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 		struct bio *bio)
 {
@@ -199,14 +228,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 			      disk_devt(ns->head->disk),
 			      bio->bi_iter.bi_sector);
 		ret = direct_make_request(bio);
-	} else if (!list_empty_careful(&head->list)) {
-		dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");
+	} else if (nvme_available_path(head)) {
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 
 		spin_lock_irq(&head->requeue_lock);
 		bio_list_add(&head->requeue_list, bio);
 		spin_unlock_irq(&head->requeue_lock);
 	} else {
-		dev_warn_ratelimited(dev, "no path - failing I/O\n");
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
 
 		bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d8399082b3d2..9516b1e129d9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -490,13 +490,17 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 
-static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
+static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
 	struct nvme_ns_head *head = ns->head;
 
-	if (head && ns == rcu_access_pointer(head->current_path))
+	if (head && ns == rcu_access_pointer(head->current_path)) {
 		rcu_assign_pointer(head->current_path, NULL);
+		return true;
+	}
+	return false;
 }
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
 struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
 
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
@@ -554,7 +558,11 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
 static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 }
-static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
+static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
+{
+	return false;
+}
+static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 }
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
From: Anton Eidelman anton@lightbitslabs.com
mainline inclusion
from mainline-v5.4-rc7
commit 763303a83a095a88c3a8a0d1abf97165db2e8bf5
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
nvme_mpath_clear_ctrl_paths() iterates through the ctrl->namespaces list while holding ctrl->scan_lock. This does not seem to be the correct way of protecting from concurrent list modification.
Specifically, nvme_scan_work() sorts ctrl->namespaces AFTER unlocking scan_lock.
This may result in the following (rare) crash in ctrl disconnect during scan_work:
BUG: kernel NULL pointer dereference, address: 0000000000000050
Oops: 0000 [#1] SMP PTI
CPU: 0 PID: 3995 Comm: nvme 5.3.5-050305-generic
RIP: 0010:nvme_mpath_clear_current_path+0xe/0x90 [nvme_core]
...
Call Trace:
 nvme_mpath_clear_ctrl_paths+0x3c/0x70 [nvme_core]
 nvme_remove_namespaces+0x35/0xe0 [nvme_core]
 nvme_do_delete_ctrl+0x47/0x90 [nvme_core]
 nvme_sysfs_delete+0x49/0x60 [nvme_core]
 dev_attr_store+0x17/0x30
 sysfs_kf_write+0x3e/0x50
 kernfs_fop_write+0x11e/0x1a0
 __vfs_write+0x1b/0x40
 vfs_write+0xb9/0x1a0
 ksys_write+0x67/0xe0
 __x64_sys_write+0x1a/0x20
 do_syscall_64+0x5a/0x130
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f8d02bfb154
Fix: after taking scan_lock in nvme_mpath_clear_ctrl_paths(), also take down_read(&ctrl->namespaces_rwsem) to make the list traversal safe. This will not cause deadlocks because scan_lock is never taken while holding namespaces_rwsem. Moreover, scan work takes namespaces_rwsem in the same order.
Alternative: sort ctrl->namespaces in nvme_scan_work() while still holding the scan_lock. This would leave nvme_mpath_clear_ctrl_paths() without correct protection against ctrl->namespaces modification by anyone other than scan_work.
Reviewed-by: Sagi Grimberg sagi@grimberg.me
Reviewed-by: Christoph Hellwig hch@lst.de
Signed-off-by: Anton Eidelman anton@lightbitslabs.com
Signed-off-by: Keith Busch kbusch@kernel.org
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/multipath.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index bf349af289f6..e1fc9ffbd3ee 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -146,9 +146,11 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns;
 
 	mutex_lock(&ctrl->scan_lock);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		if (nvme_mpath_clear_current_path(ns))
 			kblockd_schedule_work(&ns->head->requeue_work);
+	up_read(&ctrl->namespaces_rwsem);
 	mutex_unlock(&ctrl->scan_lock);
 }
 
From: Shusong Tao taoshusong@huawei.com
mainline inclusion
from mainline-v5.10-rc1
commit a87da50f39d467f2ea4c1f98decb72ef6d87a31e
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
A crash happened during error-injection testing. The CQE carried an incorrect command id, so the host may look up a request that has already been freed. Dereferencing req->mr->rkey in nvme_rdma_process_nvme_rsp() then crashes because the MR has already been freed.

Add a check that the MR is still present to fix it.
Signed-off-by: Shusong Tao taoshusong@huawei.com
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/rdma.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b91444c1fd45..db2bed55bd68 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1425,10 +1425,11 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 	req->result = cqe->result;
 
 	if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
-		if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) {
+		if (unlikely(!req->mr ||
+			     wc->ex.invalidate_rkey != req->mr->rkey)) {
 			dev_err(queue->ctrl->ctrl.device,
 				"Bogus remote invalidation for rkey %#x\n",
-				req->mr->rkey);
+				req->mr ? req->mr->rkey : 0);
 			nvme_rdma_error_recovery(queue->ctrl);
 		}
 	} else if (req->mr) {
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion
from mainline-v5.10-rc3
commit fdf58e02adecbef4c7cbb2073d8ea225e6fd5f26
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
The request may be completed asynchronously, and rq->state may already have moved back to IDLE. To avoid completing a request twice, nvme_rdma_complete_timed_out() only checks rq->state against MQ_RQ_COMPLETE. That is not safe, so also check that the request is not IDLE, i.e. that it has actually been started (blk_mq_request_started()).
Signed-off-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Chao Leng lengchao@huawei.com
Signed-off-by: Ruozhu Li liruozhu@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index db2bed55bd68..e31d340496b1 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1677,7 +1677,7 @@ static void nvme_rdma_complete_timed_out(struct request *rq)
 	/* fence other contexts that may complete the command */
 	mutex_lock(&ctrl->teardown_lock);
 	nvme_rdma_stop_queue(queue);
-	if (!blk_mq_request_completed(rq)) {
+	if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
 		nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
 		blk_mq_complete_request(rq);
 	}
From: Chao Leng lengchao@huawei.com
mainline inclusion
from mainline-v5.10-rc3
commit 04800fbff4764ab7b32c49d19628605a5d4cb85c
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
Introduce nvme_sync_io_queues() for scenarios that only need to sync the I/O queues rather than all queues.
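A minimal sketch of how a transport is expected to use it; the rdma caller is only added by a later patch in this series, so the ordering shown here is an assumption drawn from that patch:

	/*
	 * Teardown ordering: quiesce the queues first, then have
	 * nvme_sync_io_queues() run blk_sync_queue() on each of them so
	 * pending timeout work has finished before requests are cancelled
	 * and freed.
	 */
	nvme_stop_queues(&ctrl->ctrl);
	nvme_sync_io_queues(&ctrl->ctrl);
	nvme_rdma_stop_io_queues(ctrl);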
Signed-off-by: Chao Leng lengchao@huawei.com
Reviewed-by: Sagi Grimberg sagi@grimberg.me
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Ruozhu Li liruozhu@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/core.c | 11 +++++++++++
 drivers/nvme/host/nvme.h |  1 +
 2 files changed, 12 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 60b42d778007..8443c6c5c5af 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3923,6 +3923,17 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
+
 int __init nvme_core_init(void)
 {
 	int result = -ENOMEM;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9516b1e129d9..5699397d3a5d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -440,6 +440,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
+void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
 void nvme_unfreeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
From: Chao Leng lengchao@huawei.com
mainline inclusion
from mainline-v5.10-rc3
commit 3017013dcc82a4862bd1e140f8b762cfc594008d
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
Right now teardown_lock serializes timeout handling and teardown. This can go wrong: teardown first cancels all requests, and a timeout may then complete a request again, even though the request may already have been freed or restarted.

To avoid the race between timeout and teardown, the teardown path now first quiesces the queue and then deletes the timer and cancels the timeout work for the queue. With that in place, teardown_lock can be removed.
Signed-off-by: Chao Leng lengchao@huawei.com
Reviewed-by: Sagi Grimberg sagi@grimberg.me
Signed-off-by: Ruozhu Li liruozhu@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/rdma.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index e31d340496b1..edd7bdd36906 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -118,7 +118,6 @@ struct nvme_rdma_ctrl {
 	struct sockaddr_storage src_addr;
 
 	struct nvme_ctrl	ctrl;
-	struct mutex		teardown_lock;
 	bool			use_inline_data;
 };
 
@@ -867,8 +866,8 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		bool remove)
 {
-	mutex_lock(&ctrl->teardown_lock);
 	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	blk_sync_queue(ctrl->ctrl.admin_q);
 	nvme_rdma_stop_queue(&ctrl->queues[0]);
 	if (ctrl->ctrl.admin_tagset) {
 		blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
@@ -877,15 +876,14 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	}
 	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	nvme_rdma_destroy_admin_queue(ctrl, remove);
-	mutex_unlock(&ctrl->teardown_lock);
 }
 
 static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
 		bool remove)
 {
-	mutex_lock(&ctrl->teardown_lock);
 	if (ctrl->ctrl.queue_count > 1) {
 		nvme_stop_queues(&ctrl->ctrl);
+		nvme_sync_io_queues(&ctrl->ctrl);
 		nvme_rdma_stop_io_queues(ctrl);
 		if (ctrl->ctrl.tagset) {
 			blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
@@ -896,7 +894,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
 		nvme_start_queues(&ctrl->ctrl);
 		nvme_rdma_destroy_io_queues(ctrl, remove);
 	}
-	mutex_unlock(&ctrl->teardown_lock);
 }
 
 static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
@@ -1672,16 +1669,12 @@ static void nvme_rdma_complete_timed_out(struct request *rq)
 {
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_rdma_queue *queue = req->queue;
-	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
 
-	/* fence other contexts that may complete the command */
-	mutex_lock(&ctrl->teardown_lock);
 	nvme_rdma_stop_queue(queue);
 	if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
 		nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
 		blk_mq_complete_request(rq);
 	}
-	mutex_unlock(&ctrl->teardown_lock);
 }
 
 static enum blk_eh_timer_return
@@ -1992,7 +1985,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 		return ERR_PTR(-ENOMEM);
 	ctrl->ctrl.opts = opts;
 	INIT_LIST_HEAD(&ctrl->list);
-	mutex_init(&ctrl->teardown_lock);
 
 	if (opts->mask & NVMF_OPT_TRSVCID)
 		port = opts->trsvcid;
From: Ruozhu Li liruozhu@huawei.com
driver inclusion
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
If nvme_rdma_wait_for_cm() times out before address resolution itself times out, we destroy the cm_id while it is still resolving the address, which causes a crash. So change the wait-for-cm timeout to double the resolve-address timeout.
Signed-off-by: Ruozhu Li liruozhu@huawei.com
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index edd7bdd36906..6ed0083f4629 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -240,7 +240,7 @@ static void nvme_rdma_qp_event(struct ib_event *event, void *context)
 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
 {
 	wait_for_completion_interruptible_timeout(&queue->cm_done,
-			msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
+			msecs_to_jiffies(2 * NVME_RDMA_CONNECT_TIMEOUT_MS));
 	return queue->cm_error;
 }
 
From: Ruozhu Li liruozhu@huawei.com
driver inclusion
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
Add a module parameter "enable_inline_data" to the nvme_rdma module, so the inline data feature of the RDMA transport can be turned off dynamically.
Signed-off-by: Ruozhu Li liruozhu@huawei.com
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/rdma.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 6ed0083f4629..912b15d04f1c 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -141,6 +141,10 @@ static bool register_always = true;
 module_param(register_always, bool, 0444);
 MODULE_PARM_DESC(register_always,
	 "Use memory registration even for contiguous memory regions");
+static bool enable_inline_data = true;
+module_param(enable_inline_data, bool, 0644);
+MODULE_PARM_DESC(enable_inline_data,
+	 "global switch for inline data when use rdma transport");
 
 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 		struct rdma_cm_event *event);
@@ -1256,7 +1260,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 
 	if (count <= dev->num_inline_segments) {
 		if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
-		    queue->ctrl->use_inline_data &&
+		    queue->ctrl->use_inline_data && enable_inline_data &&
 		    blk_rq_payload_bytes(rq) <=
				nvme_rdma_inline_data_size(queue)) {
 			ret = nvme_rdma_map_sg_inline(queue, req, c, count);
From: Tao Jiang jiangtao62@huawei.com
driver inclusion
category: bugfix
bugzilla: NA
CVE: NA

Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
Change the default value of ctrl_loss_tmo to -1, which means reconnect forever.
Signed-off-by: Tao Jiang jiangtao62@huawei.com
Reviewed-by: Chao Leng lengchao@huawei.com
Reviewed-by: Jike Cheng chengjike.cheng@huawei.com
Signed-off-by: Lijie lijie34@huawei.com
Acked-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/nvme/host/fabrics.c | 2 +-
 drivers/nvme/host/fabrics.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 05dd46f98441..a509d90e520f 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -616,7 +616,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	char *options, *o, *p;
 	int token, ret = 0;
 	size_t nqnlen = 0;
-	int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
+	int ctrl_loss_tmo = NVMF_DEF_RECONNECT_FOREVER;
 	uuid_t hostid;
 
 	/* Set defaults */
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index aa2fdb2a2e8f..188ebbeec32c 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -23,6 +23,7 @@
 #define NVMF_DEF_RECONNECT_DELAY	10
 /* default to 600 seconds of reconnect attempts before giving up */
 #define NVMF_DEF_CTRL_LOSS_TMO		600
+#define NVMF_DEF_RECONNECT_FOREVER	-1
 
 /*
  * Define a host as seen by the target. We allocate one at boot, but also