From: Max Gurtovoy maxg@mellanox.com
mainline inclusion from mainline-v5.2-rc1 commit 87fd125344d68adf7699ec7396aa9f905ce79a80 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
In the past, before commit f41725bb ("nvme-rdma: Use mr pool"), we needed a reference on the ib_device for as long as the tagset was alive, as the MRs in the request structures needed a valid ib_device. Now we allocate/deallocate the MR pool per QP and consume MRs on demand.
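A minimal sketch of that per-QP model, using the ib_mr_pool API that commit f41725bb introduced (function names and parameters here are illustrative, not the driver's):

#include <rdma/mr_pool.h>

/* Tie MR lifetime to the QP: allocate the pool when the QP is created... */
static int example_qp_mrs_init(struct ib_qp *qp, int nr_mrs, u32 max_sge)
{
	return ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
			       IB_MR_TYPE_MEM_REG, max_sge);
}

/* ...take MRs on demand in the I/O path, returning them on completion... */
static struct ib_mr *example_get_mr(struct ib_qp *qp)
{
	return ib_mr_pool_get(qp, &qp->rdma_mrs);
}

/* ...and release the pool when the QP goes away, so no MR outlives its
 * ib_device. */
static void example_qp_mrs_destroy(struct ib_qp *qp)
{
	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
}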
Also remove the nvme_rdma_free_tagset function and use blk_mq_free_tag_set instead, as it is no longer needed.
This commit also fixes a memory leak and a possible segmentation fault. When configuring the system with NIC teaming (aka bonding), we use one network interface to create an HA connection to the target side. In case one connection breaks down, the nvme-rdma driver gets a notification from the rdma-cm layer that the underlying address has changed and starts the error recovery process. During this process, we reconnect to the target via the second interface in the bond without destroying the tagset. This leaks the initial rdma device (ndev) and miscounts the reference count of the newly created rdma device (new ndev). On final destruction (or in another error flow), we get a warning dump from ib_dealloc_pd that there are still in-flight MRs related to that pd. This happens because the rdma device's reference count is wrong, causing access violations on its elements (some queues are not destroyed yet).
Signed-off-by: Max Gurtovoy maxg@mellanox.com Signed-off-by: Israel Rukshin israelr@mellanox.com Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Conflicts: drivers/nvme/host/rdma.c [lrz: adjust context] Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/rdma.c | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 077c67816665..0bfc1875575d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -666,15 +666,6 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) return ret; }
-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, - struct blk_mq_tag_set *set) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - - blk_mq_free_tag_set(set); - nvme_rdma_dev_put(ctrl->device); -} - static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, bool admin) { @@ -712,24 +703,9 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
ret = blk_mq_alloc_tag_set(set); if (ret) - goto out; - - /* - * We need a reference on the device as long as the tag_set is alive, - * as the MRs in the request structures need a valid ib_device. - */ - ret = nvme_rdma_dev_get(ctrl->device); - if (!ret) { - ret = -EINVAL; - goto out_free_tagset; - } + return ERR_PTR(ret);
return set; - -out_free_tagset: - blk_mq_free_tag_set(set); -out: - return ERR_PTR(ret); }
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, @@ -737,7 +713,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, { if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); - nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); + blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); } if (ctrl->async_event_sqe.data) { cancel_work_sync(&ctrl->ctrl.async_event_work); @@ -815,7 +791,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); + blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); out_free_async_qe: if (ctrl->async_event_sqe.data) { nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, @@ -832,7 +808,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, { if (remove) { blk_cleanup_queue(ctrl->ctrl.connect_q); - nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); + blk_mq_free_tag_set(ctrl->ctrl.tagset); } nvme_rdma_free_io_queues(ctrl); } @@ -873,7 +849,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); + blk_mq_free_tag_set(ctrl->ctrl.tagset); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret;
From: Max Gurtovoy maxg@mellanox.com
mainline inclusion from mainline-v5.2-rc4 commit 62f99b62e5e3b88d23b6ced4380199e8386965af category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
Commit 87fd125344d6 ("nvme-rdma: remove redundant reference between ib_device and tagset") caused a kernel panic when disconnecting from an inaccessible controller (disconnect during re-connection).
-- nvme nvme0: Removing ctrl: NQN "testnqn1" nvme_rdma: nvme_rdma_exit_request: hctx 0 queue_idx 1 BUG: unable to handle kernel paging request at 0000000080000228 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI ... Call Trace: blk_mq_exit_hctx+0x5c/0xf0 blk_mq_exit_queue+0xd4/0x100 blk_cleanup_queue+0x9a/0xc0 nvme_rdma_destroy_io_queues+0x52/0x60 [nvme_rdma] nvme_rdma_shutdown_ctrl+0x3e/0x80 [nvme_rdma] nvme_do_delete_ctrl+0x53/0x80 [nvme_core] nvme_sysfs_delete+0x45/0x60 [nvme_core] kernfs_fop_write+0x105/0x180 vfs_write+0xad/0x1a0 ksys_write+0x5a/0xd0 do_syscall_64+0x55/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fa215417154 --
The reason for this crash is accessing an already-freed ib_device when performing dma_unmap during exit_request commands. The root cause is that during re-connection all the queues are destroyed and re-created (and the ib_device, which is reference counted by the queues, is freed as well), but the tagset stays alive, and all the DMA mappings (that we perform in init_request) are kept in the request context. The original commit fixed a different bug, found during bonding (aka NIC teaming) tests in which some scenarios change the underlying ib_device, which caused a memory leak and a possible segmentation fault. This commit is a complementary fix that also removes the now-wrong DMA mappings saved in the request context, making the request SQE DMA mappings dynamic with the command lifetime (i.e. mapped in .queue_rq and unmapped in .complete). It also fixes the above crash of accessing a freed ib_device during destruction of the tagset.
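As a standalone illustration of the scheme the diff below implements, here is a minimal sketch, assuming a queue entry with .data and .dma fields as struct nvme_rdma_qe has: the SQE buffer is plain kernel memory for the tagset's lifetime, while its DMA mapping only spans .queue_rq to .complete.

static blk_status_t example_map_sqe(struct ib_device *ibdev,
				    struct nvme_rdma_qe *sqe)
{
	/* map at .queue_rq time, against whichever ib_device is current */
	sqe->dma = ib_dma_map_single(ibdev, sqe->data,
				     sizeof(struct nvme_command),
				     DMA_TO_DEVICE);
	if (unlikely(ib_dma_mapping_error(ibdev, sqe->dma)))
		return BLK_STS_RESOURCE;
	return BLK_STS_OK;
}

static void example_unmap_sqe(struct ib_device *ibdev,
			      struct nvme_rdma_qe *sqe)
{
	/* unmap at .complete time, before the request is reused */
	ib_dma_unmap_single(ibdev, sqe->dma, sizeof(struct nvme_command),
			    DMA_TO_DEVICE);
}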
Fixes: 87fd125344d6 ("nvme-rdma: remove redundant reference between ib_device and tagset") Reported-by: Jim Harris james.r.harris@intel.com Suggested-by: Sagi Grimberg sagi@grimberg.me Tested-by: Jim Harris james.r.harris@intel.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Max Gurtovoy maxg@mellanox.com Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Conflicts: drivers/nvme/host/rdma.c [lrz: delete extra blank lines] Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/rdma.c | 53 ++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 18 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 0bfc1875575d..e83df4a4c42c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -214,6 +214,11 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, if (!ring) return NULL;
+ /* + * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue + * lifetime. It's safe, since any chage in the underlying RDMA device + * will issue error recovery and queue re-creation. + */ for (i = 0; i < ib_queue_size; i++) { if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) goto out_free_ring; @@ -268,14 +273,8 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx) { - struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; - struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; - struct nvme_rdma_device *dev = queue->device; - - nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); + kfree(req->sqe.data); }
static int nvme_rdma_init_request(struct blk_mq_tag_set *set, @@ -286,15 +285,11 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; - struct nvme_rdma_device *dev = queue->device; - struct ib_device *ibdev = dev->dev; - int ret;
nvme_req(rq)->ctrl = &ctrl->ctrl; - ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - if (ret) - return ret; + req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL); + if (!req->sqe.data) + return -ENOMEM;
req->queue = queue;
@@ -736,6 +731,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->device = ctrl->queues[0].device;
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); + /* + * Bind the async event SQE DMA mapping to the admin queue lifetime. + * It's safe, since any chage in the underlying RDMA device will issue + * error recovery and queue re-creation. + */
error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); @@ -1702,12 +1702,20 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
dev = queue->device->dev; + + req->sqe.dma = ib_dma_map_single(dev, req->sqe.data, + sizeof(struct nvme_command), + DMA_TO_DEVICE); + err = ib_dma_mapping_error(dev, req->sqe.dma); + if (unlikely(err)) + return BLK_STS_RESOURCE; + ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE);
ret = nvme_setup_cmd(ns, rq, c); if (ret) - return ret; + goto unmap_qe;
blk_mq_start_request(rq);
@@ -1734,8 +1742,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; err: if (err == -ENOMEM || err == -EAGAIN) - return BLK_STS_RESOURCE; - return BLK_STS_IOERR; + ret = BLK_STS_RESOURCE; + else + ret = BLK_STS_IOERR; +unmap_qe: + ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); + return ret; }
static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) @@ -1762,8 +1775,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) static void nvme_rdma_complete_rq(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct ib_device *ibdev = queue->device->dev;
- nvme_rdma_unmap_data(req->queue, rq); + nvme_rdma_unmap_data(queue, rq); + ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); nvme_complete_rq(rq); }
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-v5.3-rc5 commit d94211b8bad3787e0655a67284105f57db728cb1 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
When start_queue fails, we need to make sure to drain the queue's cq before freeing the rdma resources, because we might still race with the completion path. Have the start_queue() error path safely stop the queue.
-- [30371.808111] nvme nvme1: Failed reconnect attempt 11 [30371.808113] nvme nvme1: Reconnecting in 10 seconds... [...] [30382.069315] nvme nvme1: creating 4 I/O queues. [30382.257058] nvme nvme1: Connect Invalid SQE Parameter, qid 4 [30382.257061] nvme nvme1: failed to connect queue: 4 ret=386 [30382.305001] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 [30382.305022] IP: qedr_poll_cq+0x8a3/0x1170 [qedr] [30382.305028] PGD 0 P4D 0 [30382.305037] Oops: 0000 [#1] SMP PTI [...] [30382.305153] Call Trace: [30382.305166] ? __switch_to_asm+0x34/0x70 [30382.305187] __ib_process_cq+0x56/0xd0 [ib_core] [30382.305201] ib_poll_handler+0x26/0x70 [ib_core] [30382.305213] irq_poll_softirq+0x88/0x110 [30382.305223] ? sort_range+0x20/0x20 [30382.305232] __do_softirq+0xde/0x2c6 [30382.305241] ? sort_range+0x20/0x20 [30382.305249] run_ksoftirqd+0x1c/0x60 [30382.305258] smpboot_thread_fn+0xef/0x160 [30382.305265] kthread+0x113/0x130 [30382.305273] ? kthread_create_worker_on_cpu+0x50/0x50 [30382.305281] ret_from_fork+0x35/0x40 --
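The key ordering the fix enforces, as a minimal standalone sketch (the helper name is illustrative; the driver factors this into __nvme_rdma_stop_queue):

static void example_fence_queue(struct rdma_cm_id *cm_id, struct ib_qp *qp)
{
	/* stop the remote side from generating new completions */
	rdma_disconnect(cm_id);
	/* block until all posted WRs have completed, so the completion
	 * path can no longer race with freeing the RDMA resources */
	ib_drain_qp(qp);
}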
Reported-by: Nicolas Morey-Chaisemartin NMoreyChaisemartin@suse.com Reviewed-by: Max Gurtovoy maxg@mellanox.com Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Conflicts: drivers/nvme/host/rdma.c [lrz: get queue pointer to apply patch] Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/rdma.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e83df4a4c42c..ae7155005c35 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -548,13 +548,18 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, return ret; }
+static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); +} + static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) { if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) return;
- rdma_disconnect(queue->cm_id); - ib_drain_qp(queue->qp); + __nvme_rdma_stop_queue(queue); }
static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) @@ -584,6 +589,7 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) { + struct nvme_rdma_queue *queue = &ctrl->queues[idx]; int ret;
if (idx) @@ -591,11 +597,13 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) else ret = nvmf_connect_admin_queue(&ctrl->ctrl);
- if (!ret) - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); - else + if (!ret) { + set_bit(NVME_RDMA_Q_LIVE, &queue->flags); + } else { + __nvme_rdma_stop_queue(queue); dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); + } return ret; }
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-v5.4-rc2 commit 67b483dd03c4cd9e90e4c3943132dce514ea4e88 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
If the connect times out, we may have already destroyed the queue in the timeout handler, so test if the queue is still allocated in the connect error handler.
Reported-by: Yi Zhang yi.zhang@redhat.com Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/rdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ae7155005c35..82ae3a2935d6 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -600,7 +600,8 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) if (!ret) { set_bit(NVME_RDMA_Q_LIVE, &queue->flags); } else { - __nvme_rdma_stop_queue(queue); + if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) + __nvme_rdma_stop_queue(queue); dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); }
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-v5.0-rc1 commit 9afc97c29b032af9a4112c2f4a02d5313b4dc71f category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
Devices that do not use managed affinity cannot export a vector affinity, as the consumer relies on having a static mapping it can map to upper-layer affinity (e.g. sw queues). If the driver allows the user to set the device IRQ affinity, then the affinitization of long-term existing entities is not relevant.
For example, nvme-rdma controllers' queue-IRQ affinitization is determined at init time, so if the IRQ affinity changes over time, we are no longer aligned.
Signed-off-by: Sagi Grimberg sagi@grimberg.me Acked-by: Leon Romanovsky leonro@mellanox.com Signed-off-by: Doug Ledford dledford@redhat.com Signed-off-by: Jason Gunthorpe jgg@mellanox.com Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Conflicts: drivers/infiniband/hw/mlx5/main.c include/linux/mlx5/driver.h [lrz: adjust context] Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/infiniband/hw/mlx5/main.c | 9 --------- include/linux/mlx5/driver.h | 6 ------ 2 files changed, 15 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index f41f3ff689c5..1f1f57a2753c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5328,14 +5328,6 @@ static void init_delay_drop(struct mlx5_ib_dev *dev) mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n"); }
-static const struct cpumask * -mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) -{ - struct mlx5_ib_dev *dev = to_mdev(ibdev); - - return mlx5_get_vector_affinity_hint(dev->mdev, comp_vector); -} - /* The mlx5_ib_multiport_mutex should be held when calling this function */ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, struct mlx5_ib_multiport_info *mpi) @@ -5848,7 +5840,6 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_dev_fw_str = get_dev_fw_str; - dev->ib_dev.get_vector_affinity = mlx5_ib_get_vector_affinity; if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) dev->ib_dev.alloc_rdma_netdev = mlx5_ib_alloc_rdma_netdev;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index dc89a964c1f3..530d7177accd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1318,10 +1318,4 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, };
-static inline const struct cpumask * -mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector) -{ - return dev->priv.irq_info[vector + MLX5_EQ_VEC_COMP_BASE].mask; -} - #endif /* MLX5_DRIVER_H */
From: Max Gurtovoy maxg@mellanox.com
mainline inclusion from mainline-v5.4-rc7 commit 9ad9e8d6ca29c1446d81c6518ae634a2141dfd22 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
If there are controllers that are not associated with any RDMA device (e.g. during unsuccessful reconnection) and the user unloads the module, these controllers will not be freed and will access already-freed memory. The same logic appears in other fabrics drivers as well.
Fixes: 87fd125344d6 ("nvme-rdma: remove redundant reference between ib_device and tagset") Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Max Gurtovoy maxg@mellanox.com Signed-off-by: Keith Busch kbusch@kernel.org Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/rdma.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 82ae3a2935d6..c999432794ee 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2108,8 +2108,16 @@ static int __init nvme_rdma_init_module(void)
static void __exit nvme_rdma_cleanup_module(void) { + struct nvme_rdma_ctrl *ctrl; + nvmf_unregister_transport(&nvme_rdma_transport); ib_unregister_client(&nvme_rdma_ib_client); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) + nvme_delete_ctrl(&ctrl->ctrl); + mutex_unlock(&nvme_rdma_ctrl_mutex); + flush_workqueue(nvme_delete_wq); }
module_init(nvme_rdma_init_module);
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.4-rc1 commit aa306ab703e9452b1e25cc8e8f04b8df523d0bb8 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
NVMe needs this function to decide whether a request to be aborted has already been completed in the normal IO path.
So introduce it.
Cc: Max Gurtovoy maxg@mellanox.com Cc: Sagi Grimberg sagi@grimberg.me Cc: Keith Busch keith.busch@intel.com Cc: Christoph Hellwig hch@lst.de Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/blk-mq.c | 6 ++++++ include/linux/blk-mq.h | 1 + 2 files changed, 7 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 7f5360949509..7096f23b18da 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -636,6 +636,12 @@ int blk_mq_request_started(struct request *rq) } EXPORT_SYMBOL_GPL(blk_mq_request_started);
+int blk_mq_request_completed(struct request *rq) +{ + return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; +} +EXPORT_SYMBOL_GPL(blk_mq_request_completed); + void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4878d8ed4914..b414cad68024 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -288,6 +288,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
int blk_mq_request_started(struct request *rq); +int blk_mq_request_completed(struct request *rq); void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error);
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.4-rc1 commit f9934a80f91dba8c7029ba7601459e41ea7770aa category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
blk-mq may schedule to call a queue's complete function on a remote CPU via IPI, but doesn't provide any way to synchronize against the request's complete fn. The current queue freeze interface can't provide the synchronization because aborted requests stay in the blk-mq queues during EH.
In some drivers' EH (such as NVMe), a hardware queue's resources may be freed and re-allocated. If a completed request's complete fn finally runs after the hardware queue's resources are released, a kernel crash is triggered.
Prepare for fixing this kind of issue by introducing blk_mq_tagset_wait_completed_request().
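A minimal sketch of the intended call sequence in a driver's error handler, assuming all queues are already shut down (the nvme-rdma patch later in this series applies exactly this pattern):

static void example_teardown(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl)
{
	/* all hardware queues and their interrupts are already shut down */
	blk_mq_tagset_busy_iter(set, nvme_cancel_request, ctrl);
	/* don't tear down queue resources until every completed request's
	 * ->complete has actually run (it may still be pending via IPI) */
	blk_mq_tagset_wait_completed_request(set);
}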
Cc: Max Gurtovoy maxg@mellanox.com Cc: Sagi Grimberg sagi@grimberg.me Cc: Keith Busch keith.busch@intel.com Cc: Christoph Hellwig hch@lst.de Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Conflicts: block/blk-mq-tag.c include/linux/blk-mq.h [lrz: remain return type to void as 4.19] Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/blk-mq-tag.c | 31 +++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 1 + 2 files changed, 32 insertions(+)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 323bbca53a17..ce7f7188625e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -9,6 +9,7 @@ #include <linux/module.h>
#include <linux/blk-mq.h> +#include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" @@ -328,6 +329,36 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
+static void blk_mq_tagset_count_completed_rqs(struct request *rq, + void *data, bool reserved) +{ + unsigned *count = data; + + if (blk_mq_request_completed(rq)) + (*count)++; +} + +/** + * blk_mq_tagset_wait_completed_request - wait until all completed req's + * complete funtion is run + * @tagset: Tag set to drain completed request + * + * Note: This function has to be run after all IO queues are shutdown + */ +void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) +{ + while (true) { + unsigned count = 0; + + blk_mq_tagset_busy_iter(tagset, + blk_mq_tagset_count_completed_rqs, &count); + if (!count) + break; + msleep(5); + } +} +EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); + static void __blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv, bool inflight) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b414cad68024..d26edab21d5c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -316,6 +316,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); +void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q);
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.4-rc1 commit 622b8b6893ff3096e130250c1298adf57a0cab03 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
When aborting in-flight requests to recover a controller, we have to make sure that the queue's complete function is called on completed requests before moving on. Otherwise, for example, the warning WARN_ON_ONCE(qp->mrs_used > 0) in ib_destroy_qp_user() may be triggered on nvme-rdma.
Fix this issue by using blk_mq_tagset_wait_completed_request.
Cc: Max Gurtovoy maxg@mellanox.com Cc: Sagi Grimberg sagi@grimberg.me Cc: Keith Busch keith.busch@intel.com Cc: Christoph Hellwig hch@lst.de Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Conflicts: drivers/nvme/host/rdma.c [lrz: adjust context] Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/rdma.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index c999432794ee..f0cc293e579a 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -870,9 +870,11 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, mutex_lock(&ctrl->teardown_lock); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); - if (ctrl->ctrl.admin_tagset) + if (ctrl->ctrl.admin_tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); + } blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); mutex_unlock(&ctrl->teardown_lock); @@ -885,9 +887,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); - if (ctrl->ctrl.tagset) + if (ctrl->ctrl.tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.tagset, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset); + } if (remove) nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove);
From: Keith Busch keith.busch@intel.com
mainline inclusion from mainline-v5.0-rc1 commit 49cd84b6f8b677ef45731ed56ddb802cdbb94c9e category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
A controller may have an internal state that is not able to successfully process commands for a short duration. In such states, an immediate command requeue is expected to fail. The driver may exceed its max retry count, which permanently ends the command in failure when the same command would succeed after waiting for the controller to be ready.
NVMe ratified TP 4033 provides a delay hint in the completion status code for failed commands. Implement the retry delay based on the command completion status and the controller's requested delay.
Note that requeued commands are handled per request_queue, not per individual request. If multiple commands fail, the controller should consistently report the desired delay time for retryable commands in all CQEs, otherwise the requeue list may be kicked too soon.
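A worked restatement of the arithmetic the diff below adds (values are illustrative): CRD occupies bits 12:11 of the completion status, and the crdt fields from Identify Controller are in units of 100 milliseconds.

static unsigned long example_crd_delay(struct nvme_ctrl *ctrl, u16 status)
{
	/* NVME_SC_CRD is 0x1800, so crd ends up in the range 0..3 */
	u16 crd = (status & NVME_SC_CRD) >> 11;

	if (!crd)
		return 0;	/* no delay hint: requeue immediately */
	/* e.g. crd == 1 with crdt1 == 5 yields a 500 ms requeue delay */
	return ctrl->crdt[crd - 1] * 100;
}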
Signed-off-by: Keith Busch keith.busch@intel.com Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 47 ++++++++++++++++++++++++++++++++++++++-- drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 17 ++++++++++++++- 3 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e77e1be508d5..862f7b252339 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -254,6 +254,22 @@ static inline bool nvme_req_needs_retry(struct request *req) return true; }
+static void nvme_retry_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + unsigned long delay = 0; + u16 crd; + + /* The mask and shift result must be <= 3 */ + crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; + if (ns && crd) + delay = ns->ctrl->crdt[crd - 1] * 100; + + nvme_req(req)->retries++; + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, delay); +} + void nvme_complete_rq(struct request *req) { blk_status_t status = nvme_error_status(req); @@ -265,8 +281,7 @@ void nvme_complete_rq(struct request *req) return;
if (!blk_queue_dying(req->q)) { - nvme_req(req)->retries++; - blk_mq_requeue_request(req, true); + nvme_retry_req(req); return; } } @@ -1956,6 +1971,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) return ret; }
+static int nvme_configure_acre(struct nvme_ctrl *ctrl) +{ + struct nvme_feat_host_behavior *host; + int ret; + + /* Don't bother enabling the feature if retry delay is not reported */ + if (!ctrl->crdt[0]) + return 0; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return 0; + + host->acre = NVME_ENABLE_ACRE; + ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + kfree(host); + return ret; +} + static int nvme_configure_apst(struct nvme_ctrl *ctrl) { /* @@ -2486,6 +2521,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; }
+ ctrl->crdt[0] = le16_to_cpu(id->crdt1); + ctrl->crdt[1] = le16_to_cpu(id->crdt2); + ctrl->crdt[2] = le16_to_cpu(id->crdt3); + ctrl->oacs = le16_to_cpu(id->oacs); ctrl->oncs = le16_to_cpup(&id->oncs); ctrl->oaes = le32_to_cpu(id->oaes); @@ -2585,6 +2624,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret < 0) return ret;
+ ret = nvme_configure_acre(ctrl); + if (ret < 0) + return ret; + ctrl->identified = true;
return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9c2e7a151e40..5146aa961ab2 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -180,6 +180,7 @@ struct nvme_ctrl { u32 page_size; u32 max_hw_sectors; u32 max_segments; + u16 crdt[3]; u16 oncs; u16 oacs; u16 nssa; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 818dbe9331be..60b492116e1b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -214,7 +214,11 @@ struct nvme_id_ctrl { __le32 rtd3e; __le32 oaes; __le32 ctratt; - __u8 rsvd100[156]; + __u8 rsvd100[28]; + __le16 crdt1; + __le16 crdt2; + __le16 crdt3; + __u8 rsvd134[122]; __le16 oacs; __u8 acl; __u8 aerl; @@ -738,6 +742,15 @@ enum { NVME_HOST_MEM_RETURN = (1 << 1), };
+struct nvme_feat_host_behavior { + __u8 acre; + __u8 resv1[511]; +}; + +enum { + NVME_ENABLE_ACRE = 1, +}; + /* Admin commands */
enum nvme_admin_opcode { @@ -792,6 +805,7 @@ enum { NVME_FEAT_RRL = 0x12, NVME_FEAT_PLM_CONFIG = 0x13, NVME_FEAT_PLM_WINDOW = 0x14, + NVME_FEAT_HOST_BEHAVIOR = 0x16, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, @@ -1243,6 +1257,7 @@ enum { NVME_SC_ANA_TRANSITION = 0x303, NVME_SC_HOST_PATH_ERROR = 0x370,
+ NVME_SC_CRD = 0x1800, NVME_SC_DNR = 0x4000, };
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-v5.1-rc1 commit a63b83700ba89c300f705167d06bf122f3666287 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
In case nvme_alloc_ns fails after we initialize ns_head but before we add the ns to the controller's namespaces list, we need to explicitly put the ns_head reference, because when we tear down the controller we won't find it, eventually causing us to leak a dangling subsystem.
Signed-off-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 862f7b252339..c9b497ab31d4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3275,6 +3275,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); mutex_unlock(&ctrl->subsys->lock); + nvme_put_ns_head(ns->head); out_free_id: kfree(id); out_free_queue:
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.4-rc1 commit 78ca40724713bd422873cb4ebee86f9f499650f7 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
Before aborting in-flight requests, all IO queues and their interrupts have been shut down. However, a request's completion function may not have finished yet, because it can be scheduled to run via IPI.
So don't abort a request if it is marked as completed; otherwise we may abort a normally completed request.
Cc: Max Gurtovoy maxg@mellanox.com Cc: Sagi Grimberg sagi@grimberg.me Cc: Keith Busch keith.busch@intel.com Cc: Christoph Hellwig hch@lst.de Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk Conflicts: nvme_cancel_request function does not return a value. Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c9b497ab31d4..449840fdda83 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -294,6 +294,10 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved) dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag);
+ /* don't abort one completed request */ + if (blk_mq_request_completed(req)) + return; + nvme_req(req)->status = NVME_SC_ABORT_REQ; blk_mq_complete_request(req);
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-v5.4-rc1 commit 2f9c173647753b81eed7c198abf7622ab22dc49d category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
No need for the full blown request structure.
Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 449840fdda83..05219b8c64e1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -209,9 +209,9 @@ static inline bool nvme_ns_has_pi(struct nvme_ns *ns) return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); }
-static blk_status_t nvme_error_status(struct request *req) +static blk_status_t nvme_error_status(u16 status) { - switch (nvme_req(req)->status & 0x7ff) { + switch (status & 0x7ff) { case NVME_SC_SUCCESS: return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: @@ -272,7 +272,7 @@ static void nvme_retry_req(struct request *req)
void nvme_complete_rq(struct request *req) { - blk_status_t status = nvme_error_status(req); + blk_status_t status = nvme_error_status(nvme_req(req)->status);
trace_nvme_complete_rq(req);
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-v5.4-rc1 commit 331813f687ed41347b2b7dc784d81ccdbf6f9157 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
Right now callers of nvme_identify_ns only know that it failed, but don't know why. Make nvme_identify_ns propagate the error back. Because nvme_submit_sync_cmd may return a positive status code, we make nvme_identify_ns receive the id by reference and return that status up the call chain, making sure not to leak positive NVMe status codes to the upper layers.
Reviewed-by: Minwoo Im minwoo.im.dev@gmail.com Reviewed-by: Hannes Reinecke hare@suse.com Reviewed-by: James Smart james.smart@broadcom.com Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 05219b8c64e1..2f3765ad6cd7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1064,10 +1064,9 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n NVME_IDENTIFY_DATA_SIZE); }
-static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, - unsigned nsid) +static int nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns **id) { - struct nvme_id_ns *id; struct nvme_command c = { }; int error;
@@ -1076,18 +1075,17 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, c.identify.nsid = cpu_to_le32(nsid); c.identify.cns = NVME_ID_CNS_NS;
- id = kmalloc(sizeof(*id), GFP_KERNEL); - if (!id) - return NULL; + *id = kmalloc(sizeof(**id), GFP_KERNEL); + if (!*id) + return -ENOMEM;
- error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed\n"); - kfree(id); - return NULL; + kfree(*id); }
- return id; + return error; }
static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, @@ -1641,13 +1639,13 @@ static int nvme_revalidate_disk(struct gendisk *disk) return -ENODEV; }
- id = nvme_identify_ns(ctrl, ns->head->ns_id); - if (!id) - return -ENODEV; + ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id); + if (ret) + goto out;
if (id->ncap == 0) { ret = -ENODEV; - goto out; + goto free_id; }
__nvme_revalidate_disk(disk, id); @@ -1658,8 +1656,11 @@ static int nvme_revalidate_disk(struct gendisk *disk) ret = -ENODEV; }
-out: +free_id: kfree(id); +out: + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; }
@@ -3204,7 +3205,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; + int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT, ret;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) @@ -3223,8 +3224,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue);
- id = nvme_identify_ns(ctrl, nsid); - if (!id) + ret = nvme_identify_ns(ctrl, nsid, &id); + if (ret) goto out_free_queue;
if (id->ncap == 0)
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-5.4-rc1 commit 93da40239b1069ef96bbfe7c8d08edb347e8107a category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
AENs in general are not related to the presence of I/O queues, so enable them regardless. Note that the only exception is that a discovery controller will not support any of the requested AENs, and nvme_enable_aen will respect that and return, so it is still safe to enable them regardless.
Note that it is safe to enable AENs even before the initial namespace scanning, as we have the scan operation in a workqueue context.
Reviewed-by: Minwoo Im minwoo.im.dev@gmail.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2f3765ad6cd7..ce935f300628 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1151,6 +1151,8 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) if (status) dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", supported_aens); + + queue_work(nvme_wq, &ctrl->async_event_work); }
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) @@ -3643,10 +3645,10 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->kato) nvme_start_keep_alive(ctrl);
+ nvme_enable_aen(ctrl); + if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_enable_aen(ctrl); - queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } ctrl->created = true;
From: Max Gurtovoy maxg@mellanox.com
mainline inclusion from mainline-v5.5-rc1 commit 2dc3947b53f573e8a75ea9cbec5588df88ca502e category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
Fix the status code of canceled requests initiated by the host according to TP4028 (Status Code 0x371): "Command Aborted By host: The command was aborted as a result of host action (e.g., the host disconnected the Fabric connection)."
Also in a multipath environment, unless otherwise specified, errors of this type (path related) should be retried using a different path, if one is available.
Signed-off-by: Max Gurtovoy maxg@mellanox.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Keith Busch kbusch@kernel.org Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 1 + include/linux/nvme.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce935f300628..78f0a02fc822 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -298,7 +298,7 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved) if (blk_mq_request_completed(req)) return;
- nvme_req(req)->status = NVME_SC_ABORT_REQ; + nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; blk_mq_complete_request(req);
} diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index e71075338ff5..5cb30dfc0047 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -99,6 +99,7 @@ bool nvme_failover_req(struct request *req) } break; case NVME_SC_HOST_PATH_ERROR: + case NVME_SC_HOST_ABORTED_CMD: /* * Temporary transport disruption in talking to the controller. * Try to send on a new path. diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 60b492116e1b..412eb5928adc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1256,6 +1256,7 @@ enum { NVME_SC_ANA_INACCESSIBLE = 0x302, NVME_SC_ANA_TRANSITION = 0x303, NVME_SC_HOST_PATH_ERROR = 0x370, + NVME_SC_HOST_ABORTED_CMD = 0x371,
NVME_SC_CRD = 0x1800, NVME_SC_DNR = 0x4000,
From: "Masahiro Yamada (KIOXIA)" masahiro31.yamada@kioxia.com
mainline inclusion from mainline-v5.7-rc1 commit c225b610311bc5695d952cd3590136f26199a227 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
Currently a 32-bit application gets ENOTTY when it calls compat_ioctl with NVME_IOCTL_SUBMIT_IO in a 64-bit kernel.
The cause is that the results of sizeof(struct nvme_user_io), which is used to define NVME_IOCTL_SUBMIT_IO, are not the same between 32-bit and 64-bit compilers.
* 32 bit: the result of sizeof(struct nvme_user_io) is 44.
* 64 bit: the result of sizeof(struct nvme_user_io) is 48.
The 64-bit compiler seems to add 32 bits of padding to round the size up to a multiple of 8 bytes.
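A small userspace check of that padding difference (assuming the UAPI header is installed; on x86 it prints 48 when built with -m64 and 44 with -m32):

#include <stdio.h>
#include <linux/nvme_ioctl.h>	/* struct nvme_user_io */

int main(void)
{
	/* the last __u16 member ends at offset 44; a 64-bit build aligns
	 * the struct to 8 bytes because of its __u64 members, padding the
	 * size to 48, while a 32-bit x86 build keeps it at 44 */
	printf("sizeof(struct nvme_user_io) = %zu\n",
	       sizeof(struct nvme_user_io));
	return 0;
}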
This patch adds a compat_ioctl handler. The handler replaces NVME_IOCTL_SUBMIT_IO32 with NVME_IOCTL_SUBMIT_IO in case a 32-bit application calls compat_ioctl for submit in a 64-bit kernel. Then it calls nvme_ioctl as usual.
Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Masahiro Yamada (KIOXIA) masahiro31.yamada@kioxia.com Signed-off-by: Keith Busch kbusch@kernel.org Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 45 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 78f0a02fc822..b77b6c9259d8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1418,6 +1418,47 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return ret; }
+#ifdef CONFIG_COMPAT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); + +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) + +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + /* + * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO + * between 32 bit programs and 64 bit kernel. + * The cause is that the results of sizeof(struct nvme_user_io), + * which is used to define NVME_IOCTL_SUBMIT_IO, + * are not same between 32 bit compiler and 64 bit compiler. + * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling + * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. + * Other IOCTL numbers are same between 32 bit and 64 bit. + * So there is nothing to do regarding to other IOCTL numbers. + */ + if (cmd == NVME_IOCTL_SUBMIT_IO32) + return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); + + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif /* CONFIG_COMPAT */ + static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; @@ -1791,7 +1832,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, @@ -1819,7 +1860,7 @@ const struct block_device_operations nvme_ns_head_ops = { .open = nvme_ns_head_open, .release = nvme_ns_head_release, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, .pr_ops = &nvme_pr_ops, };
From: Max Gurtovoy maxg@mellanox.com
mainline inclusion from mainline-v5.7-rc1 commit f41cfd5d0a04b12a5dae753cd01163661432ebbb category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
ida instances allocate some internal memory in addition to the base 'struct ida'. Use ida_destroy() to release that memory at module_exit().
Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Max Gurtovoy maxg@mellanox.com Signed-off-by: Keith Busch kbusch@kernel.org Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b77b6c9259d8..b8446637b3c2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3957,6 +3957,7 @@ void nvme_core_exit(void) destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); + ida_destroy(&nvme_instance_ida); }
MODULE_LICENSE("GPL");
From: Prabhath Sajeepa psajeepa@purestorage.com
mainline inclusion from mainline-v5.5-rc1 commit 64fab7290dc3561729bbc1e35895a517eb2e549e category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
--------------------------------
Check the validity of the offset into the ANA log buffer before accessing nvme_ana_group_desc. This check ensures that the size of the ANA log buffer is >= offset + sizeof(nvme_ana_group_desc).
Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Prabhath Sajeepa psajeepa@purestorage.com Signed-off-by: Keith Busch kbusch@kernel.org Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/multipath.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5cb30dfc0047..bf22bbb56bda 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -338,8 +338,14 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { struct nvme_ana_group_desc *desc = base + offset; - u32 nr_nsids = le32_to_cpu(desc->nnsids); - size_t nsid_buf_size = nr_nsids * sizeof(__le32); + u32 nr_nsids; + size_t nsid_buf_size; + + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) + return -EINVAL; + + nr_nsids = le32_to_cpu(desc->nnsids); + nsid_buf_size = nr_nsids * sizeof(__le32);
if (WARN_ON_ONCE(desc->grpid == 0)) return -EINVAL; @@ -359,8 +365,6 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, return error;
offset += nsid_buf_size; - if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) - return -EINVAL; }
return 0;
From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-v5.9-rc4 commit 0475a8dcbcee92a5d22e40c9c6353829fc6294b8 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
When a request times out in a LIVE state, we simply trigger error recovery and let the error recovery handle the request cancellation. However, when a request times out in a non-LIVE state, we make sure to complete it immediately, as it might block controller setup or teardown and prevent forward progress.
However, tearing down the entire set of I/O and admin queues to do so causes a freeze/unfreeze imbalance (q->mq_freeze_depth) and is really overkill for what we actually need, which is just to fence a controller teardown that may be running, stop the queue, and cancel the request if it is not already completed.
Now that we have the controller teardown_lock, we can safely serialize request cancellation. This addresses a hang caused by calling extra queue freeze on controller namespaces, causing unfreeze to not complete correctly.
Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: James Smart james.smart@broadcom.com Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Conflicts: drivers/nvme/host/rdma.c [lrz: adjust context] Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/rdma.c | 41 ++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f0cc293e579a..3007d3aa08c1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1052,6 +1052,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return;
+ dev_warn(ctrl->ctrl.device, "starting error recovery\n"); queue_work(nvme_wq, &ctrl->err_work); }
@@ -1667,6 +1668,22 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, return 0; }
+static void nvme_rdma_complete_timed_out(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + /* fence other contexts that may complete the command */ + mutex_lock(&ctrl->teardown_lock); + nvme_rdma_stop_queue(queue); + if (!blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } + mutex_unlock(&ctrl->teardown_lock); +} + static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq, bool reserved) { @@ -1679,19 +1696,27 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
if (ctrl->ctrl.state != NVME_CTRL_LIVE) { /* - * Teardown immediately if controller times out while starting - * or we are already started error recovery. all outstanding - * requests are completed on shutdown, so we return BLK_EH_DONE. + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. */ - flush_work(&ctrl->err_work); - nvme_rdma_teardown_io_queues(ctrl, false); - nvme_rdma_teardown_admin_queue(ctrl, false); + nvme_rdma_complete_timed_out(rq); return BLK_EH_DONE; }
- dev_warn(ctrl->ctrl.device, "starting error recovery\n"); + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ nvme_rdma_error_recovery(ctrl); - return BLK_EH_RESET_TIMER; }