From: Sagi Grimberg sagi@grimberg.me
mainline inclusion from mainline-v5.3-rc5 commit d94211b8bad3787e0655a67284105f57db728cb1 category: bugfix bugzilla: NA CVE: NA Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
When start_queue fails, we need to make sure to drain the queue cq before freeing the rdma resources because we might still race with the completion path. Have start_queue() error path safely stop the queue.
-- [30371.808111] nvme nvme1: Failed reconnect attempt 11 [30371.808113] nvme nvme1: Reconnecting in 10 seconds... [...] [30382.069315] nvme nvme1: creating 4 I/O queues. [30382.257058] nvme nvme1: Connect Invalid SQE Parameter, qid 4 [30382.257061] nvme nvme1: failed to connect queue: 4 ret=386 [30382.305001] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 [30382.305022] IP: qedr_poll_cq+0x8a3/0x1170 [qedr] [30382.305028] PGD 0 P4D 0 [30382.305037] Oops: 0000 [#1] SMP PTI [...] [30382.305153] Call Trace: [30382.305166] ? __switch_to_asm+0x34/0x70 [30382.305187] __ib_process_cq+0x56/0xd0 [ib_core] [30382.305201] ib_poll_handler+0x26/0x70 [ib_core] [30382.305213] irq_poll_softirq+0x88/0x110 [30382.305223] ? sort_range+0x20/0x20 [30382.305232] __do_softirq+0xde/0x2c6 [30382.305241] ? sort_range+0x20/0x20 [30382.305249] run_ksoftirqd+0x1c/0x60 [30382.305258] smpboot_thread_fn+0xef/0x160 [30382.305265] kthread+0x113/0x130 [30382.305273] ? kthread_create_worker_on_cpu+0x50/0x50 [30382.305281] ret_from_fork+0x35/0x40 --
Reported-by: Nicolas Morey-Chaisemartin NMoreyChaisemartin@suse.com Reviewed-by: Max Gurtovoy maxg@mellanox.com Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Reviewed-by: Jike Cheng chengjike.cheng@huawei.com Conflicts: drivers/nvme/host/rdma.c [lrz: get queue pointer to apply patch] Signed-off-by: Ruozhu Li liruozhu@huawei.com Signed-off-by: Lijie lijie34@huawei.com Reviewed-by: Tao Hou houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/rdma.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e83df4a4c42c..ae7155005c35 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -548,13 +548,18 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, return ret; }
+static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); +} + static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) { if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) return;
- rdma_disconnect(queue->cm_id); - ib_drain_qp(queue->qp); + __nvme_rdma_stop_queue(queue); }
static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) @@ -584,6 +589,7 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) { + struct nvme_rdma_queue *queue = &ctrl->queues[idx]; int ret;
if (idx) @@ -591,11 +597,13 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) else ret = nvmf_connect_admin_queue(&ctrl->ctrl);
- if (!ret) - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); - else + if (!ret) { + set_bit(NVME_RDMA_Q_LIVE, &queue->flags); + } else { + __nvme_rdma_stop_queue(queue); dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); + } return ret; }