From: Ruozhu Li <liruozhu@huawei.com>
mainline inclusion
from mainline-v5.15-rc2
commit 9817d763dbe15327b9b3ff4404fa6f27f927e744
category: bugfix
bugzilla: NA
CVE: NA
Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
We got a panic when the host received a rej cm event soon after a connect error cm event. When the host gets a connect error cm event, it destroys the qp immediately, but the cm_id is still valid at that point. If another cm event arrives here, it tries to access the qp which was already destroyed, and we get the kernel panic below:
[87816.777089] [20473] ib_cm:cm_rep_handler:2343: cm_rep_handler: Stale connection. local_comm_id -154357094, remote_comm_id -1133609861
[87816.777223] [20473] ib_cm:cm_init_qp_rtr_attr:4162: cm_init_qp_rtr_attr: local_id -1150387077, cm_id_priv->id.state: 13
[87816.777225] [20473] rdma_cm:cma_rep_recv:1871: RDMA CM: CONNECT_ERROR: failed to handle reply. status -22
[87816.777395] [20473] ib_cm:ib_send_cm_rej:2781: ib_send_cm_rej: local_id -1150387077, cm_id->state: 13
[87816.777398] [20473] nvme_rdma:nvme_rdma_cm_handler:1718: nvme nvme278: connect error (6): status -22 id 00000000c3809aff
[87816.801155] [20473] nvme_rdma:nvme_rdma_cm_handler:1742: nvme nvme278: CM error event 6
[87816.801160] [20473] rdma_cm:cma_ib_handler:1947: RDMA CM: REJECTED: consumer defined
[87816.801163] nvme nvme278: rdma connection establishment failed (-104)
[87816.801168] BUG: unable to handle kernel NULL pointer dereference at 0000000000000370
[87816.801201] RIP: 0010:_ib_modify_qp+0x6e/0x3a0 [ib_core]
[87816.801215] Call Trace:
[87816.801223]  cma_modify_qp_err+0x52/0x80 [rdma_cm]
[87816.801228]  ? __dynamic_pr_debug+0x8a/0xb0
[87816.801232]  cma_ib_handler+0x25a/0x2f0 [rdma_cm]
[87816.801235]  cm_process_work+0x60/0xe0 [ib_cm]
[87816.801238]  cm_work_handler+0x13b/0x1b97 [ib_cm]
[87816.801243]  ? __switch_to_asm+0x35/0x70
[87816.801244]  ? __switch_to_asm+0x41/0x70
[87816.801246]  ? __switch_to_asm+0x35/0x70
[87816.801248]  ? __switch_to_asm+0x41/0x70
[87816.801252]  ? __switch_to+0x8c/0x480
[87816.801254]  ? __switch_to_asm+0x41/0x70
[87816.801256]  ? __switch_to_asm+0x35/0x70
[87816.801259]  process_one_work+0x1a7/0x3b0
[87816.801263]  worker_thread+0x30/0x390
[87816.801266]  ? create_worker+0x1a0/0x1a0
[87816.801268]  kthread+0x112/0x130
[87816.801270]  ? kthread_flush_work_fn+0x10/0x10
[87816.801272]  ret_from_fork+0x35/0x40
-------------------------------------------------
We should always destroy the cm_id before destroying the qp, to avoid getting a cma event after the qp was destroyed, which may lead to a use after free. In the RDMA connection establishment error flow, don't destroy the qp in the cm event handler. Just report cm_error to the upper level; the qp will be destroyed in nvme_rdma_alloc_queue() after the cm id is destroyed.
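To make the ordering argument concrete, here is a minimal userspace analogue (plain pthreads, hypothetical names; not kernel code): the event source must be drained before the object its handler dereferences is freed, just as rdma_destroy_id() must complete before nvme_rdma_destroy_queue_ib() runs.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct queue {
	int qp_state;		/* stands in for the ib qp */
};

static struct queue *q;
static atomic_bool stop_flag;

/* Models the cm event handler: dereferences q until told to stop. */
static void *event_handler(void *arg)
{
	(void)arg;
	while (!atomic_load(&stop_flag)) {
		q->qp_state++;	/* use-after-free if q were freed first */
		usleep(1000);
	}
	return NULL;
}

int main(void)
{
	pthread_t handler;

	q = calloc(1, sizeof(*q));
	if (!q)
		return 1;
	if (pthread_create(&handler, NULL, event_handler, NULL))
		return 1;
	usleep(10000);

	/*
	 * Correct order, mirroring the patch: drain the handler first
	 * (pthread_join here plays the role of rdma_destroy_id(), which
	 * blocks until in-flight cm callbacks have returned), and only
	 * then free the object the handler dereferences (the
	 * nvme_rdma_destroy_queue_ib() step in the real code).
	 */
	atomic_store(&stop_flag, true);
	pthread_join(handler, NULL);
	free(q);

	printf("teardown completed without use-after-free\n");
	return 0;
}

Reversing the last two steps (free(q) before pthread_join()) reproduces the same class of bug the panic above shows: a handler racing against teardown of the object it uses.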
Signed-off-by: Ruozhu Li <liruozhu@huawei.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Conflicts:
	drivers/nvme/host/rdma.c
[lrz: adjust context]
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/nvme/host/rdma.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b8e0d637ddcfc..049edb9ed1858 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -575,8 +575,8 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
 	if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
 		return;
 
-	nvme_rdma_destroy_queue_ib(queue);
 	rdma_destroy_id(queue->cm_id);
+	nvme_rdma_destroy_queue_ib(queue);
 	mutex_destroy(&queue->queue_lock);
 }
 
@@ -1509,14 +1509,10 @@ static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
 	for (i = 0; i < queue->queue_size; i++) {
 		ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
 		if (ret)
-			goto out_destroy_queue_ib;
+			return ret;
 	}
 
 	return 0;
-
-out_destroy_queue_ib:
-	nvme_rdma_destroy_queue_ib(queue);
-	return ret;
 }
 
 static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
@@ -1608,14 +1604,10 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
 	if (ret) {
 		dev_err(ctrl->ctrl.device, "rdma_connect failed (%d).\n",
 			ret);
-		goto out_destroy_queue_ib;
+		return ret;
 	}
 
 	return 0;
-
-out_destroy_queue_ib:
-	nvme_rdma_destroy_queue_ib(queue);
-	return ret;
 }
 
 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
@@ -1646,8 +1638,6 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 	case RDMA_CM_EVENT_ROUTE_ERROR:
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 	case RDMA_CM_EVENT_UNREACHABLE:
-		nvme_rdma_destroy_queue_ib(queue);
-		/* fall through */
 	case RDMA_CM_EVENT_ADDR_ERROR:
 		dev_dbg(queue->ctrl->ctrl.device,
 			"CM error event %d\n", ev->event);
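For readability, the resulting teardown in nvme_rdma_free_queue() after this patch, reconstructed from the first hunk above; the comments are editorial and rely on rdma_destroy_id() blocking until in-flight cm callbacks have returned:

static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
{
	if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
		return;

	/*
	 * Destroy the cm_id first: once rdma_destroy_id() returns, no cm
	 * event handler can still be running against this queue, so no
	 * handler can dereference the qp we are about to destroy.
	 */
	rdma_destroy_id(queue->cm_id);
	/* Only now is it safe to tear down the qp and ib resources. */
	nvme_rdma_destroy_queue_ib(queue);
	mutex_destroy(&queue->queue_lock);
}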