[PATCH for-rc 0/3] RDMA/hns: Misc fixes
This patchset contains several fixes for hns. Junxian Huang (1): RDMA/hns: Fix memory leak of bonding resource Lianfa Weng (2): RDMA/hns: Fix warning in poll cq direct mode RDMA/hns: Fix log flood after cmd_mbox failure drivers/infiniband/hw/hns/hns_roce_cq.c | 6 +++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 18 +++++++++--------- drivers/infiniband/hw/hns/hns_roce_main.c | 6 ++++-- drivers/infiniband/hw/hns/hns_roce_mr.c | 6 +++--- drivers/infiniband/hw/hns/hns_roce_srq.c | 2 +- 5 files changed, 20 insertions(+), 18 deletions(-) -- 2.33.0
In a corner case of concurrent driver removal and driver reset, bonding resource is first released in hns_roce_hw_v2_exit() during driver removal, and then is allocated again in hns_roce_register_device() during driver reset. This leads to memory leak because the release timing has already passed. This may also lead to a kernel panic as below because of the leaked notifier callback: Call trace: 0xffffa20fccc04978 (P) raw_notifier_call_chain+0x20/0x38 call_netdevice_notifiers_info+0x60/0xb8 netdev_lower_state_changed+0x4c/0xb8 Bonding resource allocation and release should occur only during driver init and removal, so don't do the allocation during reset. Fixes: b37ad2e290fc ("RDMA/hns: Initialize bonding resources") Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com> --- drivers/infiniband/hw/hns/hns_roce_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index c17ff5347a01..a7308a3c586e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -795,6 +795,7 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = { static int hns_roce_register_device(struct hns_roce_dev *hr_dev) { + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_ib_iboe *iboe = NULL; struct device *dev = hr_dev->dev; struct ib_device *ib_dev = NULL; @@ -838,7 +839,8 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) dma_set_max_seg_size(dev, SZ_2G); - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND && + priv->handle->rinfo.reset_state != HNS_ROCE_STATE_RST_INIT) { ret = hns_roce_alloc_bond_grp(hr_dev); if (ret) { dev_err(dev, "failed to alloc bond_grp for bus %u, ret = %d\n", -- 2.33.0
From: Lianfa Weng <wenglianfa@huawei.com> CQs allocated by ib_alloc_cq() always have a comp_handler. Though in direct mode this handler is never expected to be called, it is still called when the driver is reset, triggering the following WARN_ONCE(): Call trace: ib_cq_completion_direct+0x38/0x60 hns_roce_cq_completion+0x54/0x90 [hns_roce_hw_v2] hns_roce_handle_device_err+0x1c8/0x340 [hns_roce_hw_v2] hns_roce_hw_v2_uninit_instance.constprop.0+0x34/0x70 [hns_roce_hw_v2] hns_roce_hw_v2_reset_notify+0xc4/0xe0 [hns_roce_hw_v2] hclge_notify_roce_client+0x60/0xbc [hclge] hclge_reset_rebuild+0x48/0x34c [hclge] hclge_reset_subtask+0xcc/0xec [hclge] hclge_reset_service_task+0x80/0x160 [hclge] hclge_service_task+0x50/0x80 [hclge] process_one_work+0x1cc/0x4d0 worker_thread+0x154/0x414 kthread+0x104/0x144 ret_from_fork+0x10/0x18 Fixes: f295e4cece5c ("RDMA/hns: Delete unnecessary callback functions for cq") Signed-off-by: Lianfa Weng <wenglianfa@huawei.com> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com> --- drivers/infiniband/hw/hns/hns_roce_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index a7308a3c586e..2b71c2b30bc8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -1114,7 +1114,7 @@ static void check_and_get_armed_cq(struct list_head *cq_list, struct ib_cq *cq) unsigned long flags; spin_lock_irqsave(&hr_cq->lock, flags); - if (cq->comp_handler) { + if (cq->comp_handler && hr_cq->ib_cq.poll_ctx != IB_POLL_DIRECT) { if (!hr_cq->is_armed) { hr_cq->is_armed = 1; list_add_tail(&hr_cq->node, cq_list); -- 2.33.0
From: Lianfa Weng <wenglianfa@huawei.com> hns_roce_cmd_mbox() is the command interface between driver and hardware. When hardware is abnormal, the unlimited error printings after hns_roce_cmd_mbox() failure will cause log flood and even system crash. Replace ibdev_err() and ibdev_warn() with their ratelimited versions in the error handling path after hns_roce_cmd_mbox() (and its wrappers hns_roce_create_hw_ctx/hns_roce_destroy_hw_ctx) fails. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Signed-off-by: Lianfa Weng <wenglianfa@huawei.com> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com> --- drivers/infiniband/hw/hns/hns_roce_cq.c | 6 +++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 18 +++++++++--------- drivers/infiniband/hw/hns/hns_roce_mr.c | 6 +++--- drivers/infiniband/hw/hns/hns_roce_srq.c | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 24de651f735e..1dd0efb5620d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -174,9 +174,9 @@ static int hns_roce_create_cqc(struct hns_roce_dev *hr_dev, ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_CQC, hr_cq->cqn); if (ret) - ibdev_err(ibdev, - "failed to send create cmd for CQ(0x%lx), ret = %d.\n", - hr_cq->cqn, ret); + ibdev_err_ratelimited(ibdev, + "failed to send create cmd for CQ(0x%lx), ret = %d.\n", + hr_cq->cqn, ret); hns_roce_free_cmd_mailbox(hr_dev, mailbox); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4afd7d6ae3ca..332a4816f2ca 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6193,9 +6193,9 @@ static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq, HNS_ROCE_CMD_MODIFY_SRQC, srq->srqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to 
handle cmd of modifying SRQ, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to handle cmd of modifying SRQ, ret = %d.\n", + ret); } out: @@ -6221,9 +6221,9 @@ static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_SRQC, srq->srqn); if (ret) { - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd of querying SRQ, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to process cmd of querying SRQ, ret = %d.\n", + ret); goto out; } @@ -6329,9 +6329,9 @@ static int hns_roce_v2_query_mpt(struct hns_roce_dev *hr_dev, u32 key, ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_MPT, key_to_hw_index(key)); if (ret) { - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when querying MPT, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to process cmd when querying MPT, ret = %d.\n", + ret); goto err_mailbox; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 896af1828a38..e8a9e7d8f267 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -173,7 +173,7 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_MPT, mtpt_idx & (hr_dev->caps.num_mtpts - 1)); if (ret) { - dev_err(dev, "failed to create mpt, ret = %d.\n", ret); + dev_err_ratelimited(dev, "failed to create mpt, ret = %d.\n", ret); goto err_page; } @@ -315,7 +315,7 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_MPT, mtpt_idx); if (ret) - ibdev_warn(ib_dev, "failed to destroy MPT, ret = %d.\n", ret); + ibdev_warn_ratelimited(ib_dev, "failed to destroy MPT, ret = %d.\n", ret); mr->enabled = 0; mr->iova = virt_addr; @@ -346,7 +346,7 @@ struct ib_mr *hns_roce_rereg_user_mr(struct 
ib_mr *ibmr, int flags, u64 start, ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_MPT, mtpt_idx); if (ret) { - ibdev_err(ib_dev, "failed to create MPT, ret = %d.\n", ret); + ibdev_err_ratelimited(ib_dev, "failed to create MPT, ret = %d.\n", ret); goto free_cmd_mbox; } diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 8644c3916367..00552a08f21a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -103,7 +103,7 @@ static int hns_roce_create_srqc(struct hns_roce_dev *hr_dev, ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_SRQ, srq->srqn); if (ret) - ibdev_err(ibdev, "failed to config SRQC, ret = %d.\n", ret); + ibdev_err_ratelimited(ibdev, "failed to config SRQC, ret = %d.\n", ret); err_mbox: hns_roce_free_cmd_mailbox(hr_dev, mailbox); -- 2.33.0
participants (1)
-
Junxian Huang