From: Juan Zhou <zhoujuan51@h-partners.com>
Some bugfixes for hns RoCE.
Chengchang Tang (6):
  RDMA/hns: Fix iommu_map_sg() failure when MR is bigger than 4G
  RDMA/hns: Use complete parentheses in macros
  RDMA/hns: Remove extra blank line in get_sge_num_from_max_inl_data()
  RDMA/hns: Fix missing resetting notify
  RDMA/hns: Fix missing capacities in query_device()
  RDMA/hns: Fix CPU stall caused by printing during reset
wenglianfa (3):
  RDMA/hns: Fix simultaneous reset and resource deregistration
  RDMA/hns: Fix the overflow risk of hem_list_calc_ba_range()
  RDMA/hns: Fix long wait for cmd events during reset
 drivers/infiniband/core/ib_core_uverbs.c    | 85 ++++++++++++++++++
 drivers/infiniband/core/rdma_core.h         |  1 -
 drivers/infiniband/core/uverbs_main.c       | 64 --------------
 drivers/infiniband/hw/hns/hns_roce_cq.c     | 33 +++++--
 drivers/infiniband/hw/hns/hns_roce_db.c     | 31 +++++--
 drivers/infiniband/hw/hns/hns_roce_device.h | 38 +++++++-
 drivers/infiniband/hw/hns/hns_roce_hem.c    | 17 ++--
 drivers/infiniband/hw/hns/hns_roce_hem.h    | 12 +--
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 95 ++++++++++++--------
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  2 +-
 drivers/infiniband/hw/hns/hns_roce_main.c   | 16 +++-
 drivers/infiniband/hw/hns/hns_roce_mr.c     | 96 ++++++++++++++++++++-
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 33 +++++--
 drivers/infiniband/hw/hns/hns_roce_srq.c    | 47 ++++++++--
 include/rdma/ib_verbs.h                     |  2 +
 15 files changed, 423 insertions(+), 149 deletions(-)
-- 
2.30.0
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP
----------------------------------------------------------------------
The maximum length of an sgl entry is 4G, so if an MR is bigger than 4G
there is a chance that an sg entry length overflows, which leads to an
iommu map failure.
Since the largest RoCE packet is 2G, no single DMA operation exceeds 2G.
Adjust the DMA max seg size from 4G to 2G to ensure that the sgl entry
length can never overflow and wrap around.
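To illustrate the wrap (illustration only, not driver code): scatterlist
length fields are 32-bit unsigned ints, so a segment coalesced up to
exactly 4G truncates to 0. A minimal userspace sketch:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          /* A segment coalesced to exactly 4G by the IOMMU layer... */
          uint64_t seg_len = 1ULL << 32;
          /* ...is truncated to 0 when stored in a 32-bit length field. */
          uint32_t sg_len = (uint32_t)seg_len;

          printf("4G segment stored in u32: %u\n", sg_len); /* prints 0 */
          return 0;
  }

Capping the max seg size at 2G keeps every coalesced length below 2^32.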
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
(cherry picked from commit 582d9481542e6f9faada8b374432f59b5978ce69)
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 9dc50ec62..340ebb2e4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -1016,7 +1016,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 		if (ret)
 			return ret;
 	}
-	dma_set_max_seg_size(dev, UINT_MAX);
+	dma_set_max_seg_size(dev, SZ_2G);
 
 	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) &&
 	    (hr_dev->hw->bond_is_active(hr_dev)))
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP
----------------------------------------------------------------------
Use complete parentheses to ensure that macro expansion does not produce unexpected results.
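For example (hypothetical caller, not from this series), an
unparenthesized parameter is re-parsed by operator precedence once
expanded:

  /* With the old definition
   *   #define check_whether_bt_num_3(type, hop_num) \
   *           (type < HEM_TYPE_MTT && hop_num == 2)
   * a conditional-expression argument changes the meaning:
   */
  check_whether_bt_num_3(type, flag ? 1 : 2);
  /* old: (type < HEM_TYPE_MTT && flag ? 1 : 2 == 2)
   *      parsed as ((type < HEM_TYPE_MTT) && flag) ? 1 : (2 == 2)
   * new: ((type) < HEM_TYPE_MTT && (flag ? 1 : 2) == 2)
   */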
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_hem.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h
index 6fb51db96..9c415b254 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.h
@@ -57,16 +57,16 @@ enum {
 };
 
 #define check_whether_bt_num_3(type, hop_num) \
-	(type < HEM_TYPE_MTT && hop_num == 2)
+	((type) < HEM_TYPE_MTT && (hop_num) == 2)
 
 #define check_whether_bt_num_2(type, hop_num) \
-	((type < HEM_TYPE_MTT && hop_num == 1) || \
-	 (type >= HEM_TYPE_MTT && hop_num == 2))
+	(((type) < HEM_TYPE_MTT && (hop_num) == 1) || \
+	 ((type) >= HEM_TYPE_MTT && (hop_num) == 2))
 
 #define check_whether_bt_num_1(type, hop_num) \
-	((type < HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0) || \
-	 (type >= HEM_TYPE_MTT && hop_num == 1) || \
-	 (type >= HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0))
+	(((type) < HEM_TYPE_MTT && (hop_num) == HNS_ROCE_HOP_NUM_0) || \
+	 ((type) >= HEM_TYPE_MTT && (hop_num) == 1) || \
+	 ((type) >= HEM_TYPE_MTT && (hop_num) == HNS_ROCE_HOP_NUM_0))
 
 struct hns_roce_hem {
 	void *buf;
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP
----------------------------------------------------------------------
Remove the extra blank line in get_sge_num_from_max_inl_data() by moving
the assignment right before the check that uses it.
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index cc92a54d0..55d3c17ac 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -531,13 +531,12 @@ static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi,
 {
 	unsigned int inline_sge;
 
-	inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE;
-
 	/*
 	 * if max_inline_data less than
 	 * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE,
 	 * In addition to ud's mode, no need to extend sge.
 	 */
+	inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE;
 	if (!is_ud_or_gsi && inline_sge <= HNS_ROCE_SGE_IN_WQE)
 		inline_sge = 0;
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP
----------------------------------------------------------------------
A hardware reset is required to stop traffic injection. Currently, user
space learns of the reset through a notification from the kernel-space
driver and stops ringing the doorbell. This notification is implemented
through shared memory. In concurrent scenarios, the shared-memory
mechanism needs barriers to be reliable, but barriers severely affect
performance.
This patch uses a new scheme to solve the problem. Before resetting, the
kernel-mode driver zaps all the shared memory between the user-mode and
kernel-mode drivers and points these VMAs at a zero page, so that user
mode can no longer access any hardware address during the reset, thereby
stopping the traffic.
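Conceptually, for a single VMA (a sketch of the mechanism; the real loop
over all mms and VMAs is in uverbs_user_mmap_disassociate() below, and
the helper name here is illustrative):

  static void cut_off_one_vma(struct vm_area_struct *vma)
  {
          /* Tear down every PTE in the range; user space keeps the VMA,
           * but its next access faults... */
          zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
          /* ...and the uverbs fault handler then backs the page with a
           * zero page, so no doorbell write can reach the device. */
  }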
Fixes: 9f651379c548 ("RDMA/hns: Kernel notify usr space to stop ring db")
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/core/ib_core_uverbs.c   | 85 ++++++++++++++++++++++
 drivers/infiniband/core/rdma_core.h        |  1 -
 drivers/infiniband/core/uverbs_main.c      | 64 ----------------
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 14 ++--
 include/rdma/ib_verbs.h                    |  2 +
 5 files changed, 94 insertions(+), 72 deletions(-)
diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c
index b51bd7087..4e27389a7 100644
--- a/drivers/infiniband/core/ib_core_uverbs.c
+++ b/drivers/infiniband/core/ib_core_uverbs.c
@@ -5,6 +5,7 @@
  * Copyright 2019 Marvell. All rights reserved.
  */
 #include <linux/xarray.h>
+#include <linux/sched/mm.h>
 #include "uverbs.h"
 #include "core_priv.h"
@@ -365,3 +366,87 @@ int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
 					 U32_MAX);
 }
 EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
+
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
+{
+	struct rdma_umap_priv *priv, *next_priv;
+
+	lockdep_assert_held(&ufile->hw_destroy_rwsem);
+
+	while (1) {
+		struct mm_struct *mm = NULL;
+
+		/* Get an arbitrary mm pointer that hasn't been cleaned yet */
+		mutex_lock(&ufile->umap_lock);
+		while (!list_empty(&ufile->umaps)) {
+			int ret;
+
+			priv = list_first_entry(&ufile->umaps,
+						struct rdma_umap_priv, list);
+			mm = priv->vma->vm_mm;
+			ret = mmget_not_zero(mm);
+			if (!ret) {
+				list_del_init(&priv->list);
+				if (priv->entry) {
+					rdma_user_mmap_entry_put(priv->entry);
+					priv->entry = NULL;
+				}
+				mm = NULL;
+				continue;
+			}
+			break;
+		}
+		mutex_unlock(&ufile->umap_lock);
+		if (!mm)
+			return;
+
+		/*
+		 * The umap_lock is nested under mmap_lock since it used within
+		 * the vma_ops callbacks, so we have to clean the list one mm
+		 * at a time to get the lock ordering right. Typically there
+		 * will only be one mm, so no big deal.
+		 */
+		mmap_read_lock(mm);
+		mutex_lock(&ufile->umap_lock);
+		list_for_each_entry_safe(priv, next_priv, &ufile->umaps, list) {
+			struct vm_area_struct *vma = priv->vma;
+
+			if (vma->vm_mm != mm)
+				continue;
+			list_del_init(&priv->list);
+
+			zap_vma_ptes(vma, vma->vm_start,
+				     vma->vm_end - vma->vm_start);
+
+			if (priv->entry) {
+				rdma_user_mmap_entry_put(priv->entry);
+				priv->entry = NULL;
+			}
+		}
+		mutex_unlock(&ufile->umap_lock);
+		mmap_read_unlock(mm);
+		mmput(mm);
+	}
+}
+EXPORT_SYMBOL(uverbs_user_mmap_disassociate);
+
+/**
+ * rdma_user_mmap_disassociate() - disassociate the mmap from the ucontext.
+ *
+ * @ucontext: associated user context.
+ *
+ * This function should be called by drivers that need to disable mmap for
+ * some ucontexts.
+ */
+void rdma_user_mmap_disassociate(struct ib_ucontext *ucontext)
+{
+	struct ib_uverbs_file *ufile = ucontext->ufile;
+
+	/* Racing with uverbs_destroy_ufile_hw */
+	if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+		return;
+
+	uverbs_user_mmap_disassociate(ufile);
+	up_read(&ufile->hw_destroy_rwsem);
+}
+EXPORT_SYMBOL(rdma_user_mmap_disassociate);
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 33706dad6..ad01fbd52 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -149,7 +149,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi);
 void uverbs_destroy_api(struct uverbs_api *uapi);
 void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
 			      unsigned int num_attrs);
-void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
 
 extern const struct uapi_definition uverbs_def_obj_async_fd[];
 extern const struct uapi_definition uverbs_def_obj_counters[];
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 495d5a5d0..f1db48c2c 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -45,7 +45,6 @@
 #include <linux/cdev.h>
 #include <linux/anon_inodes.h>
 #include <linux/slab.h>
-#include <linux/sched/mm.h>
 
 #include <linux/uaccess.h>
@@ -817,69 +816,6 @@ static const struct vm_operations_struct rdma_umap_ops = {
 	.fault = rdma_umap_fault,
 };
 
-void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
-{
-	struct rdma_umap_priv *priv, *next_priv;
-
-	lockdep_assert_held(&ufile->hw_destroy_rwsem);
-
-	while (1) {
-		struct mm_struct *mm = NULL;
-
-		/* Get an arbitrary mm pointer that hasn't been cleaned yet */
-		mutex_lock(&ufile->umap_lock);
-		while (!list_empty(&ufile->umaps)) {
-			int ret;
-
-			priv = list_first_entry(&ufile->umaps,
-						struct rdma_umap_priv, list);
-			mm = priv->vma->vm_mm;
-			ret = mmget_not_zero(mm);
-			if (!ret) {
-				list_del_init(&priv->list);
-				if (priv->entry) {
-					rdma_user_mmap_entry_put(priv->entry);
-					priv->entry = NULL;
-				}
-				mm = NULL;
-				continue;
-			}
-			break;
-		}
-		mutex_unlock(&ufile->umap_lock);
-		if (!mm)
-			return;
-
-		/*
-		 * The umap_lock is nested under mmap_lock since it used within
-		 * the vma_ops callbacks, so we have to clean the list one mm
-		 * at a time to get the lock ordering right. Typically there
-		 * will only be one mm, so no big deal.
-		 */
-		mmap_read_lock(mm);
-		mutex_lock(&ufile->umap_lock);
-		list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
-					  list) {
-			struct vm_area_struct *vma = priv->vma;
-
-			if (vma->vm_mm != mm)
-				continue;
-			list_del_init(&priv->list);
-
-			zap_vma_ptes(vma, vma->vm_start,
-				     vma->vm_end - vma->vm_start);
-
-			if (priv->entry) {
-				rdma_user_mmap_entry_put(priv->entry);
-				priv->entry = NULL;
-			}
-		}
-		mutex_unlock(&ufile->umap_lock);
-		mmap_read_unlock(mm);
-		mmput(mm);
-	}
-}
-
 /*
  * ib_uverbs_open() does not need the BKL:
  *
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 9815853f5..9774ddd4e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -41,6 +41,7 @@
 #include <rdma/ib_cache.h>
 #include <rdma/ib_umem.h>
 #include <rdma/uverbs_ioctl.h>
+#include <rdma/ib_verbs.h>
#include "hnae3.h" #include "hclge_main.h" @@ -7438,14 +7439,13 @@ int hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
 
 static void hns_roce_v2_reset_notify_user(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_v2_reset_state *state;
-
-	state = (struct hns_roce_v2_reset_state *)hr_dev->reset_kaddr;
+	struct hns_roce_ucontext *uctx, *tmp;
 
-	state->reset_state = HNS_ROCE_IS_RESETTING;
-	state->hw_ready = 0;
-	/* Ensure reset state was flushed in memory */
-	wmb();
+	mutex_lock(&hr_dev->uctx_list_mutex);
+	list_for_each_entry_safe(uctx, tmp, &hr_dev->uctx_list, list) {
+		rdma_user_mmap_disassociate(&uctx->ibucontext);
+	}
+	mutex_unlock(&hr_dev->uctx_list_mutex);
 }
 
 static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1930dfbf1..7e602a7e6 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2949,6 +2949,7 @@ int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
 				      struct rdma_user_mmap_entry *entry,
 				      size_t length, u32 min_pgoff,
 				      u32 max_pgoff);
+void rdma_user_mmap_disassociate(struct ib_ucontext *ucontext);
 
 static inline int
 rdma_user_mmap_entry_insert_exact(struct ib_ucontext *ucontext,
@@ -4728,6 +4729,7 @@ void rdma_roce_rescan_device(struct ib_device *ibdev);
 struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile);
 
 int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs);
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
 
 struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
 				     enum rdma_netdev_t type, const char *name,
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP
----------------------------------------------------------------------
Add the max_ah and CQ moderation capacities to hns_roce_query_device().
Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver")
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h | 3 +++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 2 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 2 +-
 drivers/infiniband/hw/hns/hns_roce_main.c   | 6 ++++++
 4 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 851e397dc..8c840849a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -103,6 +103,9 @@
 #define CQ_BANKID_SHIFT 2
 #define CQ_BANKID_MASK GENMASK(1, 0)
 
+#define HNS_ROCE_MAX_CQ_COUNT 0xFFFF
+#define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF
+
 enum {
 	SERV_TYPE_RC,
 	SERV_TYPE_UC,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 9774ddd4e..a3494257b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -6167,7 +6167,7 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 			dev_info(hr_dev->dev,
 				 "cq_period(%u) reached the upper limit, adjusted to 65.\n",
 				 cq_period);
-			cq_period = HNS_ROCE_MAX_CQ_PERIOD;
+			cq_period = HNS_ROCE_MAX_CQ_PERIOD_HIP08;
 		}
 		cq_period *= HNS_ROCE_CLOCK_ADJUST;
 	}
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 5adb3c1cf..0637361d8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -1351,7 +1351,7 @@ struct fmea_ram_ecc {
 
 /* only for RNR timeout issue of HIP08 */
 #define HNS_ROCE_CLOCK_ADJUST 1000
-#define HNS_ROCE_MAX_CQ_PERIOD 65
+#define HNS_ROCE_MAX_CQ_PERIOD_HIP08 65
 #define HNS_ROCE_MAX_EQ_PERIOD 65
 #define HNS_ROCE_RNR_TIMER_10NS 1
 #define HNS_ROCE_1US_CFG 999
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 340ebb2e4..d9f43076e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -250,6 +250,12 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
 				    IB_ATOMIC_HCA : IB_ATOMIC_NONE;
 	props->max_pkeys = 1;
 	props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay;
+	props->max_ah = INT_MAX;
+	props->cq_caps.max_cq_moderation_period = HNS_ROCE_MAX_CQ_PERIOD;
+	props->cq_caps.max_cq_moderation_count = HNS_ROCE_MAX_CQ_COUNT;
+	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08)
+		props->cq_caps.max_cq_moderation_period = HNS_ROCE_MAX_CQ_PERIOD_HIP08;
+
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
 		props->max_srq = hr_dev->caps.num_srqs;
 		props->max_srq_wr = hr_dev->caps.max_srq_wrs;
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP
----------------------------------------------------------------------
During reset, issuing cmdq commands to destroy resources such as QPs,
CQs and MRs may fail, and the destruction failure information is
printed. When a large number of resources fail to be destroyed, printk()
makes multiple threads compete for the global log buffer spinlock; as a
result, CPU performance deteriorates and the CPU gets stuck. Fix it by
using the *_ratelimited() printing helpers (e.g. dev_err_ratelimited())
to limit the print frequency.
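Roughly what the _ratelimited() helpers expand to (a sketch; the exact
macros live in the dev_printk/ratelimit headers): a static per-callsite
ratelimit state gates an ordinary print, so a flood of failures costs
only one lock acquisition per allowed message.

  #include <linux/device.h>
  #include <linux/ratelimit.h>

  static void report_destroy_failure(struct device *dev, int ret,
                                     unsigned long cqn)
  {
          /* Allows a burst of messages, then drops and counts the rest
           * until the interval expires. */
          static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
                                        DEFAULT_RATELIMIT_BURST);

          if (__ratelimit(&rs))
                  dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n",
                          ret, cqn);
  }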
Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver")
Fixes: c7bcb13442e1 ("RDMA/hns: Add SRQ support for hip08 kernel mode")
Fixes: 626903e9355b ("RDMA/hns: Add support for reporting wc as software mode")
Fixes: 6eef524201de ("RDMA/hns: Replace not intuitive function/macro names")
Fixes: 6f5f556d3795 ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT")
Fixes: 357f34294686 ("RDMA/hns: Simplify the state judgment code of qp")
Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC")
Fixes: 70d469ff011b ("RDMA/hns: Add method to detach WQE buffer")
Fixes: 9c9edf689a60 ("RDMA/hns: Remove redundant DFX file and DFX ops structure")
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_cq.c    |  5 ++-
 drivers/infiniband/hw/hns/hns_roce_hem.c   |  5 ++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 50 +++++++++++++---------
 drivers/infiniband/hw/hns/hns_roce_mr.c    |  2 +-
 drivers/infiniband/hw/hns/hns_roce_srq.c   |  5 ++-
 5 files changed, 39 insertions(+), 28 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 7250d0643..d1d93fc66 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -179,8 +179,9 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 	ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC,
 				      hr_cq->cqn);
 	if (ret)
-		dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret,
-			hr_cq->cqn);
+		dev_err_ratelimited(dev,
+				    "DESTROY_CQ failed (%d) for CQN %06lx\n",
+				    ret, hr_cq->cqn);
 
 	xa_erase(&cq_table->array, hr_cq->cqn);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index a4b3f1916..9ac68efc3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -672,8 +672,9 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev,
 
 	ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT);
 	if (ret)
-		dev_warn(dev, "failed to clear HEM base address, ret = %d.\n",
-			 ret);
+		dev_warn_ratelimited(dev,
+				     "failed to clear HEM base address, ret = %d.\n",
+				     ret);
 
 	hns_roce_free_hem(hr_dev, table->hem[i]);
 	table->hem[i] = NULL;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index a3494257b..e854e559f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -443,19 +443,21 @@ static int check_send_valid(struct hns_roce_dev *hr_dev,
 	if (unlikely(hr_qp->state == IB_QPS_RESET ||
 		     hr_qp->state == IB_QPS_INIT ||
 		     hr_qp->state == IB_QPS_RTR)) {
-		ibdev_err(ibdev, "failed to post WQE, QP state %u!\n",
-			  hr_qp->state);
+		ibdev_err_ratelimited(ibdev,
+				      "failed to post WQE, QP state %u!\n",
+				      hr_qp->state);
 		return -EINVAL;
 	} else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) {
-		ibdev_err(ibdev, "failed to post WQE, dev state %d!\n",
-			  hr_dev->state);
+		ibdev_err_ratelimited(ibdev,
+				      "failed to post WQE, dev state %d!\n",
+				      hr_dev->state);
 		return -EIO;
 	}
 
 	if (check_dca_attach_enable(hr_qp)) {
 		ret = dca_attach_qp_buf(hr_dev, hr_qp);
 		if (unlikely(ret)) {
-			ibdev_err(ibdev,
+			ibdev_err_ratelimited(ibdev,
 				  "failed to attach DCA for QP-%lu send!\n",
 				  hr_qp->qpn);
 			return ret;
@@ -3613,8 +3615,9 @@ static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp)
 
 	ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr);
 	if (ret) {
-		ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n",
-			  ret);
+		ibdev_err_ratelimited(ibdev,
+				      "failed to post wqe for free mr, ret = %d.\n",
+				      ret);
 		return ret;
 	}
 
@@ -3653,7 +3656,7 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)
 
 		ret = free_mr_post_send_lp_wqe(hr_qp);
 		if (ret) {
-			ibdev_err(ibdev,
+			ibdev_err_ratelimited(ibdev,
 				  "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n",
 				  hr_qp->qpn, ret);
 			break;
@@ -3666,14 +3669,14 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)
 	while (cqe_cnt) {
 		npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc);
 		if (npolled < 0) {
-			ibdev_err(ibdev,
+			ibdev_err_ratelimited(ibdev,
 				  "failed to poll cqe for free mr, remain %d cqe.\n",
 				  cqe_cnt);
 			goto out;
 		}
 
 		if (time_after(jiffies, end)) {
-			ibdev_err(ibdev,
+			ibdev_err_ratelimited(ibdev,
 				  "failed to poll cqe for free mr and timeout, remain %d cqe.\n",
 				  cqe_cnt);
 			goto out;
@@ -5261,7 +5264,8 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
 	int ret = 0;
 
 	if (!check_qp_state(cur_state, new_state)) {
-		ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n");
+		ibdev_err_ratelimited(&hr_dev->ib_dev,
+				      "Illegal state for QP!\n");
 		return -EINVAL;
 	}
 
@@ -5525,7 +5529,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	/* SW pass context to HW */
 	ret = hns_roce_v2_qp_modify(hr_dev, context, qpc_mask, hr_qp);
 	if (ret) {
-		ibdev_err(ibdev, "failed to modify QP, ret = %d.\n", ret);
+		ibdev_err_ratelimited(ibdev, "failed to modify QP, ret = %d.\n", ret);
 		goto out;
 	}
 
@@ -5710,7 +5714,9 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 
 	ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context);
 	if (ret) {
-		ibdev_err(ibdev, "failed to query QPC, ret = %d.\n", ret);
+		ibdev_err_ratelimited(ibdev,
+				      "failed to query QPC, ret = %d.\n",
+				      ret);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -5718,7 +5724,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 	state = hr_reg_read(&context, QPC_QP_ST);
 	tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state);
 	if (tmp_qp_state == -1) {
-		ibdev_err(ibdev, "Illegal ib_qp_state\n");
+		ibdev_err_ratelimited(ibdev, "Illegal ib_qp_state\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -5800,7 +5806,9 @@ static bool hns_roce_v2_chk_dca_buf_inactive(struct hns_roce_dev *hr_dev,
 
 	ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context);
 	if (ret) {
-		ibdev_err(ibdev, "failed to query DCA QPC, ret = %d.\n", ret);
+		ibdev_err_ratelimited(ibdev,
+				      "failed to query DCA QPC, ret = %d.\n",
+				      ret);
 		return false;
 	}
 
@@ -5844,7 +5852,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 		ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
 					    hr_qp->state, IB_QPS_RESET, udata);
 		if (ret)
-			ibdev_err(ibdev,
+			ibdev_err_ratelimited(ibdev,
 				  "failed to modify QP to RST, ret = %d.\n",
 				  ret);
 	}
@@ -5882,7 +5890,7 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
 	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
 	if (ret)
-		ibdev_err(&hr_dev->ib_dev,
+		ibdev_err_ratelimited(&hr_dev->ib_dev,
 			  "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n",
 			  hr_qp->qpn, ret);
 
@@ -6178,7 +6186,7 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 				HNS_ROCE_CMD_MODIFY_CQC, hr_cq->cqn);
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 	if (ret)
-		ibdev_err(&hr_dev->ib_dev,
+		ibdev_err_ratelimited(&hr_dev->ib_dev,
 			  "failed to process cmd when modifying CQ, ret = %d.\n",
 			  ret);
 
@@ -6204,9 +6212,9 @@ static int hns_roce_v2_query_cqc(struct hns_roce_dev *hr_dev, u32 cqn,
 	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma,
 				HNS_ROCE_CMD_QUERY_CQC, cqn);
 	if (ret) {
-		ibdev_err(&hr_dev->ib_dev,
-			  "failed to process cmd when querying CQ, ret = %d.\n",
-			  ret);
+		ibdev_err_ratelimited(&hr_dev->ib_dev,
+				      "failed to process cmd when querying CQ, ret = %d.\n",
+				      ret);
 		goto err_mailbox;
 	}
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 15382fb89..00b81e09c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -138,7 +138,7 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr
 					    key_to_hw_index(mr->key) &
 					    (hr_dev->caps.num_mtpts - 1));
 		if (ret)
-			ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n",
+			ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n",
 				   ret);
 	}
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 31f100211..293c76632 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -150,8 +150,9 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
 	ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ,
 				      srq->srqn);
 	if (ret)
-		dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
-			ret, srq->srqn);
+		dev_err_ratelimited(hr_dev->dev,
+				    "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
+				    ret, srq->srqn);
 
 	xa_erase(&srq_table->xa, srq->srqn);
From: wenglianfa <wenglianfa@huawei.com>
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP
----------------------------------------------------------------------
In the current solution, a pseudo WC lets user space detect a device
error in advance and release context resources. As a result, there is a
high probability that a hardware reset and context resource release
happen at the same time. During the hardware reset, the MBOX cannot
instruct the hardware to stop accessing the memory, yet the
corresponding resources are released during the reset. Since the
hardware is unaware that the driver has freed the resources, its
remaining tasks access invalid memory and a RAS alarm is reported.
If the driver detects this scenario, it does not release the resources.
Instead, it records them in a linked list and frees them only when the
RoCE driver is unloaded. This way the hardware never accesses invalid
memory and the driver does not leak it.
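The pattern, condensed from hns_roce_add_unfree_mtr() and
hns_roce_free_unfree_mtr() in the diff below (a sketch with illustrative
helper names, not the full implementation): a destroy command that fails
with -EBUSY marks the object, its buffer descriptor is parked on a
per-device list under a spinlock, and the list is drained once at driver
unload, when the hardware is guaranteed to be quiesced.

  /* Park a buffer whose destroy cmd returned -EBUSY during reset. */
  static void park_unfree_mtr(struct hns_roce_dev *hr_dev,
                              struct hns_roce_mtr_node *node)
  {
          spin_lock(&hr_dev->mtr_unfree_list_lock);
          list_add_tail(&node->list, &hr_dev->mtr_unfree_list);
          spin_unlock(&hr_dev->mtr_unfree_list_lock);
  }

  /* Drain the list once at driver unload and free for real. */
  static void drain_unfree_mtrs(struct hns_roce_dev *hr_dev)
  {
          struct hns_roce_mtr_node *pos, *next;

          spin_lock(&hr_dev->mtr_unfree_list_lock);
          list_for_each_entry_safe(pos, next, &hr_dev->mtr_unfree_list, list) {
                  list_del(&pos->list);
                  hns_roce_mtr_destroy(hr_dev, &pos->mtr);
                  kvfree(pos);
          }
          spin_unlock(&hr_dev->mtr_unfree_list_lock);
  }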
Fixes: b0969f83890b ("RDMA/hns: Do not destroy QP resources in the hw resetting phase")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_cq.c     | 28 +++++-
 drivers/infiniband/hw/hns/hns_roce_db.c     | 31 +++++--
 drivers/infiniband/hw/hns/hns_roce_device.h | 35 +++++++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 14 +--
 drivers/infiniband/hw/hns/hns_roce_main.c   |  8 ++
 drivers/infiniband/hw/hns/hns_roce_mr.c     | 94 ++++++++++++++++++++-
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 30 ++++++-
 drivers/infiniband/hw/hns/hns_roce_srq.c    | 42 +++++++--
 8 files changed, 249 insertions(+), 33 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index d1d93fc66..47f42c3c7 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -182,6 +182,8 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 		dev_err_ratelimited(dev,
 				    "DESTROY_CQ failed (%d) for CQN %06lx\n",
 				    ret, hr_cq->cqn);
+	if (ret == -EBUSY)
+		hr_cq->delayed_destroy_flag = true;
 
 	xa_erase(&cq_table->array, hr_cq->cqn);
 
@@ -193,7 +195,11 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 	complete(&hr_cq->free);
 	wait_for_completion(&hr_cq->free);
 
-	hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
+	/* this resource will be freed when the driver is uninstalled, so
+	 * no memory leak will occur.
+	 */
+	if (!hr_cq->delayed_destroy_flag)
+		hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
 }
 
 static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
@@ -203,6 +209,10 @@ static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
 	struct hns_roce_buf_attr buf_attr = {};
 	int ret;
 
+	hr_cq->mtr_node = kvmalloc(sizeof(*hr_cq->mtr_node), GFP_KERNEL);
+	if (!hr_cq->mtr_node)
+		return -ENOMEM;
+
 	buf_attr.page_shift = hr_dev->caps.cqe_buf_pg_sz + PAGE_SHIFT;
 	buf_attr.region[0].size = hr_cq->cq_depth * hr_cq->cqe_size;
 	buf_attr.region[0].hopnum = hr_dev->caps.cqe_hop_num;
@@ -211,15 +221,24 @@ static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
 	ret = hns_roce_mtr_create(hr_dev, &hr_cq->mtr, &buf_attr,
 				  hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT,
 				  udata, addr);
-	if (ret)
+	if (ret) {
 		ibdev_err(ibdev, "failed to alloc CQ mtr, ret = %d.\n", ret);
+		kvfree(hr_cq->mtr_node);
+		hr_cq->mtr_node = NULL;
+	}
 
 	return ret;
 }
 
 static void free_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 {
-	hns_roce_mtr_destroy(hr_dev, &hr_cq->mtr);
+	if (hr_cq->delayed_destroy_flag) {
+		hns_roce_add_unfree_mtr(hr_cq->mtr_node, hr_dev, &hr_cq->mtr);
+	} else {
+		hns_roce_mtr_destroy(hr_dev, &hr_cq->mtr);
+		kvfree(hr_cq->mtr_node);
+		hr_cq->mtr_node = NULL;
+	}
 }
 
 static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
@@ -270,7 +289,8 @@ static void free_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
 		uctx = rdma_udata_to_drv_context(udata,
 						 struct hns_roce_ucontext,
 						 ibucontext);
-		hns_roce_db_unmap_user(uctx, &hr_cq->db);
+		hns_roce_db_unmap_user(uctx, &hr_cq->db,
+				       hr_cq->delayed_destroy_flag);
 	} else {
 		hns_roce_free_db(hr_dev, &hr_cq->db);
 	}
diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c
index 5c4c04808..5adc2f1fa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_db.c
+++ b/drivers/infiniband/hw/hns/hns_roce_db.c
@@ -24,7 +24,7 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
 	page = kmalloc(sizeof(*page), GFP_KERNEL);
 	if (!page) {
 		ret = -ENOMEM;
-		goto out;
+		goto err_out;
 	}
 
 	refcount_set(&page->refcount, 1);
@@ -33,8 +33,12 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
 			       PAGE_SIZE, 0);
 	if (IS_ERR(page->umem)) {
 		ret = PTR_ERR(page->umem);
-		kfree(page);
-		goto out;
+		goto err_page;
+	}
+	page->umem_node = kvmalloc(sizeof(*page->umem_node), GFP_KERNEL);
+	if (!page->umem_node) {
+		ret = -ENOMEM;
+		goto err_umem;
 	}
 
 	list_add(&page->list, &context->page_list);
@@ -46,21 +50,36 @@ found:
 	db->u.user_page = page;
 	refcount_inc(&page->refcount);
 
-out:
+	mutex_unlock(&context->page_mutex);
+	return 0;
+
+err_umem:
+	ib_umem_release(page->umem);
+err_page:
+	kfree(page);
+err_out:
 	mutex_unlock(&context->page_mutex);
 
 	return ret;
 }
 
 void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
-			    struct hns_roce_db *db)
+			    struct hns_roce_db *db,
+			    bool delayed_unmap_flag)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(context->ibucontext.device);
+
 	mutex_lock(&context->page_mutex);
 
 	refcount_dec(&db->u.user_page->refcount);
 	if (refcount_dec_if_one(&db->u.user_page->refcount)) {
 		list_del(&db->u.user_page->list);
-		ib_umem_release(db->u.user_page->umem);
+		if (delayed_unmap_flag) {
+			hns_roce_add_unfree_umem(db->u.user_page, hr_dev);
+		} else {
+			ib_umem_release(db->u.user_page->umem);
+			kvfree(db->u.user_page->umem_node);
+		}
 		kfree(db->u.user_page);
 	}
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 8c840849a..f62cc5cce 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -381,6 +381,11 @@ struct hns_roce_mw {
 	u32 pbl_buf_pg_sz;
 };
 
+struct hns_roce_mtr_node {
+	struct hns_roce_mtr mtr;
+	struct list_head list;
+};
+
 struct hns_roce_mr {
 	struct ib_mr ibmr;
 	u64 iova; /* MR's virtual original addr */
@@ -394,6 +399,8 @@ struct hns_roce_mr {
 	struct hns_roce_mtr pbl_mtr;
 	u32 npages;
 	dma_addr_t *page_list;
+	bool delayed_destroy_flag;
+	struct hns_roce_mtr_node *mtr_node;
 };
 
 struct hns_roce_mr_table {
@@ -460,11 +467,17 @@ struct hns_roce_db_pgdir {
 	dma_addr_t db_dma;
 };
 
+struct hns_roce_umem_node {
+	struct ib_umem *umem;
+	struct list_head list;
+};
+
 struct hns_roce_user_db_page {
 	struct list_head list;
 	struct ib_umem *umem;
 	unsigned long user_virt;
 	refcount_t refcount;
+	struct hns_roce_umem_node *umem_node;
 };
 
 struct hns_roce_db {
@@ -499,6 +512,8 @@ struct hns_roce_cq {
 	struct list_head rq_list; /* all qps on this recv cq */
 	int is_armed; /* cq is armed */
 	struct list_head node; /* all armed cqs are on a list */
+	bool delayed_destroy_flag;
+	struct hns_roce_mtr_node *mtr_node;
 };
 
 struct hns_roce_idx_que {
@@ -507,6 +522,7 @@ struct hns_roce_idx_que {
 	unsigned long *bitmap;
 	u32 head;
 	u32 tail;
+	struct hns_roce_mtr_node *mtr_node;
 };
 
 struct hns_roce_srq {
@@ -532,6 +548,8 @@ struct hns_roce_srq {
 	void (*event)(struct hns_roce_srq *srq, enum hns_roce_event event);
 	struct hns_roce_db rdb;
 	u32 cap_flags;
+	bool delayed_destroy_flag;
+	struct hns_roce_mtr_node *mtr_node;
 };
 
 struct hns_roce_uar_table {
@@ -714,6 +732,8 @@ struct hns_roce_qp {
 	u8 tc_mode;
 	u8 priority;
 	enum hns_roce_cong_type cong_type;
+	bool delayed_destroy_flag;
+	struct hns_roce_mtr_node *mtr_node;
 };
 
 struct hns_roce_ib_iboe {
@@ -1128,6 +1148,11 @@ struct hns_roce_dev {
 	atomic64_t *dfx_cnt;
 	struct hns_roce_scc_param *scc_param;
 	struct notifier_block bond_nb;
+
+	struct list_head mtr_unfree_list; /* list of unfree mtr on this dev */
+	spinlock_t mtr_unfree_list_lock; /* protect mtr_unfree_list */
+	struct list_head umem_unfree_list; /* list of unfree umem on this dev */
+	spinlock_t umem_unfree_list_lock; /* protect umem_unfree_list */
 };
 
 static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
@@ -1402,7 +1427,8 @@ int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
 			 struct hns_roce_db *db);
 void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
-			    struct hns_roce_db *db);
+			    struct hns_roce_db *db,
+			    bool delayed_unmap_flag);
 int hns_roce_alloc_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db,
 		      int order);
 void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db);
@@ -1429,4 +1455,11 @@ hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address,
 				enum hns_roce_mmap_type mmap_type);
 void hns_roce_register_sysfs(struct hns_roce_dev *hr_dev);
 void hns_roce_unregister_sysfs(struct hns_roce_dev *hr_dev);
+void hns_roce_add_unfree_umem(struct hns_roce_user_db_page *user_page,
+			      struct hns_roce_dev *hr_dev);
+void hns_roce_free_unfree_umem(struct hns_roce_dev *hr_dev);
+void hns_roce_add_unfree_mtr(struct hns_roce_mtr_node *pos,
+			     struct hns_roce_dev *hr_dev,
+			     struct hns_roce_mtr *mtr);
+void hns_roce_free_unfree_mtr(struct hns_roce_dev *hr_dev);
 #endif /* _HNS_ROCE_DEVICE_H */
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index e854e559f..23b3c94c8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -33,7 +33,6 @@
 #include <linux/acpi.h>
 #include <linux/etherdevice.h>
 #include <linux/interrupt.h>
-#include <linux/iopoll.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <net/addrconf.h>
@@ -1132,14 +1131,9 @@ static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
 					unsigned long instance_stage,
 					unsigned long reset_stage)
 {
-#define HW_RESET_TIMEOUT_US 1000000
-#define HW_RESET_DELAY_US 1
-
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hnae3_handle *handle = priv->handle;
 	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
-	unsigned long val;
-	int ret;
 
 	/* When hardware reset is detected, we should stop sending mailbox&cmq&
 	 * doorbell to hardware. If now in .init_instance() function, we should
@@ -1152,10 +1146,7 @@ static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
 	 */
 	hr_dev->dis_db = true;
 
-	ret = read_poll_timeout_atomic(ops->ae_dev_reset_cnt, val,
-				       val > hr_dev->reset_cnt, HW_RESET_DELAY_US,
-				       HW_RESET_TIMEOUT_US, false, handle);
-	if (!ret)
+	if (!ops->get_hw_reset_stat(handle))
 		hr_dev->is_reset = true;
 
 	if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT ||
@@ -5894,6 +5885,9 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 			"failed to destroy QP, QPN = 0x%06lx, ret = %d.\n",
 			hr_qp->qpn, ret);
 
+	if (ret == -EBUSY)
+		hr_qp->delayed_destroy_flag = true;
+
 	hns_roce_qp_destroy(hr_dev, hr_qp, udata);
 
 	return 0;
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index d9f43076e..93e7f56b8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -1255,6 +1255,12 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
 	INIT_LIST_HEAD(&hr_dev->uctx_list);
 	mutex_init(&hr_dev->uctx_list_mutex);
 
+	INIT_LIST_HEAD(&hr_dev->mtr_unfree_list);
+	spin_lock_init(&hr_dev->mtr_unfree_list_lock);
+
+	INIT_LIST_HEAD(&hr_dev->umem_unfree_list);
+	spin_lock_init(&hr_dev->umem_unfree_list_lock);
+
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
 	    hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) {
 		INIT_LIST_HEAD(&hr_dev->pgdir_list);
@@ -1467,6 +1473,8 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup)
 	if (hr_dev->hw->hw_exit)
 		hr_dev->hw->hw_exit(hr_dev);
 	hns_roce_teardown_hca(hr_dev);
+	hns_roce_free_unfree_umem(hr_dev);
+	hns_roce_free_unfree_mtr(hr_dev);
 	hns_roce_cleanup_hem(hr_dev);
 
 	if (hr_dev->cmd_mod)
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 00b81e09c..53ae83548 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -83,7 +83,11 @@ static void free_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
 {
 	unsigned long obj = key_to_hw_index(mr->key);
 
-	hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, obj);
+	/* this resource will be freed when the driver is uninstalled, so
+	 * no memory leak will occur.
+	 */
+	if (!mr->delayed_destroy_flag)
+		hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, obj);
 	ida_free(&hr_dev->mr_table.mtpt_ida.ida, (int)obj);
 }
 
@@ -95,6 +99,10 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
 	struct hns_roce_buf_attr buf_attr = {};
 	int err;
 
+	mr->mtr_node = kvmalloc(sizeof(*mr->mtr_node), GFP_KERNEL);
+	if (!mr->mtr_node)
+		return -ENOMEM;
+
 	mr->pbl_hop_num = is_fast ? 1 : hr_dev->caps.pbl_hop_num;
 	buf_attr.page_shift = is_fast ? PAGE_SHIFT :
 			      hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT;
@@ -114,6 +122,8 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
 			  udata, start);
 	if (err) {
 		ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err);
+		kvfree(mr->mtr_node);
+		mr->mtr_node = NULL;
 		return err;
 	}
 
@@ -125,7 +135,13 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
 
 static void free_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
 {
-	hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr);
+	if (mr->delayed_destroy_flag && mr->type != MR_TYPE_DMA) {
+		hns_roce_add_unfree_mtr(mr->mtr_node, hr_dev, &mr->pbl_mtr);
+	} else {
+		hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr);
+		kvfree(mr->mtr_node);
+		mr->mtr_node = NULL;
+	}
 }
 
 static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
@@ -140,6 +156,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr
 		if (ret)
 			ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n",
 					       ret);
+		if (ret == -EBUSY)
+			mr->delayed_destroy_flag = true;
 	}
 
 	free_mr_pbl(hr_dev, mr);
@@ -1203,3 +1221,75 @@ void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
 	/* free buffers */
 	mtr_free_bufs(hr_dev, mtr);
 }
+
+static void hns_roce_copy_mtr(struct hns_roce_mtr *new_mtr, struct hns_roce_mtr *old_mtr)
+{
+	struct list_head *new_head, *old_head;
+	int i, j;
+
+	memcpy(new_mtr, old_mtr, sizeof(*old_mtr));
+
+	for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++)
+		for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) {
+			new_head = &new_mtr->hem_list.mid_bt[i][j];
+			old_head = &old_mtr->hem_list.mid_bt[i][j];
+			list_replace(old_head, new_head);
+		}
+
+	new_head = &new_mtr->hem_list.root_bt;
+	old_head = &old_mtr->hem_list.root_bt;
+	list_replace(old_head, new_head);
+
+	new_head = &new_mtr->hem_list.btm_bt;
+	old_head = &old_mtr->hem_list.btm_bt;
+	list_replace(old_head, new_head);
+}
+
+void hns_roce_add_unfree_mtr(struct hns_roce_mtr_node *pos,
+			     struct hns_roce_dev *hr_dev,
+			     struct hns_roce_mtr *mtr)
+{
+	hns_roce_copy_mtr(&pos->mtr, mtr);
+
+	spin_lock(&hr_dev->mtr_unfree_list_lock);
+	list_add_tail(&pos->list, &hr_dev->mtr_unfree_list);
+	spin_unlock(&hr_dev->mtr_unfree_list_lock);
+}
+
+void hns_roce_free_unfree_mtr(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_mtr_node *pos, *next;
+
+	spin_lock(&hr_dev->mtr_unfree_list_lock);
+	list_for_each_entry_safe(pos, next, &hr_dev->mtr_unfree_list, list) {
+		list_del(&pos->list);
+		hns_roce_mtr_destroy(hr_dev, &pos->mtr);
+		kvfree(pos);
+	}
+	spin_unlock(&hr_dev->mtr_unfree_list_lock);
+}
+
+void hns_roce_add_unfree_umem(struct hns_roce_user_db_page *user_page,
+			      struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_umem_node *pos = user_page->umem_node;
+
+	pos->umem = user_page->umem;
+
+	spin_lock(&hr_dev->umem_unfree_list_lock);
+	list_add_tail(&pos->list, &hr_dev->umem_unfree_list);
+	spin_unlock(&hr_dev->umem_unfree_list_lock);
+}
+
+void hns_roce_free_unfree_umem(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_umem_node *pos, *next;
+
+	spin_lock(&hr_dev->umem_unfree_list_lock);
+	list_for_each_entry_safe(pos, next, &hr_dev->umem_unfree_list, list) {
+		list_del(&pos->list);
+		ib_umem_release(pos->umem);
+		kvfree(pos);
+	}
+	spin_unlock(&hr_dev->umem_unfree_list_lock);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 55d3c17ac..b44a8c503 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -387,6 +387,12 @@ static void free_qpc(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 
+	if (hr_qp->delayed_destroy_flag)
+		return;
+
+	/* this resource will be freed when the driver is uninstalled, so
+	 * no memory leak will occur.
+	 */
 	if (hr_dev->caps.trrl_entry_sz)
 		hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn);
 	hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
@@ -777,12 +783,18 @@ static int alloc_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	int ret;
 
+	hr_qp->mtr_node = kvmalloc(sizeof(*hr_qp->mtr_node), GFP_KERNEL);
+	if (!hr_qp->mtr_node)
+		return -ENOMEM;
+
 	if (dca_en) {
 		/* DCA must be enabled after the buffer attr is configured. */
 		ret = hns_roce_enable_dca(hr_dev, hr_qp, udata);
 		if (ret) {
 			ibdev_err(ibdev, "failed to enable DCA, ret = %d.\n",
 				  ret);
+			kvfree(hr_qp->mtr_node);
+			hr_qp->mtr_node = NULL;
 			return ret;
 		}
 
@@ -803,6 +815,8 @@ static int alloc_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 		ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret);
 		if (dca_en)
 			hns_roce_disable_dca(hr_dev, hr_qp, udata);
+		kvfree(hr_qp->mtr_node);
+		hr_qp->mtr_node = NULL;
 	}
 
 	return ret;
@@ -811,7 +825,13 @@ static int alloc_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 static void free_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 			 struct ib_udata *udata)
 {
-	hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr);
+	if (hr_qp->delayed_destroy_flag) {
+		hns_roce_add_unfree_mtr(hr_qp->mtr_node, hr_dev, &hr_qp->mtr);
+	} else {
+		hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr);
+		kvfree(hr_qp->mtr_node);
+		hr_qp->mtr_node = NULL;
+	}
 
 	if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DYNAMIC_CTX_ATTACH)
 		hns_roce_disable_dca(hr_dev, hr_qp, udata);
@@ -951,7 +971,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev,
 
 err_sdb:
 	if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
-		hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+		hns_roce_db_unmap_user(uctx, &hr_qp->sdb, false);
 err_out:
 	return ret;
 }
@@ -1033,9 +1053,11 @@ static void free_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 
 	if (udata) {
 		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
-			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
+			hns_roce_db_unmap_user(uctx, &hr_qp->rdb,
+					       hr_qp->delayed_destroy_flag);
 		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
-			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+			hns_roce_db_unmap_user(uctx, &hr_qp->sdb,
+					       hr_qp->delayed_destroy_flag);
 		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DIRECT_WQE)
 			qp_user_mmap_entry_remove(hr_qp);
 	} else {
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 293c76632..9ac397609 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -153,6 +153,8 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
 		dev_err_ratelimited(hr_dev->dev,
 				    "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
 				    ret, srq->srqn);
+	if (ret == -EBUSY)
+		srq->delayed_destroy_flag = true;
 
 	xa_erase(&srq_table->xa, srq->srqn);
 
@@ -160,7 +162,8 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
 	complete(&srq->free);
 	wait_for_completion(&srq->free);
 
-	hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
+	if (!srq->delayed_destroy_flag)
+		hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
 }
 
 static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
@@ -171,6 +174,10 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
 	struct hns_roce_buf_attr buf_attr = {};
 	int ret;
 
+	idx_que->mtr_node = kvmalloc(sizeof(*idx_que->mtr_node), GFP_KERNEL);
+	if (!idx_que->mtr_node)
+		return -ENOMEM;
+
 	srq->idx_que.entry_shift = ilog2(HNS_ROCE_IDX_QUE_ENTRY_SZ);
 
 	buf_attr.page_shift = hr_dev->caps.idx_buf_pg_sz + PAGE_SHIFT;
@@ -185,7 +192,7 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
 	if (ret) {
 		ibdev_err(ibdev,
 			  "failed to alloc SRQ idx mtr, ret = %d.\n", ret);
-		return ret;
+		goto err_kvmalloc;
 	}
 
 	if (!udata) {
@@ -203,6 +210,9 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
 	return 0;
 err_idx_mtr:
 	hns_roce_mtr_destroy(hr_dev, &idx_que->mtr);
+err_kvmalloc:
+	kvfree(idx_que->mtr_node);
+	idx_que->mtr_node = NULL;
 
 	return ret;
 }
@@ -213,7 +223,13 @@ static void free_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
 
 	bitmap_free(idx_que->bitmap);
 	idx_que->bitmap = NULL;
-	hns_roce_mtr_destroy(hr_dev, &idx_que->mtr);
+	if (srq->delayed_destroy_flag) {
+		hns_roce_add_unfree_mtr(idx_que->mtr_node, hr_dev, &idx_que->mtr);
+	} else {
+		hns_roce_mtr_destroy(hr_dev, &idx_que->mtr);
+		kvfree(idx_que->mtr_node);
+		idx_que->mtr_node = NULL;
+	}
 }
 
 static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
@@ -224,6 +240,10 @@ static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
 	struct hns_roce_buf_attr buf_attr = {};
 	int ret;
 
+	srq->mtr_node = kvmalloc(sizeof(*srq->mtr_node), GFP_KERNEL);
+	if (!srq->mtr_node)
+		return -ENOMEM;
+
 	srq->wqe_shift = ilog2(roundup_pow_of_two(max(HNS_ROCE_SGE_SIZE,
 						      HNS_ROCE_SGE_SIZE *
 						      srq->max_gs)));
@@ -237,9 +257,12 @@ static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
 	ret = hns_roce_mtr_create(hr_dev, &srq->buf_mtr, &buf_attr,
 				  hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT,
 				  udata, addr);
-	if (ret)
+	if (ret) {
 		ibdev_err(ibdev,
 			  "failed to alloc SRQ buf mtr, ret = %d.\n", ret);
+		kvfree(srq->mtr_node);
+		srq->mtr_node = NULL;
+	}
 
 	return ret;
 }
@@ -247,7 +270,13 @@ static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev,
 static void free_srq_wqe_buf(struct hns_roce_dev *hr_dev,
 			     struct hns_roce_srq *srq)
 {
-	hns_roce_mtr_destroy(hr_dev, &srq->buf_mtr);
+	if (srq->delayed_destroy_flag) {
+		hns_roce_add_unfree_mtr(srq->mtr_node, hr_dev, &srq->buf_mtr);
+	} else {
+		hns_roce_mtr_destroy(hr_dev, &srq->buf_mtr);
+		kvfree(srq->mtr_node);
+		srq->mtr_node = NULL;
+	}
 }
 
 static int alloc_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
@@ -416,7 +445,8 @@ static void free_srq_db(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
 		uctx = rdma_udata_to_drv_context(udata,
 						 struct hns_roce_ucontext,
 						 ibucontext);
-		hns_roce_db_unmap_user(uctx, &srq->rdb);
+		hns_roce_db_unmap_user(uctx, &srq->rdb,
+				       srq->delayed_destroy_flag);
 	} else {
 		hns_roce_free_db(hr_dev, &srq->rdb);
 	}
From: wenglianfa <wenglianfa@huawei.com>
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP
----------------------------------------------------------------------
The max value of 'unit' is 2^24 and the current max value of 'hopnum' is
2, so 'step' may exceed the value range of u32. To fix it, change 'step'
and the related variables to u64.
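A worked example of the overflow (illustration only, not driver code):
with unit = 2^24 and hopnum = 2, the BA range spans unit * unit entries,
which no longer fits in 32 bits:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t unit = 1U << 24;                /* max BA entries per BT page */
          uint32_t step32 = unit * unit;           /* hopnum == 2: wraps to 0 */
          uint64_t step64 = (uint64_t)unit * unit; /* 2^48, as intended */

          printf("u32 step: %u, u64 step: %llu\n",
                 step32, (unsigned long long)step64);
          return 0;
  }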
Fixes: 38389eaa4db1 ("RDMA/hns: Add mtr support for mixed multihop addressing")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_hem.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index 9ac68efc3..052d93097 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -1042,9 +1042,9 @@ static bool hem_list_is_bottom_bt(int hopnum, int bt_level)
  * @bt_level: base address table level
  * @unit: ba entries per bt page
  */
-static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit)
+static u64 hem_list_calc_ba_range(int hopnum, int bt_level, int unit)
 {
-	u32 step;
+	u64 step;
 	int max;
 	int i;
 
@@ -1080,7 +1080,7 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions,
 {
 	struct hns_roce_buf_region *r;
 	int total = 0;
-	int step;
+	u64 step;
 	int i;
 
 	for (i = 0; i < region_cnt; i++) {
@@ -1111,7 +1111,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev,
 	int ret = 0;
 	int max_ofs;
 	int level;
-	u32 step;
+	u64 step;
 	int end;
 
 	if (hopnum <= 1)
@@ -1148,7 +1148,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev,
 		}
 
 		start_aligned = (distance / step) * step + r->offset;
-		end = min_t(int, start_aligned + step - 1, max_ofs);
+		end = min_t(u64, start_aligned + step - 1, max_ofs);
 		cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit,
 					  true);
 		if (!cur) {
@@ -1237,7 +1237,7 @@ static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base,
 	struct hns_roce_hem_item *hem, *temp_hem;
 	int total = 0;
 	int offset;
-	int step;
+	u64 step;
 
 	step = hem_list_calc_ba_range(r->hopnum, 1, unit);
 	if (step < 1)
From: wenglianfa <wenglianfa@huawei.com>
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9FIHP

----------------------------------------------------------------------

DTS2024012409358
During reset, cmd events cannot be reported, so threads end up waiting a
long time for them. To fix it, when the reset starts, complete the
pending cmd contexts so that nothing waits for a cmd event that will
never arrive.
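For context (a sketch; the real waiter lives in hns_roce_cmd.c and may
differ in detail): event-mode mailbox commands block on a per-context
completion and then read back a result, which is why completing every
context with -EBUSY, as below, unblocks them immediately.

  /* Paraphrased event-mode waiter; 'done' and 'result' are the fields
   * the notify path below completes and fills in. */
  static int cmd_mbox_wait_sketch(struct hns_roce_cmd_context *context)
  {
          wait_for_completion(&context->done);
          return context->result;
  }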
Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 23b3c94c8..421f5b058 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -7450,6 +7450,20 @@ static void hns_roce_v2_reset_notify_user(struct hns_roce_dev *hr_dev)
 	mutex_unlock(&hr_dev->uctx_list_mutex);
 }
 
+static void hns_roce_v2_reset_notify_cmd(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_cmdq *hr_cmd = &hr_dev->cmd;
+	int i;
+
+	if (!hr_dev->cmd_mod)
+		return;
+
+	for (i = 0; i < hr_cmd->max_cmds; i++) {
+		hr_cmd->context[i].result = -EBUSY;
+		complete(&hr_cmd->context[i].done);
+	}
+}
+
 static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
 {
 	struct hns_roce_dev *hr_dev;
@@ -7473,6 +7487,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
 
 	hr_dev->state = HNS_ROCE_DEVICE_STATE_RST_DOWN;
 
+	/* Complete the CMDQ event in advance during the reset. */
+	hns_roce_v2_reset_notify_cmd(hr_dev);
+
 	return 0;
 }