From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9MANQ
---------------------------------------------------------------
The only supported length of atomic wr in kernel space is 8. The driver needs to check the length before post send.
Fixes: 00a59d30f3f9 ("RDMA/hns: Optimize wqe buffer filling process for post send") Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 80870c278..d778fefbe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -94,6 +94,8 @@ /* Configure to HW for PAGE_SIZE larger than 4KB */ #define PG_SHIFT_OFFSET (PAGE_SHIFT - 12)
+#define ATOMIC_WR_LEN 8 + #define HNS_ROCE_IDX_QUE_ENTRY_SZ 4 #define SRQ_DB_REG 0x230
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b44bd4c3c..bc8818dc0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -166,15 +166,22 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, hr_reg_clear(fseg, FRMR_BLK_MODE); }
-static void set_atomic_seg(const struct ib_send_wr *wr, - struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, - unsigned int valid_num_sge) +static int set_atomic_seg(struct hns_roce_dev *hr_dev, + const struct ib_send_wr *wr, + struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, + unsigned int valid_num_sge, u32 msg_len) { struct hns_roce_v2_wqe_data_seg *dseg = (void *)rc_sq_wqe + sizeof(struct hns_roce_v2_rc_send_wqe); struct hns_roce_wqe_atomic_seg *aseg = (void *)dseg + sizeof(struct hns_roce_v2_wqe_data_seg);
+ if (msg_len != ATOMIC_WR_LEN) { + ibdev_err(&hr_dev->ib_dev, "invalid atomic wr len, len = %u.\n", + msg_len); + return -EINVAL; + } + set_data_seg_v2(dseg, wr->sg_list);
if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { @@ -187,6 +194,8 @@ static void set_atomic_seg(const struct ib_send_wr *wr, }
hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); + + return 0; }
static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, @@ -681,7 +690,8 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp,
if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - set_atomic_seg(wr, rc_sq_wqe, valid_num_sge); + ret = set_atomic_seg(hr_dev, wr, rc_sq_wqe, valid_num_sge, + msg_len); else if (wr->opcode != IB_WR_REG_MR) ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe, &curr_idx, valid_num_sge);
From: wenglianfa wenglianfa@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9MANQ
----------------------------------------------------------------------
For the HIP08, the reserved loopback QP is used to release MRs before the MPT is destroyed. After free_mr_exit() and before hns_roce_unregister_device(), rsv_qp is released and set to NULL, and ib_device is not unregister. During this period, the user mode can use ib_device to execute dereg_mr(). As a result, rsv_qp is accessed again and a NULL pointer is reported.
To fix Use-After-Free of rsv_qp, execute free_mr_exit() after hns_roce_unregister_device().
Fixes: 6f5f556d3795 ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT") Signed-off-by: wenglianfa wenglianfa@huawei.com Signed-off-by: Juan Zhou zhoujuan51@h-partners.com --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index bc8818dc0..e6ecfe013 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3147,6 +3147,9 @@ err_clear_extdb_failed:
static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) { + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_exit(hr_dev); + hns_roce_function_clear(hr_dev);
if (!hr_dev->is_vf) @@ -7334,9 +7337,6 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev);
- if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) - free_mr_exit(hr_dev); - hns_roce_exit(hr_dev, bond_cleanup); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev);
From: wenglianfa wenglianfa@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9MANQ
----------------------------------------------------------------------
Sleep is not allowed in the spinlock critical section, but ib_umem_release() may sleep in the spinlock critical sectio. To fix it, use mutex_lock() instead of spin_lock().
Fixes: 04c5d76e4f15 ("RDMA/hns: Fix simultaneous reset and resource deregistration") Signed-off-by: wenglianfa wenglianfa@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 4 ++-- drivers/infiniband/hw/hns/hns_roce_main.c | 8 ++++++-- drivers/infiniband/hw/hns/hns_roce_mr.c | 16 ++++++++-------- 3 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index d778fefbe..958ed5bc5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1151,9 +1151,9 @@ struct hns_roce_dev { struct notifier_block bond_nb;
struct list_head mtr_unfree_list; /* list of unfree mtr on this dev */ - spinlock_t mtr_unfree_list_lock; /* protect mtr_unfree_list */ + struct mutex mtr_unfree_list_mutex; /* protect mtr_unfree_list */ struct list_head umem_unfree_list; /* list of unfree umem on this dev */ - spinlock_t umem_unfree_list_lock; /* protect umem_unfree_list */ + struct mutex umem_unfree_list_mutex; /* protect umem_unfree_list */ };
static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index ab43b6688..6257085cf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -1241,6 +1241,8 @@ static void hns_roce_teardown_hca(struct hns_roce_dev *hr_dev) hns_roce_cleanup_dca(hr_dev);
hns_roce_cleanup_bitmap(hr_dev); + mutex_destroy(&hr_dev->umem_unfree_list_mutex); + mutex_destroy(&hr_dev->mtr_unfree_list_mutex); mutex_destroy(&hr_dev->uctx_list_mutex); if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) @@ -1268,10 +1270,10 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) mutex_init(&hr_dev->uctx_list_mutex);
INIT_LIST_HEAD(&hr_dev->mtr_unfree_list); - spin_lock_init(&hr_dev->mtr_unfree_list_lock); + mutex_init(&hr_dev->mtr_unfree_list_mutex);
INIT_LIST_HEAD(&hr_dev->umem_unfree_list); - spin_lock_init(&hr_dev->umem_unfree_list_lock); + mutex_init(&hr_dev->umem_unfree_list_mutex);
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) { @@ -1315,6 +1317,8 @@ err_uar_table_free: if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) mutex_destroy(&hr_dev->pgdir_mutex); + mutex_destroy(&hr_dev->umem_unfree_list_mutex); + mutex_destroy(&hr_dev->mtr_unfree_list_mutex); mutex_destroy(&hr_dev->uctx_list_mutex);
return ret; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 59e7ed66d..0a6fde734 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1250,22 +1250,22 @@ void hns_roce_add_unfree_mtr(struct hns_roce_mtr_node *pos, { hns_roce_copy_mtr(&pos->mtr, mtr);
- spin_lock(&hr_dev->mtr_unfree_list_lock); + mutex_lock(&hr_dev->mtr_unfree_list_mutex); list_add_tail(&pos->list, &hr_dev->mtr_unfree_list); - spin_unlock(&hr_dev->mtr_unfree_list_lock); + mutex_unlock(&hr_dev->mtr_unfree_list_mutex); }
void hns_roce_free_unfree_mtr(struct hns_roce_dev *hr_dev) { struct hns_roce_mtr_node *pos, *next;
- spin_lock(&hr_dev->mtr_unfree_list_lock); + mutex_lock(&hr_dev->mtr_unfree_list_mutex); list_for_each_entry_safe(pos, next, &hr_dev->mtr_unfree_list, list) { list_del(&pos->list); hns_roce_mtr_destroy(hr_dev, &pos->mtr); kvfree(pos); } - spin_unlock(&hr_dev->mtr_unfree_list_lock); + mutex_unlock(&hr_dev->mtr_unfree_list_mutex); }
void hns_roce_add_unfree_umem(struct hns_roce_user_db_page *user_page, @@ -1275,20 +1275,20 @@ void hns_roce_add_unfree_umem(struct hns_roce_user_db_page *user_page,
pos->umem = user_page->umem;
- spin_lock(&hr_dev->umem_unfree_list_lock); + mutex_lock(&hr_dev->umem_unfree_list_mutex); list_add_tail(&pos->list, &hr_dev->umem_unfree_list); - spin_unlock(&hr_dev->umem_unfree_list_lock); + mutex_unlock(&hr_dev->umem_unfree_list_mutex); }
void hns_roce_free_unfree_umem(struct hns_roce_dev *hr_dev) { struct hns_roce_umem_node *pos, *next;
- spin_lock(&hr_dev->umem_unfree_list_lock); + mutex_lock(&hr_dev->umem_unfree_list_mutex); list_for_each_entry_safe(pos, next, &hr_dev->umem_unfree_list, list) { list_del(&pos->list); ib_umem_release(pos->umem); kvfree(pos); } - spin_unlock(&hr_dev->umem_unfree_list_lock); + mutex_unlock(&hr_dev->umem_unfree_list_mutex); }
From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9MANQ
----------------------------------------------------------------------
CEQEs are handled in interrupt handler currently. This may cause the CPU core staying in interrupt context too long and lead to soft lockup under heavy load.
Handle CEQEs in tasklet and set an upper limit for the number of CEQE handled by a single call of tasklet.
Fixes: a5073d6054f7 ("RDMA/hns: Add eq support of hip08") Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 80 ++++++++++++--------- 2 files changed, 48 insertions(+), 33 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 958ed5bc5..bdc62dcf4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -802,6 +802,7 @@ struct hns_roce_eq { int shift; int event_type; int sub_type; + struct tasklet_struct tasklet; };
struct hns_roce_eq_table { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e6ecfe013..bd49ffa4e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6467,33 +6467,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? ceqe : NULL; }
-static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_eq *eq) { - struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - irqreturn_t ceqe_found = IRQ_NONE; - u32 cqn; - - while (ceqe) { - /* Make sure we read CEQ entry after we have checked the - * ownership bit - */ - dma_rmb(); - - cqn = hr_reg_read(ceqe, CEQE_CQN); + tasklet_schedule(&eq->tasklet);
- hns_roce_cq_completion(hr_dev, cqn); - - ++eq->cons_index; - ceqe_found = IRQ_HANDLED; - atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); - - ceqe = next_ceqe_sw_v2(eq); - } - - update_eq_db(eq); - - return IRQ_RETVAL(ceqe_found); + return IRQ_HANDLED; }
static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) @@ -6504,7 +6482,7 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr)
if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ - int_work = hns_roce_v2_ceq_int(hr_dev, eq); + int_work = hns_roce_v2_ceq_int(eq); else /* Asynchronous event interrupt */ int_work = hns_roce_v2_aeq_int(hr_dev, eq); @@ -6872,6 +6850,34 @@ free_cmd_mbox: return ret; }
+static void hns_roce_ceq_task(struct tasklet_struct *task) +{ + struct hns_roce_eq *eq = from_tasklet(eq, task, tasklet); + struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); + struct hns_roce_dev *hr_dev = eq->hr_dev; + int ceqe_num = 0; + u32 cqn; + + while (ceqe && ceqe_num < hr_dev->caps.ceqe_depth) { + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = hr_reg_read(ceqe, CEQE_CQN); + + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ++ceqe_num; + atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); + + ceqe = next_ceqe_sw_v2(eq); + } + + update_eq_db(eq); +} + static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, int comp_num, int aeq_num, int other_num) { @@ -6903,21 +6909,24 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, j - other_num - aeq_num);
for (j = 0; j < irq_num; j++) { - if (j < other_num) + if (j < other_num) { ret = request_irq(hr_dev->irq[j], hns_roce_v2_msix_interrupt_abn, 0, hr_dev->irq_names[j], hr_dev); - - else if (j < (other_num + comp_num)) + } else if (j < (other_num + comp_num)) { + tasklet_setup(&eq_table->eq[j - other_num].tasklet, + hns_roce_ceq_task); ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j + aeq_num], &eq_table->eq[j - other_num]); - else + } else { ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j - comp_num], &eq_table->eq[j - other_num]); + } + if (ret) { dev_err(hr_dev->dev, "request irq error!\n"); goto err_request_failed; @@ -6928,11 +6937,13 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num,
err_request_failed: for (j -= 1; j >= 0; j--) - if (j < other_num) + if (j < other_num) { free_irq(hr_dev->irq[j], hr_dev); - else + } else { free_irq(eq_table->eq[j - other_num].irq, &eq_table->eq[j - other_num]); + tasklet_kill(&eq_table->eq[j - other_num].tasklet); + }
err_kzalloc_failed: for (i -= 1; i >= 0; i--) @@ -6953,8 +6964,11 @@ static void __hns_roce_free_irq(struct hns_roce_dev *hr_dev) for (i = 0; i < hr_dev->caps.num_other_vectors; i++) free_irq(hr_dev->irq[i], hr_dev);
- for (i = 0; i < eq_num; i++) + for (i = 0; i < eq_num; i++) { free_irq(hr_dev->eq_table.eq[i].irq, &hr_dev->eq_table.eq[i]); + if (i < hr_dev->caps.num_comp_vectors) + tasklet_kill(&hr_dev->eq_table.eq[i].tasklet); + }
for (i = 0; i < irq_num; i++) kfree(hr_dev->irq_names[i]);