From: wanglin <wanglin137@huawei.com>
driver inclusion
category: bugfix
bugzilla: NA
CVE: NA
The RQ/SRQ of HIP08 needs one special SGE to stop receiving reliably. So the
driver needs to allocate at least one extra SGE when creating an RQ/SRQ and
ensure that at least one SGE is filled with the special value during
post_recv.

Besides, the kernel driver should only do this for kernel ULPs. For a
userspace ULP, the userspace driver allocates the reserved SGE in its own
buffer, and the kernel driver just needs to pin memory of the corresponding
size based on the userspace driver's request.
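The effect on a receive WQE can be sketched as follows (illustrative only:
fill_rsv_sge() is a hypothetical helper invented for this note, while the
field names and the HNS_ROCE_INVALID_* sentinels match the driver; the real
code open-codes this in hns_roce_v2_post_recv() and
hns_roce_v2_post_srq_recv()):

	static void fill_rsv_sge(struct hns_roce_v2_wqe_data_seg *dseg)
	{
		/* Sentinel SGE: tells HIP08 to stop fetching SGEs here */
		dseg->lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY);
		dseg->addr = 0;
		dseg->len = cpu_to_le32(HNS_ROCE_INVALID_SGE_LENGTH);
	}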
Reviewed-by: Hu Chunzhi <huchunzhi@huawei.com>
Reviewed-by: Zhao Weibo <zhaoweibo3@huawei.com>
Signed-off-by: wanglin <wanglin137@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  2 +
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 24 +++++++-----
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  6 +--
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 33 +++++++++++++++--
 drivers/infiniband/hw/hns/hns_roce_srq.c    | 41 +++++++++++++++++----
 5 files changed, 83 insertions(+), 23 deletions(-)
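A worked example of the resulting SGE accounting (illustrative values,
assuming a kernel ULP on HIP08, where rsv_sge is 1):

	u32 rsv_sge = 1;	/* set by proc_rq_sge() for kernel ULPs */
	u32 requested = 3;	/* cap->max_recv_sge asked for by the ULP */
	u32 max_gs = roundup_pow_of_two(requested + rsv_sge); /* 4 WQE slots */
	u32 reported = max_gs - rsv_sge; /* 3 usable SGEs reported back */

post_recv then accepts up to max_gs - rsv_sge SGEs per WR and fills the
remaining reserved slot with the invalid sentinel.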
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 533df3c68d2ff..c5edeaa6ee8e8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -493,6 +493,7 @@ struct hns_roce_wq {
 	int		wqe_cnt;  /* WQE num */
 	u32		max_post;
 	int		max_gs;
+	u32		rsv_sge;
 	int		offset;
 	int		wqe_shift;  /* WQE size */
 	u32		head;
@@ -601,6 +602,7 @@ struct hns_roce_srq {
 	unsigned long		srqn;
 	int			max;
 	int			max_gs;
+	u32			rsv_sge;
 	int			wqe_shift;
 	void __iomem		*db_reg_l;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index aa0fe9fd7d352..d3a70dd82d337 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -742,6 +742,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 	struct device *dev = hr_dev->dev;
 	unsigned long flags = 0;
 	void *wqe = NULL;
+	u32 max_sge;
 	int ret = 0;
 	int nreq;
 	int ind;
@@ -766,6 +767,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 		return -EINVAL;
 	}
 
+	max_sge = hr_qp->rq.max_gs - hr_qp->rq.rsv_sge;
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		if (hns_roce_wq_overflow(&hr_qp->rq, nreq,
 					 hr_qp->ibqp.recv_cq)) {
@@ -774,9 +776,9 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			goto out;
 		}
 
-		if (unlikely(wr->num_sge >= hr_qp->rq.max_gs)) {
-			dev_err(dev, "RQ: sge num(%d) is larger or equal than max sge num(%d)\n",
-				wr->num_sge, hr_qp->rq.max_gs);
+		if (unlikely(wr->num_sge > max_sge)) {
+			dev_err(dev, "RQ: sge num(%d) is larger than max sge num(%d)\n",
+				wr->num_sge, max_sge);
 			ret = -EINVAL;
 			*bad_wr = wr;
 			goto out;
@@ -791,7 +793,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			dseg++;
 		}
 
-		if (wr->num_sge < hr_qp->rq.max_gs) {
+		if (hr_qp->rq.rsv_sge) {
 			dseg->lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY);
 			dseg->addr = 0;
 			dseg->len = cpu_to_le32(HNS_ROCE_INVALID_SGE_LENGTH);
@@ -1985,10 +1987,12 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
 	caps->max_sq_sg = le16_to_cpu(resp_a->max_sq_sg);
 	caps->max_sq_inline = le16_to_cpu(resp_a->max_sq_inline);
 	caps->max_rq_sg = le16_to_cpu(resp_a->max_rq_sg);
+	caps->max_rq_sg = roundup_pow_of_two(caps->max_rq_sg);
 	caps->max_extend_sg = le32_to_cpu(resp_a->max_extend_sg);
 	caps->num_qpc_timer = le16_to_cpu(resp_a->num_qpc_timer);
 	caps->num_cqc_timer = le16_to_cpu(resp_a->num_cqc_timer);
 	caps->max_srq_sges = le16_to_cpu(resp_a->max_srq_sges);
+	caps->max_srq_sges = roundup_pow_of_two(caps->max_srq_sges);
 	caps->num_aeq_vectors = resp_a->num_aeq_vectors;
 	caps->num_other_vectors = resp_a->num_other_vectors;
 	caps->max_sq_desc_sz = resp_a->max_sq_desc_sz;
@@ -5429,7 +5433,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 done:
 	qp_attr->cur_qp_state = qp_attr->qp_state;
 	qp_attr->cap.max_recv_wr = hr_qp->rq.wqe_cnt;
-	qp_attr->cap.max_recv_sge = hr_qp->rq.max_gs;
+	qp_attr->cap.max_recv_sge = hr_qp->rq.max_gs - hr_qp->rq.rsv_sge;
 
 	if (!ibqp->uobject) {
 		qp_attr->cap.max_send_wr = hr_qp->sq.wqe_cnt;
@@ -7051,7 +7055,7 @@ int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
 	attr->srq_limit = limit_wl;
 	attr->max_wr = srq->max - 1;
-	attr->max_sge = srq->max_gs - HNS_ROCE_RESERVED_SGE;
+	attr->max_sge = srq->max_gs - srq->rsv_sge;
 
 	memcpy(srq_context, mailbox->buf, sizeof(*srq_context));
@@ -7101,6 +7105,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
 	unsigned long flags;
 	int ret = 0;
 	int wqe_idx;
+	u32 max_sge;
 	void *wqe;
 	int nreq;
 	int ind;
@@ -7109,11 +7114,12 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
 	spin_lock_irqsave(&srq->lock, flags);
 
 	ind = srq->head & (srq->max - 1);
+	max_sge = srq->max_gs - srq->rsv_sge;
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
-		if (unlikely(wr->num_sge >= srq->max_gs)) {
+		if (unlikely(wr->num_sge > max_sge)) {
 			dev_err(hr_dev->dev,
 				"srq(0x%lx) wr sge num(%d) exceed the max num %d.\n",
-				srq->srqn, wr->num_sge, srq->max_gs);
+				srq->srqn, wr->num_sge, max_sge);
 			ret = -EINVAL;
 			*bad_wr = wr;
 			break;
@@ -7138,7 +7144,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
 			dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr);
 		}
 
-		if (wr->num_sge < srq->max_gs) {
+		if (srq->rsv_sge) {
 			dseg[i].len = cpu_to_le32(HNS_ROCE_INVALID_SGE_LENGTH);
 			dseg[i].lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY);
 			dseg[i].addr = 0;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index dad3ac0b4731c..11318105826a5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -58,16 +58,16 @@
 #define HNS_ROCE_V2_MAX_WQE_NUM			0x8000
 #define HNS_ROCE_V2_MAX_SRQ			0x100000
 #define HNS_ROCE_V2_MAX_SRQ_WR			0x8000
-#define HNS_ROCE_V2_MAX_SRQ_SGE			0xff
+#define HNS_ROCE_V2_MAX_SRQ_SGE			0x100
 #define HNS_ROCE_V2_MAX_CQ_NUM			0x100000
 #define HNS_ROCE_V2_MAX_CQC_TIMER_NUM		0x100
 #define HNS_ROCE_V2_MAX_SRQ_NUM			0x100000
 #define HNS_ROCE_V2_MAX_CQE_NUM			0x400000
 #define HNS_ROCE_V2_MAX_SRQWQE_NUM		0x8000
 /* reserve one sge to circumvent a hardware issue */
-#define HNS_ROCE_V2_MAX_RQ_SGE_NUM		0xff
+#define HNS_ROCE_V2_MAX_RQ_SGE_NUM		0x100
 #define HNS_ROCE_V2_MAX_SQ_SGE_NUM		0xff
-#define HNS_ROCE_V2_MAX_SRQ_SGE_NUM		0xff
+#define HNS_ROCE_V2_MAX_SRQ_SGE_NUM		0x100
 #define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM		0x200000
 #define HNS_ROCE_V2_MAX_SQ_INLINE		0x20
 #define HNS_ROCE_V2_UAR_NUM			256
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 520213d6461ed..e0f2413c2a0a3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -344,17 +344,42 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
 }
 EXPORT_SYMBOL_GPL(hns_roce_release_range_qp);
 
+static u32 proc_rq_sge(struct hns_roce_dev *dev, struct hns_roce_qp *hr_qp,
+		       int user)
+{
+	u32 max_sge = dev->caps.max_rq_sg;
+
+	if (dev->pci_dev->revision > PCI_REVISION_ID_HIP08_B)
+		return max_sge;
+
+	/* Reserve SGEs only for HIP08 in kernel; The userspace driver will
+	 * calculate number of max_sge with reserved SGEs when allocating wqe
+	 * buf, so there is no need to do this again in kernel. But the number
+	 * may exceed the capacity of SGEs recorded in the firmware, so the
+	 * kernel driver should just adapt the value accordingly.
+	 */
+	if (user)
+		max_sge = roundup_pow_of_two(max_sge + 1);
+	else
+		hr_qp->rq.rsv_sge = 1;
+
+	return max_sge;
+}
+
+
 static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
 				struct ib_qp_cap *cap, int is_user, int has_rq,
 				struct hns_roce_qp *hr_qp)
 {
+	u32 max_sge = proc_rq_sge(hr_dev, hr_qp, is_user);
 	struct device *dev = hr_dev->dev;
 	u32 max_cnt;
 
 	/* Check the validity of QP support capacity */
 	if (cap->max_recv_wr > hr_dev->caps.max_wqes ||
-	    cap->max_recv_sge > hr_dev->caps.max_rq_sg) {
-		dev_err(dev, "RQ(0x%lx) WR or sge error!max_recv_wr=%d max_recv_sge=%d\n",
+	    cap->max_recv_sge > max_sge) {
+		dev_err(dev, "RQ(0x%lx) WR or sge error, depth = %u, sge = %u\n",
 			hr_qp->qpn, cap->max_recv_wr, cap->max_recv_sge);
 		return -EINVAL;
 	}
@@ -386,7 +411,7 @@ static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
 		max_cnt = max(1U, cap->max_recv_sge);
 		hr_qp->rq.max_gs = roundup_pow_of_two(max_cnt +
-						      HNS_ROCE_RESERVED_SGE);
+						      hr_qp->rq.rsv_sge);
 
 		if (hr_dev->caps.max_rq_sg <= HNS_ROCE_MAX_SGE_NUM)
 			hr_qp->rq.wqe_shift =
 				ilog2(hr_dev->caps.max_rq_desc_sz);
@@ -397,7 +422,7 @@ static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
 	}
 
 	cap->max_recv_wr = hr_qp->rq.max_post = hr_qp->rq.wqe_cnt;
-	cap->max_recv_sge = hr_qp->rq.max_gs - HNS_ROCE_RESERVED_SGE;
+	cap->max_recv_sge = hr_qp->rq.max_gs - hr_qp->rq.rsv_sge;
 
 	return 0;
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 1e17484d3676a..9d4aec18be0fc 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -30,7 +30,7 @@
  * SOFTWARE.
  */
 #include "roce_k_compat.h"
-
+#include <linux/pci.h>
 #include <rdma/ib_umem.h>
 #include <rdma/hns-abi.h>
 #include "hns_roce_device.h"
@@ -431,6 +431,28 @@ static void destroy_kernel_srq(struct hns_roce_dev *hr_dev,
 	hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
 }
 
+static u32 proc_srq_sge(struct hns_roce_dev *dev, struct hns_roce_srq *hr_srq,
+			bool user)
+{
+	u32 max_sge = dev->caps.max_srq_sges;
+
+	if (dev->pci_dev->revision > PCI_REVISION_ID_HIP08_B)
+		return max_sge;
+
+	/* Reserve SGEs only for HIP08 in kernel; The userspace driver will
+	 * calculate number of max_sge with reserved SGEs when allocating wqe
+	 * buf, so there is no need to do this again in kernel. But the number
+	 * may exceed the capacity of SGEs recorded in the firmware, so the
+	 * kernel driver should just adapt the value accordingly.
+	 */
+	if (user)
+		max_sge = roundup_pow_of_two(max_sge + 1);
+	else
+		hr_srq->rsv_sge = 1;
+
+	return max_sge;
+}
+
 struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
				   struct ib_srq_init_attr *srq_init_attr,
				   struct ib_udata *udata)
@@ -439,23 +461,26 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
 	struct hns_roce_srq *srq;
 	int srq_desc_size;
 	int srq_buf_size;
+	u32 max_sge;
 	int ret;
 	u32 cqn;
 
-	/* Check the actual SRQ wqe and SRQ sge num */
-	if (srq_init_attr->attr.max_wr >= hr_dev->caps.max_srq_wrs ||
-	    srq_init_attr->attr.max_sge > hr_dev->caps.max_srq_sges)
-		return ERR_PTR(-EINVAL);
-
 	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
 	if (!srq)
 		return ERR_PTR(-ENOMEM);
 
+	max_sge = proc_srq_sge(hr_dev, srq, !!udata);
+
+	if (srq_init_attr->attr.max_wr >= hr_dev->caps.max_srq_wrs ||
+	    srq_init_attr->attr.max_sge > max_sge)
+		return ERR_PTR(-EINVAL);
+
 	mutex_init(&srq->mutex);
 	spin_lock_init(&srq->lock);
 
 	srq->max = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1);
-	srq->max_gs = srq_init_attr->attr.max_sge + HNS_ROCE_RESERVED_SGE;
+	srq->max_gs =
+		roundup_pow_of_two(srq_init_attr->attr.max_sge + srq->rsv_sge);
 
 	srq_desc_size = max(HNS_ROCE_SGE_SIZE, HNS_ROCE_SGE_SIZE * srq->max_gs);
 	srq_desc_size = roundup_pow_of_two(srq_desc_size);
@@ -499,6 +524,8 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
 
 	srq->event = hns_roce_ib_srq_event;
 	srq->ibsrq.ext.xrc.srq_num = srq->srqn;
+	srq_init_attr->attr.max_wr = srq->max;
+	srq_init_attr->attr.max_sge = srq->max_gs - srq->rsv_sge;
 
 	if (pd->uobject) {
 		if (ib_copy_to_udata(udata, &srq->srqn, sizeof(__u32))) {