driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I98HIN
--------------------------------------------------------------------------
Currently, the driver always allocates 4K pages for the user-space WQE buffer, even on a 64K page-size system. As a result, HW reads WQEs at a 4K granularity even on a 64K system. Since we support 1024-byte inline data, when SQ inline is used the HW has to switch pages every 4 WQEs. Each page switch introduces a delay of about 400ns, which averages out to 100ns per packet.
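For reference, a rough sketch of the arithmetic behind these figures (the 400ns page-switch cost is the measured value quoted above, not derived; the WQE size assumes the 1024-byte inline case):

#include <stdio.h>

int main(void)
{
	const unsigned int page_size = 4096;	  /* fixed WQE buffer page size today */
	const unsigned int wqe_size = 1024;	  /* WQE carrying 1024-byte inline data */
	const unsigned int page_switch_ns = 400;  /* measured HW page-switch delay */
	unsigned int wqes_per_page = page_size / wqe_size;		  /* 4 */
	unsigned int avg_ns_per_packet = page_switch_ns / wqes_per_page;  /* 100 */

	printf("page switch every %u WQEs, ~%u ns extra per packet\n",
	       wqes_per_page, avg_ns_per_packet);
	return 0;
}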
To improve performance, allow the user-mode driver to use a larger page size to allocate WQE buffers, thereby reducing the latency introduced by HW page switching. The user-mode driver is allowed to allocate WQE buffers with a page size between 4K and the system page size. During ibv_create_qp(), the driver dynamically selects an appropriate page size based on ibv_qp_cap, reducing memory consumption while improving performance.
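Conceptually, the selection picks the smallest power-of-two page size that covers each of the SQ, extended-SGE and RQ WQE regions, clamped between the 4K HW page and the system page. A simplified, standalone sketch of that policy (plain C stand-ins for the provider's hr_ilog32()/max_t() helpers; see get_best_multi_region_pg_shift() in the diff below for the real code):

#include <stdint.h>
#include <stdio.h>

#define HW_PAGE_SHIFT 12	/* 4K, the minimum WQE buffer page size */

/* smallest shift such that (1 << shift) >= size, i.e. ceil(log2(size)) */
static uint8_t shift_for(uint32_t size)
{
	uint8_t shift = 0;

	while (((uint64_t)1 << shift) < size)
		shift++;
	return shift;
}

/* Pick a page shift covering every WQE region, capped at the system page. */
static uint8_t pick_qp_page_shift(uint32_t sq_size, uint32_t ext_sge_size,
				  uint32_t rq_size, uint8_t sys_page_shift)
{
	uint8_t shift = HW_PAGE_SHIFT;

	if (sq_size && shift_for(sq_size) > shift)
		shift = shift_for(sq_size);
	if (ext_sge_size && shift_for(ext_sge_size) > shift)
		shift = shift_for(ext_sge_size);
	if (rq_size && shift_for(rq_size) > shift)
		shift = shift_for(rq_size);

	return shift < sys_page_shift ? shift : sys_page_shift;
}

int main(void)
{
	/* e.g. 32K SQ, 16K ext-SGE, 8K RQ on a 64K-page system -> 32K pages */
	printf("page shift = %u\n",
	       (unsigned int)pick_qp_page_shift(32768, 16384, 8192, 16));
	return 0;
}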
This feature must be used together with the kernel-mode driver. To ensure forward compatibility, if the kernel-mode driver does not support this feature, the user-mode driver keeps using a fixed 4K page size to allocate the WQE buffer.
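Roughly, the fallback reduces to checking the flag the kernel echoes back at ucontext allocation (a toy illustration; RSP_DYN_QP_PGSZ is a local stand-in for HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ from the ABI header below, and the ucontext plumbing is omitted):

#include <stdint.h>
#include <stdio.h>

#define HNS_HW_PAGE_SHIFT 12		/* the old fixed 4K behaviour */
#define RSP_DYN_QP_PGSZ (1U << 4)	/* stand-in for the kernel's ack flag */

/* Use the requested shift only when the kernel acknowledged the feature. */
static unsigned int effective_page_shift(uint32_t uctx_resp_flags,
					 unsigned int wanted_shift)
{
	return (uctx_resp_flags & RSP_DYN_QP_PGSZ) ?
	       wanted_shift : HNS_HW_PAGE_SHIFT;
}

int main(void)
{
	printf("old kernel: shift %u, new kernel: shift %u\n",
	       effective_page_shift(0, 16),
	       effective_page_shift(RSP_DYN_QP_PGSZ, 16));
	return 0;
}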
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
---
 kernel-headers/rdma/hns-abi.h    |  5 +++-
 providers/hns/hns_roce_u.h       |  1 +
 providers/hns/hns_roce_u_verbs.c | 51 ++++++++++++++++++++++++++++----
 3 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h
index 39ed8a4..f33d876 100644
--- a/kernel-headers/rdma/hns-abi.h
+++ b/kernel-headers/rdma/hns-abi.h
@@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
 	__u8    log_sq_bb_count;
 	__u8    log_sq_stride;
 	__u8    sq_no_prefetch;
-	__u8    reserved[5];
+	__u8    pageshift;
+	__u8    reserved[4];
 	__aligned_u64 sdb_addr;
 	__aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
 	__aligned_u64 create_flags;
@@ -119,12 +120,14 @@ enum {
 	HNS_ROCE_EXSGE_FLAGS = 1 << 0,
 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
+	HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
 };
 
 enum {
 	HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0,
 	HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
+	HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
 };
 
 struct hns_roce_ib_alloc_ucontext_resp {
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index b02fb22..1440464 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -350,6 +350,7 @@ struct hns_roce_qp {
 	uint8_t sl;
 	uint8_t tc_mode;
 	uint8_t priority;
+	uint8_t pageshift;
 	unsigned int qkey;
 	enum ibv_mtu path_mtu;
 
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
index 04201e7..4aa5a3c 100644
--- a/providers/hns/hns_roce_u_verbs.c
+++ b/providers/hns/hns_roce_u_verbs.c
@@ -1174,31 +1174,69 @@ static void free_recv_rinl_buf(struct hns_roce_rinl_buf *rinl_buf)
 	}
 }
 
+static void get_best_multi_region_pg_shift(struct hns_roce_device *hr_dev,
+					   struct hns_roce_context *ctx,
+					   struct hns_roce_qp *qp)
+{
+	uint32_t ext_sge_size;
+	uint32_t sq_size;
+	uint32_t rq_size;
+	uint8_t pg_shift;
+
+	if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ)) {
+		qp->pageshift = HNS_HW_PAGE_SHIFT;
+		return;
+	}
+
+	/*
+	 * The larger the pagesize used, the better the performance, but it
+	 * may waste more memory. Therefore, we use the least common multiple
+	 * (aligned to power of 2) of sq wqe buffer size, rq wqe buffer size,
+	 * and ext_sge buffer size as the pagesize. Additionally, since the
+	 * kernel cannot guarantee the allocation of contiguous memory larger
+	 * than the system page, the pagesize must be smaller than the system
+	 * page.
+	 */
+	sq_size = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+	ext_sge_size = qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift;
+	rq_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+
+	pg_shift = max_t(uint8_t, sq_size ? hr_ilog32(sq_size) : 0,
+			 ext_sge_size ? hr_ilog32(ext_sge_size) : 0);
+	pg_shift = max_t(uint8_t, pg_shift, rq_size ? hr_ilog32(rq_size) : 0);
+	pg_shift = max_t(uint8_t, pg_shift, HNS_HW_PAGE_SHIFT);
+	qp->pageshift = min_t(uint8_t, pg_shift, hr_ilog32(hr_dev->page_size));
+}
+
 static int calc_qp_buff_size(struct hns_roce_device *hr_dev,
+			     struct hns_roce_context *ctx,
 			     struct hns_roce_qp *qp)
 {
 	struct hns_roce_wq *sq = &qp->sq;
 	struct hns_roce_wq *rq = &qp->rq;
+	unsigned int page_size;
 	unsigned int size;
 
 	qp->buf_size = 0;
+	get_best_multi_region_pg_shift(hr_dev, ctx, qp);
+	page_size = 1 << qp->pageshift;
 
 	/* SQ WQE */
 	sq->offset = 0;
-	size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift);
+	size = align(sq->wqe_cnt << sq->wqe_shift, page_size);
 	qp->buf_size += size;
 
 	/* extend SGE WQE in SQ */
 	qp->ex_sge.offset = qp->buf_size;
 	if (qp->ex_sge.sge_cnt > 0) {
-		size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt,
-					      qp->ex_sge.sge_shift);
+		size = align(qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift,
+			     page_size);
 		qp->buf_size += size;
 	}
 
 	/* RQ WQE */
 	rq->offset = qp->buf_size;
-	size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift);
+	size = align(rq->wqe_cnt << rq->wqe_shift, page_size);
 	qp->buf_size += size;
 
 	if (qp->buf_size < 1)
@@ -1223,7 +1261,7 @@ static int qp_alloc_wqe(struct ibv_qp_cap *cap, struct hns_roce_qp *qp,
 {
 	struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device);
 
-	if (calc_qp_buff_size(hr_dev, qp))
+	if (calc_qp_buff_size(hr_dev, ctx, qp))
 		return -EINVAL;
 
 	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
@@ -1241,7 +1279,7 @@ static int qp_alloc_wqe(struct ibv_qp_cap *cap, struct hns_roce_qp *qp,
 		goto err_alloc;
 	}
 
-	if (hns_roce_alloc_buf(&qp->buf, qp->buf_size, HNS_HW_PAGE_SIZE))
+	if (hns_roce_alloc_buf(&qp->buf, qp->buf_size, 1 << qp->pageshift))
 		goto err_alloc;
 
 	return 0;
@@ -1478,6 +1516,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
 	cmd_ex.buf_addr = (uintptr_t)qp->buf.buf;
 	cmd_ex.log_sq_stride = qp->sq.wqe_shift;
 	cmd_ex.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
+	cmd_ex.pageshift = qp->pageshift;
 
 	if (hns_attr &&
 	    hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CONGEST_TYPE) {