driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I87LTM
--------------------------------------------------------------------------
Currently, the driver always allocates the user-space WQE buffer in 4K pages, even on a system with a 64K page size. As a result, HW reads WQEs at a 4K granularity even on a 64K system. Since inline WQEs of up to 1024 bytes are supported, with SQ inline in use the HW has to switch pages every 4 WQEs (a 4K page holds only four 1KB inline WQEs). Each page switch introduces a delay of about 400ns, which amortizes to an average of about 100ns per packet.
To improve performance, allow the user-mode driver to allocate WQE buffers with a larger page size, reducing the latency introduced by HW page switching. The user-mode driver may allocate WQE buffers with any page size between 4K and the system page size. During ibv_create_qp(), the driver dynamically selects an appropriate page size based on ibv_qp_cap, improving performance while limiting memory consumption.
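To illustrate the selection policy (this is only a sketch with hypothetical helper names, not the provider code; the real logic is get_best_multi_region_pg_shift() in the diff below): the chosen page shift is the smallest power of two that covers each of the SQ, extended SGE and RQ regions, clamped to the range from 4K to the system page size.

#include <stdint.h>
#include <unistd.h>

#define EXAMPLE_HW_PAGE_SHIFT 12	/* 4K, the minimum HW page size */

/* smallest shift s such that (1 << s) >= size */
static uint8_t covering_shift(uint64_t size)
{
	uint8_t shift = 0;

	while ((1ULL << shift) < size)
		shift++;
	return shift;
}

static uint8_t select_wqe_page_shift(uint32_t sq_size, uint32_t sge_size,
				     uint32_t rq_size)
{
	uint8_t sys_shift = covering_shift((uint64_t)sysconf(_SC_PAGESIZE));
	uint8_t shift = EXAMPLE_HW_PAGE_SHIFT;

	if (sq_size && covering_shift(sq_size) > shift)
		shift = covering_shift(sq_size);
	if (sge_size && covering_shift(sge_size) > shift)
		shift = covering_shift(sge_size);
	if (rq_size && covering_shift(rq_size) > shift)
		shift = covering_shift(rq_size);

	/* Buffers larger than a system page may not be physically
	 * contiguous, so never exceed the system page size. */
	return shift < sys_shift ? shift : sys_shift;
}

For example, a QP whose SQ needs 32KB, extended SGE area 8KB and RQ 16KB would use 32KB pages on a 64K-page system, and stays at 4K pages when the kernel does not advertise the capability.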
This feature must be used in conjunction with the kernel-mode driver. To ensure forward compatibility, if the kernel-mode driver does not support this feature, the user-mode driver falls back to the fixed 4K page size for the WQE buffer.
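Nothing changes at the verbs API level: applications still call ibv_create_qp(), and the page size is derived from the requested capabilities. A typical call site (the capability values below are only illustrative) looks like:

#include <infiniband/verbs.h>

/* pd and cq are assumed to have been created by the usual verbs calls. */
static struct ibv_qp *create_example_qp(struct ibv_pd *pd, struct ibv_cq *cq)
{
	struct ibv_qp_init_attr init_attr = {
		.send_cq = cq,
		.recv_cq = cq,
		.qp_type = IBV_QPT_RC,
		.cap = {
			.max_send_wr     = 128,
			.max_recv_wr     = 128,
			.max_send_sge    = 2,
			.max_recv_sge    = 2,
			.max_inline_data = 1024, /* SQ inline, as discussed above */
		},
	};

	return ibv_create_qp(pd, &init_attr);
}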
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
---
 kernel-headers/rdma/hns-abi.h    |  5 ++-
 providers/hns/hns_roce_u.c       |  2 +-
 providers/hns/hns_roce_u.h       |  1 +
 providers/hns/hns_roce_u_verbs.c | 65 ++++++++++++++++++++++++++------
 4 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h
index cab941f..157dc9d 100644
--- a/kernel-headers/rdma/hns-abi.h
+++ b/kernel-headers/rdma/hns-abi.h
@@ -81,7 +81,8 @@ struct hns_roce_ib_create_qp {
 	__u8 log_sq_bb_count;
 	__u8 log_sq_stride;
 	__u8 sq_no_prefetch;
-	__u8 reserved[5];
+	__u8 reserved[4];
+	__u8 pageshift;
 	__aligned_u64 sdb_addr;
 	__aligned_u64 comp_mask;
 	__aligned_u64 create_flags;
@@ -122,6 +123,7 @@ enum {
 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
 	HNS_ROCE_UCTX_CONFIG_DCA = 1 << 3,
+	HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
 };

 enum {
@@ -129,6 +131,7 @@
 	HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
 	HNS_ROCE_UCTX_RSP_DCA_FLAGS = HNS_ROCE_UCTX_CONFIG_DCA,
+	HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
 };

 struct hns_roce_ib_alloc_ucontext_resp {
diff --git a/providers/hns/hns_roce_u.c b/providers/hns/hns_roce_u.c
index 0660081..02ad880 100644
--- a/providers/hns/hns_roce_u.c
+++ b/providers/hns/hns_roce_u.c
@@ -267,7 +267,7 @@ static void ucontext_set_cmd(struct hns_roce_alloc_ucontext *cmd,
 			     struct hnsdv_context_attr *attr)
 {
 	cmd->config |= HNS_ROCE_EXSGE_FLAGS | HNS_ROCE_RQ_INLINE_FLAGS |
-		       HNS_ROCE_CQE_INLINE_FLAGS;
+		       HNS_ROCE_CQE_INLINE_FLAGS | HNS_ROCE_UCTX_DYN_QP_PGSZ;

 	if (!attr || !(attr->flags & HNSDV_CONTEXT_FLAGS_DCA))
 		return;
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index 5501d8e..ae9ae51 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -409,6 +409,7 @@ struct hns_roce_qp {
 	uint8_t sl;
 	uint8_t tc_mode;
 	uint8_t priority;
+	uint8_t pageshift;
 	unsigned int qkey;
 	enum ibv_mtu path_mtu;
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
index 7b58dd0..f76341c 100644
--- a/providers/hns/hns_roce_u_verbs.c
+++ b/providers/hns/hns_roce_u_verbs.c
@@ -1327,31 +1327,69 @@ static void free_recv_rinl_buf(struct hns_roce_rinl_buf *rinl_buf)
 	}
 }

+static void get_best_multi_region_pg_shift(struct hns_roce_device *hr_dev,
+					   struct hns_roce_context *ctx,
+					   struct hns_roce_qp *qp, bool dca_en)
+{
+	uint32_t ext_sge_size;
+	uint32_t sq_size;
+	uint32_t rq_size;
+	uint8_t pg_shift;
+
+	if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ) || dca_en) {
+		qp->pageshift = HNS_HW_PAGE_SHIFT;
+		return;
+	}
+
+	/*
+	 * The larger the pagesize used, the better the performance, but it
+	 * may waste more memory. Therefore, we use the least common multiple
+	 * (aligned to power of 2) of sq wqe buffer size, rq wqe buffer size,
+	 * and ext_sge buffer size as the pagesize. Additionally, since the
+	 * kernel cannot guarantee the allocation of contiguous memory larger
+	 * than the system page, the pagesize must be smaller than the system
+	 * page.
+	 */
+	sq_size = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+	ext_sge_size = qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift;
+	rq_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+
+	pg_shift = max_t(uint8_t, sq_size ? hr_ilog32(sq_size) : 0,
+			 ext_sge_size ? hr_ilog32(ext_sge_size) : 0);
+	pg_shift = max_t(uint8_t, pg_shift, rq_size ? hr_ilog32(rq_size) : 0);
+	pg_shift = max_t(uint8_t, pg_shift, HNS_HW_PAGE_SHIFT);
+	qp->pageshift = min_t(uint8_t, pg_shift, hr_ilog32(hr_dev->page_size));
+}
+
 static int calc_qp_buff_size(struct hns_roce_device *hr_dev,
-			     struct hns_roce_qp *qp)
+			     struct hns_roce_context *ctx,
+			     struct hns_roce_qp *qp, bool dca_en)
 {
 	struct hns_roce_wq *sq = &qp->sq;
 	struct hns_roce_wq *rq = &qp->rq;
+	unsigned int page_size;
 	unsigned int size;

 	qp->buf_size = 0;
+	get_best_multi_region_pg_shift(hr_dev, ctx, qp, dca_en);
+	page_size = 1 << qp->pageshift;

 	/* SQ WQE */
 	sq->offset = 0;
-	size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift);
+	size = align(sq->wqe_cnt << sq->wqe_shift, page_size);
 	qp->buf_size += size;

 	/* extend SGE WQE in SQ */
 	qp->ex_sge.offset = qp->buf_size;
 	if (qp->ex_sge.sge_cnt > 0) {
-		size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt,
-					      qp->ex_sge.sge_shift);
+		size = align(qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift,
+			     page_size);
 		qp->buf_size += size;
 	}

 	/* RQ WQE */
 	rq->offset = qp->buf_size;
-	size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift);
+	size = align(rq->wqe_cnt << rq->wqe_shift, page_size);
 	qp->buf_size += size;

 	if (qp->buf_size < 1)
@@ -1375,7 +1413,7 @@ static inline bool check_qp_support_dca(struct hns_roce_dca_ctx *dca_ctx,
 	if (hns_attr &&
 	    (hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS) &&
 	    (hns_attr->create_flags & HNSDV_QP_CREATE_ENABLE_DCA_MODE))
-		return true;
+		return dca_ctx->max_size > 0;

 	return false;
 }
@@ -1396,9 +1434,12 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
 			struct hns_roce_qp *qp, struct hns_roce_context *ctx)
 {
 	struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device);
+	bool dca_en = check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr);
+	int ret;

-	if (calc_qp_buff_size(hr_dev, qp))
-		return -EINVAL;
+	ret = calc_qp_buff_size(hr_dev, ctx, qp, dca_en);
+	if (ret)
+		return ret;

 	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
 	if (!qp->sq.wrid)
@@ -1416,19 +1457,18 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
 			goto err_alloc;
 	}

-	if (check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr) &&
-	    ctx->dca_ctx.max_size > 0) {
+	if (dca_en) {
 		/* when DCA is enabled, use a buffer list to store page addr */
 		qp->buf.buf = NULL;
 		qp->dca_wqe.max_cnt = hr_hw_page_count(qp->buf_size);
-		qp->dca_wqe.shift = HNS_HW_PAGE_SHIFT;
+		qp->dca_wqe.shift = qp->pageshift;
 		qp->dca_wqe.bufs = calloc(qp->dca_wqe.max_cnt, sizeof(void *));
 		if (!qp->dca_wqe.bufs)
 			goto err_alloc;
 		verbs_debug(&ctx->ibv_ctx, "alloc DCA buf.\n");
 	} else {
 		if (hns_roce_alloc_buf(&qp->buf, qp->buf_size,
-				       HNS_HW_PAGE_SIZE))
+				       1 << qp->pageshift))
 			goto err_alloc;
 	}

@@ -1642,6 +1682,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
 	cmd_ex.buf_addr = (uintptr_t)qp->buf.buf;
 	cmd_ex.log_sq_stride = qp->sq.wqe_shift;
 	cmd_ex.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
+	cmd_ex.pageshift = qp->pageshift;

 	if (cmd_flag->congest_type_flags) {
 		cmd_ex.comp_mask |= HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE;
From: Juan Zhou <zhoujuan51@h-partners.com>
driver inclusion
category: cleanup
bugzilla: https://gitee.com/openeuler/kernel/issues/I87LTM
--------------------------------------------------------------------------
The hns_roce_alloc_ucontext() function contains too much configuration code. Extract this code into a new function, hns_roce_get_uctx_config(), to improve readability.
Signed-off-by: Juan Zhou <zhoujuan51@h-partners.com>
---
 drivers/infiniband/hw/hns/hns_roce_main.c | 58 +++++++++++++----------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 7a4662a8718f..50bdea51a550 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -499,6 +499,38 @@ static u32 get_udca_max_qps(struct hns_roce_dev *hr_dev,
 	return qp_num;
 }

+static void hns_roce_get_uctx_config(struct hns_roce_dev *hr_dev,
+				     struct hns_roce_ucontext *context,
+				     struct hns_roce_ib_alloc_ucontext *ucmd,
+				     struct hns_roce_ib_alloc_ucontext_resp *resp)
+{
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
+		context->config = ucmd->config & HNS_ROCE_EXSGE_FLAGS;
+
+	if (context->config & HNS_ROCE_EXSGE_FLAGS) {
+		resp->config |= HNS_ROCE_RSP_EXSGE_FLAGS;
+		resp->max_inline_data = hr_dev->caps.max_sq_inline;
+	}
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+		context->config |= ucmd->config & HNS_ROCE_RQ_INLINE_FLAGS;
+		if (context->config & HNS_ROCE_RQ_INLINE_FLAGS)
+			resp->config |= HNS_ROCE_RSP_RQ_INLINE_FLAGS;
+	}
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQE_INLINE) {
+		context->config |= ucmd->config & HNS_ROCE_CQE_INLINE_FLAGS;
+		if (context->config & HNS_ROCE_CQE_INLINE_FLAGS)
+			resp->config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS;
+	}
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE) {
+		context->config |= ucmd->config & HNS_ROCE_UCTX_CONFIG_DCA;
+		if (context->config & HNS_ROCE_UCTX_CONFIG_DCA)
+			resp->config |= HNS_ROCE_UCTX_RSP_DCA_FLAGS;
+	}
+}
+
 static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 				   struct ib_udata *udata)
 {
@@ -519,31 +551,7 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 	if (ret)
 		goto error_fail_uar_alloc;

-	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
-		context->config = ucmd.config & HNS_ROCE_EXSGE_FLAGS;
-
-	if (context->config & HNS_ROCE_EXSGE_FLAGS) {
-		resp.config |= HNS_ROCE_RSP_EXSGE_FLAGS;
-		resp.max_inline_data = hr_dev->caps.max_sq_inline;
-	}
-
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
-		context->config |= ucmd.config & HNS_ROCE_RQ_INLINE_FLAGS;
-		if (context->config & HNS_ROCE_RQ_INLINE_FLAGS)
-			resp.config |= HNS_ROCE_RSP_RQ_INLINE_FLAGS;
-	}
-
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQE_INLINE) {
-		context->config |= ucmd.config & HNS_ROCE_CQE_INLINE_FLAGS;
-		if (context->config & HNS_ROCE_CQE_INLINE_FLAGS)
-			resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS;
-	}
-
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE) {
-		context->config |= ucmd.config & HNS_ROCE_UCTX_CONFIG_DCA;
-		if (context->config & HNS_ROCE_UCTX_CONFIG_DCA)
-			resp.config |= HNS_ROCE_UCTX_RSP_DCA_FLAGS;
-	}
+	hns_roce_get_uctx_config(hr_dev, context, &ucmd, &resp);

 	ret = hns_roce_uar_alloc(hr_dev, &context->uar);
 	if (ret)
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I87LTM
--------------------------------------------------------------------------
Currently, the driver always allocates the user-space WQE buffer in 4K pages, even on a system with a 64K page size. As a result, HW reads WQEs at a 4K granularity even on a 64K system. Since inline WQEs of up to 1024 bytes are supported, with SQ inline in use the HW has to switch pages every 4 WQEs (a 4K page holds only four 1KB inline WQEs). Each page switch introduces a delay of about 400ns, which amortizes to an average of about 100ns per packet.
To improve performance, allow the user-mode driver to use a more flexible WQE buffer page size allocation strategy, letting it configure the WQE buffer with any page size between 4K and the system PAGE_SIZE.
This feature must be used in conjunction with the user-mode driver. To ensure forward compatibility, if the user-mode driver does not support this feature, the kernel-mode driver continues to use the fixed 4K page size.
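Taken together with the userspace patch, the compatibility handling boils down to the checks below (a condensed restatement of the hns_roce_get_uctx_config() and alloc_qp_wqe()/set_wqe_buf_attr() changes in the diff that follows, shown here only for clarity):

/* ucontext negotiation: report the capability only if userspace asked for it */
if (ucmd->config & HNS_ROCE_UCTX_DYN_QP_PGSZ) {
	context->config |= HNS_ROCE_UCTX_DYN_QP_PGSZ;
	resp->config |= HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ;
}

/* QP creation: honour the user-supplied shift only after negotiation,
 * otherwise keep the fixed 4K default, and reject out-of-range values
 */
u8 page_shift = HNS_HW_PAGE_SHIFT;

if (uctx && (uctx->config & HNS_ROCE_UCTX_DYN_QP_PGSZ))
	page_shift = ucmd->pageshift;

if (page_shift > PAGE_SHIFT || page_shift < HNS_HW_PAGE_SHIFT)
	return -EOPNOTSUPP;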
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_main.c |  5 +++
 drivers/infiniband/hw/hns/hns_roce_qp.c   | 55 ++++++++++++-----------
 include/uapi/rdma/hns-abi.h               |  5 ++-
 3 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 50bdea51a550..8f7838463248 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -529,6 +529,11 @@ static void hns_roce_get_uctx_config(struct hns_roce_dev *hr_dev,
 		if (context->config & HNS_ROCE_UCTX_CONFIG_DCA)
 			resp->config |= HNS_ROCE_UCTX_RSP_DCA_FLAGS;
 	}
+
+	if (ucmd->config & HNS_ROCE_UCTX_DYN_QP_PGSZ) {
+		context->config |= HNS_ROCE_UCTX_DYN_QP_PGSZ;
+		resp->config |= HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ;
+	}
 }

 static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 00f82f4b19f6..77de664f9b7c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -672,17 +672,24 @@ static bool check_dca_is_enable(struct hns_roce_dev *hr_dev,

 static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_qp *hr_qp, bool dca_en,
-			    struct hns_roce_buf_attr *buf_attr)
+			    u8 page_shift, struct hns_roce_buf_attr *buf_attr)
 {
+	unsigned int page_size = BIT(page_shift);
 	int buf_size;
 	int idx = 0;

 	hr_qp->buff_size = 0;
-
+	if (page_shift > PAGE_SHIFT || page_shift < HNS_HW_PAGE_SHIFT)
+		return -EOPNOTSUPP;
+	/*
+	 * When enable DCA, there's no need to alloc buffer now, and
+	 * the page shift should be fixed to 4K.
+	 */
+	if (dca_en && page_shift != HNS_HW_PAGE_SHIFT)
+		return -EOPNOTSUPP;
 	/* SQ WQE */
 	hr_qp->sq.offset = 0;
-	buf_size = to_hr_hem_entries_size(hr_qp->sq.wqe_cnt,
-					  hr_qp->sq.wqe_shift);
+	buf_size = ALIGN(hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift, page_size);
 	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
 		buf_attr->region[idx].size = buf_size;
 		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sq_hop_num;
@@ -692,8 +699,7 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,

 	/* extend SGE WQE in SQ */
 	hr_qp->sge.offset = hr_qp->buff_size;
-	buf_size = to_hr_hem_entries_size(hr_qp->sge.sge_cnt,
-					  hr_qp->sge.sge_shift);
+	buf_size = ALIGN(hr_qp->sge.sge_cnt << hr_qp->sge.sge_shift, page_size);
 	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
 		buf_attr->region[idx].size = buf_size;
 		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sge_hop_num;
@@ -703,8 +709,7 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,

 	/* RQ WQE */
 	hr_qp->rq.offset = hr_qp->buff_size;
-	buf_size = to_hr_hem_entries_size(hr_qp->rq.wqe_cnt,
-					  hr_qp->rq.wqe_shift);
+	buf_size = ALIGN(hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift, page_size);
 	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
 		buf_attr->region[idx].size = buf_size;
 		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_rq_hop_num;
@@ -716,19 +721,8 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
 		return -EINVAL;

 	buf_attr->region_count = idx;
-
-	if (dca_en) {
-		/*
-		 * When enable DCA, there's no need to alloc buffer now, and
-		 * the page shift should be fixed to 4K.
-		 */
-		buf_attr->mtt_only = true;
-		buf_attr->page_shift = HNS_HW_PAGE_SHIFT;
-	} else {
-		buf_attr->mtt_only = false;
-		buf_attr->page_shift = HNS_HW_PAGE_SHIFT +
-				       hr_dev->caps.mtt_buf_pg_sz;
-	}
+	buf_attr->mtt_only = dca_en;
+	buf_attr->page_shift = page_shift;

 	return 0;
 }
@@ -834,21 +828,30 @@ static void free_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,

 static int alloc_qp_wqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 			struct ib_qp_init_attr *init_attr,
-			struct ib_udata *udata, unsigned long addr)
+			struct ib_udata *udata,
+			struct hns_roce_ib_create_qp *ucmd)
 {
+	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata,
+					 struct hns_roce_ucontext, ibucontext);
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_buf_attr buf_attr = {};
+	u8 page_shift = HNS_HW_PAGE_SHIFT;
 	bool dca_en;
 	int ret;

-	dca_en = check_dca_is_enable(hr_dev, hr_qp, init_attr, !!udata, addr);
-	ret = set_wqe_buf_attr(hr_dev, hr_qp, dca_en, &buf_attr);
+	if (uctx && (uctx->config & HNS_ROCE_UCTX_DYN_QP_PGSZ))
+		page_shift = ucmd->pageshift;
+
+	dca_en = check_dca_is_enable(hr_dev, hr_qp, init_attr,
+				     !!udata, ucmd->buf_addr);
+	ret = set_wqe_buf_attr(hr_dev, hr_qp, dca_en, page_shift, &buf_attr);
 	if (ret) {
 		ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret);
 		return ret;
 	}

-	ret = alloc_wqe_buf(hr_dev, hr_qp, dca_en, &buf_attr, udata, addr);
+	ret = alloc_wqe_buf(hr_dev, hr_qp, dca_en,
+			    &buf_attr, udata, ucmd->buf_addr);
 	if (ret)
 		ibdev_err(ibdev, "failed to alloc WQE buf, ret = %d.\n", ret);

@@ -1237,7 +1240,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		goto err_qpn;
 	}

-	ret = alloc_qp_wqe(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr);
+	ret = alloc_qp_wqe(hr_dev, hr_qp, init_attr, udata, &ucmd);
 	if (ret) {
 		ibdev_err(ibdev, "failed to alloc QP buffer, ret = %d.\n", ret);
 		goto err_buf;
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index cab941fea327..157dc9d4cd88 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -81,7 +81,8 @@ struct hns_roce_ib_create_qp {
 	__u8 log_sq_bb_count;
 	__u8 log_sq_stride;
 	__u8 sq_no_prefetch;
-	__u8 reserved[5];
+	__u8 reserved[4];
+	__u8 pageshift;
 	__aligned_u64 sdb_addr;
 	__aligned_u64 comp_mask;
 	__aligned_u64 create_flags;
@@ -122,6 +123,7 @@ enum {
 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
 	HNS_ROCE_UCTX_CONFIG_DCA = 1 << 3,
+	HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
 };

 enum {
@@ -129,6 +131,7 @@
 	HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
 	HNS_ROCE_UCTX_RSP_DCA_FLAGS = HNS_ROCE_UCTX_CONFIG_DCA,
+	HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
 };

 struct hns_roce_ib_alloc_ucontext_resp {