From: Chengchang Tang <tangchengchang@huawei.com>
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5Y79T
-------------------------------------------------------------------
Currently, the addresses of the first two pages in the MR are searched, and an exception occurs when there is only one page in the MR.

Fix the number of pages to search.
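For example, a single-page MR has mr->npages == 1, while the pages[] array in set_mtpt_pbl() has ARRAY_SIZE(pages) == 2, so the old call asked hns_roce_mtr_find() for one more page address than the PBL actually holds. Clamping the count, as the diff below does, keeps the lookup in bounds:

	min_t(int, ARRAY_SIZE(pages), mr->npages)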
Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process")
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 87872c6e1977..c6348e520fb1 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3266,7 +3266,8 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev,
 	int i, count;
 
 	count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages,
-				  ARRAY_SIZE(pages), &pbl_ba);
+				  min_t(int, ARRAY_SIZE(pages), mr->npages),
+				  &pbl_ba);
 	if (count < 1) {
 		ibdev_err(ibdev, "failed to find PBL mtr, count = %d.\n",
 			  count);
From: Chengchang Tang <tangchengchang@huawei.com>
mainline inclusion
from mainline-v6.0-rc5
commit 55af9d498556
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5Y79T
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------------------------------
The supported page size for hns is (4K, 128M), not (4K, 2G).
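The cap is a bitmap in which a set bit n means a page size of 2^n is supported, so the two constants decode as follows (illustrative breakdown, not part of the patch):

	/* 0xFFFFF000: bits 12..31 set -> 4K (1 << 12) up to 2G (1 << 31) */
	/* 0x0FFFF000: bits 12..27 set -> 4K (1 << 12) up to 128M (1 << 27) */

Clearing the top four bits of the old value therefore caps the largest supported page size at 128M.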
Fixes: cfc85f3e4b7f ("RDMA/hns: Add profile support for hip08 driver")
Link: https://lore.kernel.org/r/20220829105021.1427804-2-liangwenpeng@huawei.com
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 77e8cd067642..39641b449a42 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -83,7 +83,7 @@
 
 #define HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ		PAGE_SIZE
 #define HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ		PAGE_SIZE
-#define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED		0xFFFFF000
+#define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED		0xFFFF000
 #define HNS_ROCE_V2_MAX_INNER_MTPT_NUM		2
 #define HNS_ROCE_INVALID_LKEY			0x0
 #define HNS_ROCE_INVALID_SGE_LENGTH		0x80000000
From: Chengchang Tang <tangchengchang@huawei.com>
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5Y79T
-------------------------------------------------------------------
page_shift and page_cnt are only used in mtr_map_bufs(), and these parameters can be calculated independently.

Strip the computation of page_shift and page_cnt out of mtr_init_buf_cfg(), reducing its number of parameters. This helps reduce the coupling between mtr_init_buf_cfg() and mtr_map_bufs().

The parameter validation in mtr_init_buf_cfg() has also been abstracted into a separate function, placed at the beginning of the function.
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_mr.c | 113 ++++++++++++++----------
 1 file changed, 68 insertions(+), 45 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index ea9af06b3530..08f11922326e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -691,14 +691,37 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 	return 0;
 }
 
-static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
-			int page_count, unsigned int page_shift)
+static int cal_mtr_pg_cnt(struct hns_roce_mtr *mtr)
+{
+	struct hns_roce_buf_region *region;
+	int page_cnt = 0;
+	int i;
+
+	for (i = 0; i < mtr->hem_cfg.region_count; i++) {
+		region = &mtr->hem_cfg.region[i];
+		page_cnt += region->count;
+	}
+
+	return page_cnt;
+}
+
+static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
 {
 	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int page_count = cal_mtr_pg_cnt(mtr);
+	unsigned int page_shift;
 	dma_addr_t *pages;
 	int npage;
 	int ret;
 
+	/* When HEM buffer uses 0-level addressing, the page size is
+	 * equal to the whole buffer size, and we split the buffer into
+	 * small pages which is used to check whether the adjacent
+	 * units are in the continuous space and its size is fixed to
+	 * 4K based on hns ROCEE's requirement.
+	 */
+	page_shift = mtr->hem_cfg.is_direct ? HNS_HW_PAGE_SHIFT :
+					      mtr->hem_cfg.buf_pg_shift;
+
 	/* alloc a tmp array to store buffer's dma address */
 	pages = kvcalloc(page_count, sizeof(dma_addr_t), GFP_KERNEL);
 	if (!pages)
@@ -848,62 +871,68 @@ int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 	return total;
 }
 
+static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_buf_attr *attr)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+
+	if (attr->region_count > ARRAY_SIZE(attr->region) ||
+	    attr->region_count < 1 || attr->page_shift < HNS_HW_PAGE_SHIFT) {
+		ibdev_err(ibdev,
+			  "invalid buf attr, region count %d page shift %u.\n",
+			  attr->region_count, attr->page_shift);
+		return false;
+	}
+
+	return true;
+}
+
 static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_buf_attr *attr,
-			    struct hns_roce_hem_cfg *cfg,
-			    unsigned int *buf_page_shift, u64 unalinged_size)
+			    struct hns_roce_hem_cfg *cfg, u64 unalinged_size)
 {
 	struct hns_roce_buf_region *r;
-	u64 first_region_padding;
-	int page_cnt, region_cnt;
-	unsigned int page_shift;
+	size_t buf_pg_sz;
 	size_t buf_size;
+	int page_cnt, i;
+	u64 pgoff = 0;
+
+	if (!is_buf_attr_valid(hr_dev, attr))
+		return -EINVAL;
 
 	/* If mtt is disabled, all pages must be within a continuous range */
 	cfg->is_direct = !mtr_has_mtt(attr);
+	cfg->region_count = attr->region_count;
 	buf_size = mtr_bufs_size(attr);
 	if (cfg->is_direct) {
-		/* When HEM buffer uses 0-level addressing, the page size is
-		 * equal to the whole buffer size, and we split the buffer into
-		 * small pages which is used to check whether the adjacent
-		 * units are in the continuous space and its size is fixed to
-		 * 4K based on hns ROCEE's requirement.
-		 */
-		page_shift = HNS_HW_PAGE_SHIFT;
-
-		/* The ROCEE requires the page size to be 4K * 2 ^ N. */
+		buf_pg_sz = HNS_HW_PAGE_SIZE;
 		cfg->buf_pg_count = 1;
+		/* The ROCEE requires the page size to be 4K * 2 ^ N. */
 		cfg->buf_pg_shift = HNS_HW_PAGE_SHIFT +
 			order_base_2(DIV_ROUND_UP(buf_size, HNS_HW_PAGE_SIZE));
-		first_region_padding = 0;
 	} else {
-		page_shift = attr->page_shift;
 		cfg->buf_pg_count = DIV_ROUND_UP(buf_size + unalinged_size,
-						 1 << page_shift);
-		cfg->buf_pg_shift = page_shift;
-		first_region_padding = unalinged_size;
+						 1 << attr->page_shift);
+		cfg->buf_pg_shift = attr->page_shift;
+		buf_pg_sz = 1 << cfg->buf_pg_shift;
+		pgoff = unalinged_size;
 	}
 
 	/* Convert buffer size to page index and page count for each region and
 	 * the buffer's offset needs to be appended to the first region.
 	 */
-	for (page_cnt = 0, region_cnt = 0; region_cnt < attr->region_count &&
-	     region_cnt < ARRAY_SIZE(cfg->region); region_cnt++) {
-		r = &cfg->region[region_cnt];
+	for (page_cnt = 0, i = 0; i < attr->region_count; i++) {
+		r = &cfg->region[i];
 		r->offset = page_cnt;
-		buf_size = hr_hw_page_align(attr->region[region_cnt].size +
-					    first_region_padding);
-		r->count = DIV_ROUND_UP(buf_size, 1 << page_shift);
-		first_region_padding = 0;
+		buf_size = hr_hw_page_align(attr->region[i].size +
+					    pgoff);
+		r->count = DIV_ROUND_UP(buf_size, buf_pg_sz);
+		pgoff = 0;
 		page_cnt += r->count;
-		r->hopnum = to_hr_hem_hopnum(attr->region[region_cnt].hopnum,
-					     r->count);
+		r->hopnum = to_hr_hem_hopnum(attr->region[i].hopnum, r->count);
 	}
 
-	cfg->region_count = region_cnt;
-	*buf_page_shift = page_shift;
-
-	return page_cnt;
+	return 0;
 }
 
 static int mtr_alloc_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
@@ -949,18 +978,12 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 			unsigned long user_addr)
 {
 	struct ib_device *ibdev = &hr_dev->ib_dev;
-	unsigned int buf_page_shift = 0;
-	int buf_page_cnt;
 	int ret;
 
-	buf_page_cnt = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg,
-					&buf_page_shift,
-					udata ? user_addr & ~PAGE_MASK : 0);
-	if (buf_page_cnt < 1 || buf_page_shift < HNS_HW_PAGE_SHIFT) {
-		ibdev_err(ibdev, "failed to init mtr cfg, count %d shift %u.\n",
-			  buf_page_cnt, buf_page_shift);
-		return -EINVAL;
-	}
+	ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg,
+			       udata ? user_addr & ~PAGE_MASK : 0);
+	if (ret)
+		return ret;
 
 	ret = mtr_alloc_mtt(hr_dev, mtr, ba_page_shift);
 	if (ret) {
@@ -984,7 +1007,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 	}
 
 	/* Write buffer's dma address to MTT */
-	ret = mtr_map_bufs(hr_dev, mtr, buf_page_cnt, buf_page_shift);
+	ret = mtr_map_bufs(hr_dev, mtr);
 	if (ret)
 		ibdev_err(ibdev, "failed to map mtr bufs, ret = %d.\n", ret);
 	else
From: Chengchang Tang <tangchengchang@huawei.com>
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5Y79T
-------------------------------------------------------------------
MTR memory allocation does not depend on MTT allocation, and MTT allocation can be adjusted according to the MTR. So move MTR allocation before MTT allocation.
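The resulting order in hns_roce_mtr_create() is sketched below (simplified, error handling elided):

	if (!buf_attr->mtt_only)	/* MTR buffer is allocated first */
		mtr_alloc_bufs(hr_dev, mtr, buf_attr, udata, user_addr);
	mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg, pgoff);
	mtr_alloc_mtt(hr_dev, mtr, ba_page_shift);	/* MTT afterwards */
	mtr_map_bufs(hr_dev, mtr);

With the buffers already allocated, the MTT configuration can later be derived from the actual buffer layout.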
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_mr.c | 35 ++++++++++++++-----------
 1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 08f11922326e..177ab22d6f86 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -679,7 +679,7 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 		mtr->umem = NULL;
 		mtr->kmem = hns_roce_buf_alloc(hr_dev, total_size,
 					       buf_attr->page_shift,
-					       mtr->hem_cfg.is_direct ?
+					       !mtr_has_mtt(buf_attr) ?
 					       HNS_ROCE_BUF_DIRECT : 0);
 		if (IS_ERR(mtr->kmem)) {
 			ibdev_err(ibdev, "failed to alloc kmem, ret = %ld.\n",
@@ -977,18 +977,27 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 			unsigned int ba_page_shift, struct ib_udata *udata,
 			unsigned long user_addr)
 {
+	u64 pgoff = udata ? user_addr & ~PAGE_MASK : 0;
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	int ret;
 
-	ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg,
-			       udata ? user_addr & ~PAGE_MASK : 0);
+	if (!buf_attr->mtt_only) {
+		ret = mtr_alloc_bufs(hr_dev, mtr, buf_attr, udata, user_addr);
+		if (ret) {
+			ibdev_err(ibdev,
+				  "failed to alloc mtr bufs, ret = %d.\n", ret);
+			return ret;
+		}
+	}
+
+	ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg, pgoff);
 	if (ret)
-		return ret;
+		goto err_init_buf;
 
 	ret = mtr_alloc_mtt(hr_dev, mtr, ba_page_shift);
 	if (ret) {
 		ibdev_err(ibdev, "failed to alloc mtr mtt, ret = %d.\n", ret);
-		return ret;
+		goto err_init_buf;
 	}
 
 	/* The caller has its own buffer list and invokes the hns_roce_mtr_map()
@@ -1000,22 +1009,18 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 		return 0;
 	}
 
-	ret = mtr_alloc_bufs(hr_dev, mtr, buf_attr, udata, user_addr);
-	if (ret) {
-		ibdev_err(ibdev, "failed to alloc mtr bufs, ret = %d.\n", ret);
-		goto err_alloc_mtt;
-	}
-
 	/* Write buffer's dma address to MTT */
 	ret = mtr_map_bufs(hr_dev, mtr);
-	if (ret)
+	if (ret) {
 		ibdev_err(ibdev, "failed to map mtr bufs, ret = %d.\n", ret);
-	else
-		return 0;
+		goto err_alloc_mtt;
+	}
+
+	return 0;
 
-	mtr_free_bufs(hr_dev, mtr);
 err_alloc_mtt:
 	mtr_free_mtt(hr_dev, mtr);
+err_init_buf:
+	mtr_free_bufs(hr_dev, mtr);
+
 	return ret;
 }
From: Chengchang Tang <tangchengchang@huawei.com>
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5Y79T
-------------------------------------------------------------------
In the current implementation, a fixed page size is used to configure the MTR, which is inflexible and limits HW performance.

Calculate the best page size and use it to configure the MTR instead. For now this only works for PBL.
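The idea behind the calculation can be summarized as follows (a simplified sketch, not the driver function itself; best_pgsz is an illustrative name):

	/* addrs[] holds the DMA start addresses of n equally sized blocks
	 * (block_size bytes each). Any deviation from a contiguous layout
	 * lowers the largest usable page size.
	 */
	static unsigned long best_pgsz(dma_addr_t *addrs, int n, u64 block_size,
				       unsigned long pgsz_bitmap)
	{
		dma_addr_t mask = 0;
		int i;

		for (i = 1; i < n; i++)
			mask |= addrs[i] ^ (addrs[i - 1] + block_size);

		if (mask)
			pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);

		return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0;
	}

For user memory the existing core helper ib_umem_find_best_pgsz() is used instead, as the diff below shows.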
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_alloc.c  |   6 --
 drivers/infiniband/hw/hns/hns_roce_device.h |   2 +
 drivers/infiniband/hw/hns/hns_roce_mr.c     | 102 +++++++++++++++++---
 3 files changed, 93 insertions(+), 17 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c
index 11a78ceae568..60269322ba98 100644
--- a/drivers/infiniband/hw/hns/hns_roce_alloc.c
+++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c
@@ -137,12 +137,6 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
 	int total = 0;
 	int i;
 
-	if (page_shift > buf->trunk_shift) {
-		dev_err(hr_dev->dev, "failed to check kmem buf shift %u > %u\n",
-			page_shift, buf->trunk_shift);
-		return -EINVAL;
-	}
-
 	offset = 0;
 	max_size = buf->ntrunks << buf->trunk_shift;
 	for (i = 0; i < buf_cnt && offset < max_size; i++) {
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 6804c12ce146..1244b002094f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -277,7 +277,9 @@ struct hns_roce_buf_attr {
 	unsigned int region_count; /* valid region count */
 	unsigned int page_shift;  /* buffer page shift */
 	unsigned int user_access; /* umem access flag */
+	u64 iova;
 	bool mtt_only; /* only alloc buffer-required MTT memory */
+	bool adaptive; /* adaptive for page_shift and hopnum */
 };
 
 struct hns_roce_hem_cfg {
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 177ab22d6f86..c4f7ed5f5477 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/vmalloc.h>
+#include <linux/count_zeros.h>
 #include <rdma/ib_umem.h>
 #include "hns_roce_device.h"
 #include "hns_roce_cmd.h"
@@ -102,6 +103,9 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
 	buf_attr.user_access = mr->access;
 	/* fast MR's buffer is alloced before mapping, not at creation */
 	buf_attr.mtt_only = is_fast;
+	buf_attr.iova = mr->iova;
+	/* pagesize and hopnum is fixed for fast MR */
+	buf_attr.adaptive = !is_fast;
 
 	err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr,
 				  hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT,
 				  udata, start);
@@ -871,6 +875,74 @@ int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 	return total;
 }
 
+/**
+ * hns_roce_find_buf_best_pgsz - Find best page size of the kmem.
+ *
+ * @hr_dev: hns_roce_dev struct
+ * @buf: kmem
+ *
+ * This function helps DMA regions using multi-level addressing to
+ * find the best page size in kmem.
+ *
+ * Returns 0 if the best pagesize is not found.
+ */
+static unsigned long hns_roce_find_buf_best_pgsz(struct hns_roce_dev *hr_dev,
+						 struct hns_roce_buf *buf)
+{
+	unsigned long pgsz_bitmap = hr_dev->caps.page_size_cap;
+	u64 trunk_size = 1 << buf->trunk_shift;
+	u64 buf_size = trunk_size * buf->ntrunks;
+	dma_addr_t dma_addr = 0;
+	dma_addr_t mask;
+	int i;
+
+	/* trunk_shift determines the size of each buf not PAGE_SIZE. */
+	pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, buf->trunk_shift);
+	/* Best page size should smaller than the actual size of the block. */
+	mask = pgsz_bitmap &
+	       GENMASK(BITS_PER_LONG - 1,
+		       bits_per((buf_size + dma_addr) ^ dma_addr));
+
+	for (i = 0; i < buf->ntrunks; i++) {
+		/* Walk kmem bufs to make sure that the start address of the
+		 * current DMA block and the end address of the previous DMA
+		 * block have the same offset, otherwise the page will be
+		 * reduced.
+		 */
+		mask |= dma_addr ^ buf->trunk_list[i].map;
+		dma_addr = buf->trunk_list[i].map + trunk_size;
+	}
+
+	if (mask)
+		pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);
+
+	return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0;
+}
+
+static int get_best_page_shift(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_mtr *mtr,
+			       struct hns_roce_buf_attr *buf_attr)
+{
+	unsigned long page_sz;
+
+	if (!buf_attr->adaptive)
+		return 0;
+
+	if (mtr->umem)
+		page_sz = ib_umem_find_best_pgsz(mtr->umem,
+						 hr_dev->caps.page_size_cap,
+						 buf_attr->iova);
+	else
+		page_sz = hns_roce_find_buf_best_pgsz(hr_dev, mtr->kmem);
+
+	if (!page_sz)
+		return -EINVAL;
+
+	buf_attr->page_shift = order_base_2(page_sz);
+
+	return 0;
+}
+
 static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev,
 			      struct hns_roce_buf_attr *attr)
 {
@@ -888,9 +960,10 @@ static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev,
 }
 
 static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
-			    struct hns_roce_buf_attr *attr,
-			    struct hns_roce_hem_cfg *cfg, u64 unalinged_size)
+			    struct hns_roce_mtr *mtr,
+			    struct hns_roce_buf_attr *attr)
 {
+	struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
 	struct hns_roce_buf_region *r;
 	size_t buf_pg_sz;
 	size_t buf_size;
@@ -911,11 +984,12 @@ static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
 		cfg->buf_pg_shift = HNS_HW_PAGE_SHIFT +
 			order_base_2(DIV_ROUND_UP(buf_size, HNS_HW_PAGE_SIZE));
 	} else {
-		cfg->buf_pg_count = DIV_ROUND_UP(buf_size + unalinged_size,
-						 1 << attr->page_shift);
+		buf_pg_sz = 1 << attr->page_shift;
+		cfg->buf_pg_count = mtr->umem ?
+				ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz) :
+				DIV_ROUND_UP(buf_size, buf_pg_sz);
 		cfg->buf_pg_shift = attr->page_shift;
-		buf_pg_sz = 1 << cfg->buf_pg_shift;
-		pgoff = unalinged_size;
+		pgoff = mtr->umem ? mtr->umem->address & ~PAGE_MASK : 0;
 	}
 
 	/* Convert buffer size to page index and page count for each region and
@@ -924,9 +998,12 @@ static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
 	for (page_cnt = 0, i = 0; i < attr->region_count; i++) {
 		r = &cfg->region[i];
 		r->offset = page_cnt;
-		buf_size = hr_hw_page_align(attr->region[i].size +
-					    pgoff);
-		r->count = DIV_ROUND_UP(buf_size, buf_pg_sz);
+		buf_size = hr_hw_page_align(attr->region[i].size + pgoff);
+		if (attr->adaptive && mtr->umem)
+			r->count = ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz);
+		else
+			r->count = DIV_ROUND_UP(buf_size, buf_pg_sz);
+
 		pgoff = 0;
 		page_cnt += r->count;
 		r->hopnum = to_hr_hem_hopnum(attr->region[i].hopnum, r->count);
@@ -977,7 +1054,6 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 			unsigned long user_addr)
 {
-	u64 pgoff = udata ? user_addr & ~PAGE_MASK : 0;
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	int ret;
 
@@ -988,9 +1064,13 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 				  "failed to alloc mtr bufs, ret = %d.\n", ret);
 			return ret;
 		}
+
+		ret = get_best_page_shift(hr_dev, mtr, buf_attr);
+		if (ret)
+			goto err_init_buf;
 	}
 
-	ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg, pgoff);
+	ret = mtr_init_buf_cfg(hr_dev, mtr, buf_attr);
 	if (ret)
 		goto err_init_buf;
From: Chengchang Tang <tangchengchang@huawei.com>
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5Y79T
-------------------------------------------------------------------
In the current implementation, a fixed addressing level is configured for the MTR. But the necessary addressing level actually depends on the page size and the size of the memory.

Calculate the addressing level from the page size and the memory size, and use it to configure the MTR.

For now this only works for PBL.
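A worked example of the calculation: with 4K BA pages and 8-byte base addresses (BA_BYTE_LEN), one BA page holds 512 BAs, so hop level n can address up to 512^n buffer pages:

	hop 0: 1 page (the base register points at the buffer itself)
	hop 1: up to 512 pages
	hop 2: up to 512 * 512 = 262144 pages
	hop 3: up to 512^3 = 134217728 pages

A 16M PBL buffer with 2M pages, for instance, needs only 8 BAs, so hop 1 suffices.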
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  2 +
 drivers/infiniband/hw/hns/hns_roce_mr.c     | 59 +++++++++++++++++++--
 2 files changed, 58 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 1244b002094f..54c17dcab4a8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -178,6 +178,8 @@ enum {
 
 #define HNS_ROCE_CMD_SUCCESS			1
 
+#define HNS_ROCE_MAX_HOP_NUM			3
+
 /* The minimum page size is 4K for hardware */
 #define HNS_HW_PAGE_SHIFT			12
 #define HNS_HW_PAGE_SIZE			(1 << HNS_HW_PAGE_SHIFT)
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index c4f7ed5f5477..63e8f29e5807 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -110,10 +110,13 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
 	err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr,
 				  hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT,
 				  udata, start);
-	if (err)
+	if (err) {
 		ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err);
-	else
-		mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count;
+		return err;
+	}
+
+	mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count;
+	mr->pbl_hop_num = buf_attr.region[0].hopnum;
 
 	return err;
 }
@@ -943,6 +946,52 @@ static int get_best_page_shift(struct hns_roce_dev *hr_dev,
 	return 0;
 }
 
+static int get_best_hop_num(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_mtr *mtr,
+			    struct hns_roce_buf_attr *buf_attr,
+			    unsigned int ba_pg_shift)
+{
+#define INVALID_HOPNUM -1
+#define MIN_BA_CNT 1
+	size_t buf_pg_sz = 1 << buf_attr->page_shift;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	size_t ba_pg_sz = 1 << ba_pg_shift;
+	int hop_num = INVALID_HOPNUM;
+	size_t unit = MIN_BA_CNT;
+	size_t ba_cnt;
+	int i;
+
+	if (!buf_attr->adaptive)
+		return 0;
+
+	hop_num = INVALID_HOPNUM;
+	unit = MIN_BA_CNT;
+	/* Caculating the number of buf pages, each buf page needs a BA */
+	if (mtr->umem)
+		ba_cnt = ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz);
+	else
+		ba_cnt = DIV_ROUND_UP(buf_attr->region[0].size, buf_pg_sz);
+
+	for (i = 0; i <= HNS_ROCE_MAX_HOP_NUM; i++) {
+		if (ba_cnt <= unit) {
+			hop_num = i;
+			break;
+		}
+		/* Number of BAs can be represented at per hop */
+		unit *= ba_pg_sz / BA_BYTE_LEN;
+	}
+
+	if (hop_num < 0) {
+		ibdev_err(ibdev,
+			  "failed to calculate a valid hopnum.\n");
+		return -EINVAL;
+	}
+
+	buf_attr->region[0].hopnum = hop_num;
+
+	return 0;
+}
+
 static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev,
 			      struct hns_roce_buf_attr *attr)
 {
@@ -1068,6 +1117,10 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 		ret = get_best_page_shift(hr_dev, mtr, buf_attr);
 		if (ret)
 			goto err_init_buf;
+
+		ret = get_best_hop_num(hr_dev, mtr, buf_attr, ba_page_shift);
+		if (ret)
+			goto err_init_buf;
 	}
 
 	ret = mtr_init_buf_cfg(hr_dev, mtr, buf_attr);
From: Chengchang Tang <tangchengchang@huawei.com>
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5Y79T
-------------------------------------------------------------------
Currently, creating an MR has a certain probability of triggering a hardware exception when level-0 addressing is used with a huge page. This is because the PA is aligned to 4K rather than to the actual page size set by the driver.

When an MTR with multiple regions uses level-0 addressing with a huge page, the regions share that huge page: it is divided into multiple 4K pages, which are handed out to the different regions. In this case, all regions of the MTR only require 4K alignment rather than alignment to the actual page size, which helps reduce memory consumption.

But when there is only one region, an MTR with level-0 addressing can use the huge page directly. The hardware then requires the page to be aligned to the actual page size, not to 4K.

Set the page size to 4K only when an MTR with level-0 addressing has multiple regions.
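For example (sizes illustrative): a direct-addressed QP buffer with two regions inside one 2M huge page is presented to the hardware as 512 separate 4K entries, so 4K-aligned PAs are enough; a single-region MR at level 0 hands the 2M page itself to the hardware, which then requires 2M alignment.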
Fixes: 0e0ab04b5bbe ("RDMA/hns: Refactor the MTR creation flow")
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_mr.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 63e8f29e5807..30c2f5e8e84a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -712,6 +712,16 @@ static int cal_mtr_pg_cnt(struct hns_roce_mtr *mtr)
 	return page_cnt;
 }
 
+static bool need_split_huge_page(struct hns_roce_mtr *mtr)
+{
+	/* When HEM buffer uses 0-level addressing, the page size is
+	 * equal to the whole buffer size. If the current MTR has multiple
+	 * regions, we split the buffer into small pages(4k, required by hns
+	 * ROCEE). These pages will be used in multiple regions.
+	 */
+	return mtr->hem_cfg.is_direct && mtr->hem_cfg.region_count > 1;
+}
+
 static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
 {
 	struct ib_device *ibdev = &hr_dev->ib_dev;
@@ -721,14 +731,8 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
 	int npage;
 	int ret;
 
-	/* When HEM buffer uses 0-level addressing, the page size is
-	 * equal to the whole buffer size, and we split the buffer into
-	 * small pages which is used to check whether the adjacent
-	 * units are in the continuous space and its size is fixed to
-	 * 4K based on hns ROCEE's requirement.
-	 */
-	page_shift = mtr->hem_cfg.is_direct ? HNS_HW_PAGE_SHIFT :
-					      mtr->hem_cfg.buf_pg_shift;
+	page_shift = need_split_huge_page(mtr) ? HNS_HW_PAGE_SHIFT :
+						 mtr->hem_cfg.buf_pg_shift;
+
 	/* alloc a tmp array to store buffer's dma address */
 	pages = kvcalloc(page_count, sizeof(dma_addr_t), GFP_KERNEL);
 	if (!pages)
@@ -748,7 +752,7 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
 		goto err_alloc_list;
 	}
 
-	if (mtr->hem_cfg.is_direct && npage > 1) {
+	if (need_split_huge_page(mtr) && npage > 1) {
 		ret = mtr_check_direct_pages(pages, npage, page_shift);
 		if (ret) {
 			ibdev_err(ibdev, "failed to check %s page: %d / %d.\n",
@@ -1026,7 +1030,7 @@ static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
 	cfg->is_direct = !mtr_has_mtt(attr);
 	cfg->region_count = attr->region_count;
 	buf_size = mtr_bufs_size(attr);
-	if (cfg->is_direct) {
+	if (need_split_huge_page(mtr)) {
 		buf_pg_sz = HNS_HW_PAGE_SIZE;
 		cfg->buf_pg_count = 1;
 		/* The ROCEE requires the page size to be 4K * 2 ^ N. */
From: Chengchang Tang <tangchengchang@huawei.com>
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5Y79T
------------------------------------------------------------------------
A valid RoCE page size cap must include the system page size. If the page size cap queried from the firmware does not, fall back to the default supported page sizes.
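For example, on a kernel built with 64K pages (PAGE_SIZE == 1 << 16), a queried cap without bit 16 set could never map kernel-allocated buffers, so the driver falls back to the default HNS_ROCE_V2_PAGE_SIZE_SUPPORTED bitmap.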
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index c6348e520fb1..f7962c96745b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -2337,6 +2337,8 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
 	caps->wqe_sge_hop_num = hr_reg_read(resp_d, PF_CAPS_D_EX_SGE_HOP_NUM);
 	caps->wqe_rq_hop_num = hr_reg_read(resp_d, PF_CAPS_D_RQWQE_HOP_NUM);
 
+	if (!(caps->page_size_cap & PAGE_SIZE))
+		caps->page_size_cap = HNS_ROCE_V2_PAGE_SIZE_SUPPORTED;
+
 	return 0;
 }
From: Yixing Liu <liuyixing1@huawei.com>
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5XYHV
----------------------------------------------------------
Add support for configuring the DSCP-to-TC map through the NIC interface get_dscp_prio(). The driver converts the DSCP-to-priority mapping reported by the NIC into a DSCP-to-TC mapping.

The dependency patch is as follows:
7a1313b356de ("net: hns3: add support config dscp map to tc")
Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
Reviewed-by: Yangyang Li <liyangyang20@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_ah.c     | 26 +++++++-
 drivers/infiniband/hw/hns/hns_roce_device.h |  4 ++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 69 ++++++++++++++++++---
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 13 ++++
 include/uapi/rdma/hns-abi.h                 | 12 ++++
 5 files changed, 112 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 492b122d0521..cea402b28c44 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -33,7 +33,9 @@
 #include <linux/pci.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include "hnae3.h"
 #include "hns_roce_device.h"
+#include "hns_roce_hw_v2.h"
 
 static inline u16 get_ah_udp_sport(const struct rdma_ah_attr *ah_attr)
 {
@@ -57,8 +59,11 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
 	const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device);
+	struct hns_roce_ib_create_ah_resp resp = {};
 	struct hns_roce_ah *ah = to_hr_ah(ibah);
-	int ret = 0;
+	u8 priority;
+	u8 tc_mode;
+	int ret;
 
 	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata)
 		return -EOPNOTSUPP;
@@ -72,9 +77,19 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 	ah->av.hop_limit = grh->hop_limit;
 	ah->av.flowlabel = grh->flow_label;
 	ah->av.udp_sport = get_ah_udp_sport(ah_attr);
-	ah->av.sl = rdma_ah_get_sl(ah_attr);
 	ah->av.tclass = get_tclass(grh);
 
+	ret = hr_dev->hw->get_dscp(hr_dev, get_tclass(grh), &tc_mode,
+				   &priority);
+	if (ret && ret != -EOPNOTSUPP)
+		return ret;
+
+	if (tc_mode == HNAE3_TC_MAP_MODE_DSCP &&
+	    grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+		ah->av.sl = priority;
+	else
+		ah->av.sl = rdma_ah_get_sl(ah_attr);
+
 	memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE);
 	memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN);
 
@@ -88,6 +103,13 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 		ah->av.vlan_en = ah->av.vlan_id < VLAN_N_VID;
 	}
 
+	if (udata) {
+		resp.priority = ah->av.sl;
+		resp.tc_mode = tc_mode;
+		ret = ib_copy_to_udata(udata, &resp,
+				       min(udata->outlen, sizeof(resp)));
+	}
+
 	return ret;
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 54c17dcab4a8..7a945cefd5a0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -631,6 +631,8 @@ struct hns_roce_qp {
 	struct list_head sq_node; /* all send qps are on a list */
 	struct hns_user_mmap_entry *dwqe_mmap_entry;
 	u32 config;
+	u8 tc_mode;
+	u8 priority;
 };
 
 struct hns_roce_ib_iboe {
@@ -891,6 +893,8 @@ struct hns_roce_hw {
 	int (*query_cqc)(struct hns_roce_dev *hr_dev, u32 cqn, void *buffer);
 	int (*query_qpc)(struct hns_roce_dev *hr_dev, u32 qpn, void *buffer);
 	int (*query_mpt)(struct hns_roce_dev *hr_dev, u32 key, void *buffer);
+	int (*get_dscp)(struct hns_roce_dev *hr_dev, u8 dscp,
+			u8 *tc_mode, u8 *priority);
 	const struct ib_device_ops *hns_roce_dev_ops;
 	const struct ib_device_ops *hns_roce_dev_srq_ops;
 };
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index f7962c96745b..450b7ac4333c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -4922,6 +4922,61 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
 	return 0;
 }
 
+int hns_roce_hw_v2_get_dscp(struct hns_roce_dev *hr_dev, u8 dscp,
+			    u8 *tc_mode, u8 *priority)
+{
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	int ret;
+
+	if (!ops->get_dscp_prio)
+		return -EOPNOTSUPP;
+
+	ret = ops->get_dscp_prio(handle, dscp, tc_mode, priority);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int hns_roce_set_sl(struct ib_qp *ibqp,
+			   const struct ib_qp_attr *attr,
+			   struct hns_roce_v2_qp_context *context,
+			   struct hns_roce_v2_qp_context *qpc_mask)
+{
+	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int ret;
+
+	ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh),
+				      &hr_qp->tc_mode, &hr_qp->priority);
+	if (ret && ret != -EOPNOTSUPP) {
+		ibdev_err(ibdev, "failed to get dscp, ret = %d.\n", ret);
+		return ret;
+	}
+
+	if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP &&
+	    grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+		hr_qp->sl = hr_qp->priority;
+	else
+		hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
+
+	if (unlikely(hr_qp->sl > MAX_SERVICE_LEVEL)) {
+		ibdev_err(ibdev,
+			  "failed to fill QPC, sl (%u) shouldn't be larger than %d.\n",
+			  hr_qp->sl, MAX_SERVICE_LEVEL);
+		return -EINVAL;
+	}
+
+	hr_reg_write(context, QPC_SL, hr_qp->sl);
+	hr_reg_clear(qpc_mask, QPC_SL);
+
+	return 0;
+}
+
 static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 				const struct ib_qp_attr *attr,
 				int attr_mask,
@@ -5015,16 +5070,9 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 	memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw));
 	memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw));
 
-	hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
-	if (unlikely(hr_qp->sl > MAX_SERVICE_LEVEL)) {
-		ibdev_err(ibdev,
-			  "failed to fill QPC, sl (%u) shouldn't be larger than %d.\n",
-			  hr_qp->sl, MAX_SERVICE_LEVEL);
-		return -EINVAL;
-	}
-
-	hr_reg_write(context, QPC_SL, hr_qp->sl);
-	hr_reg_clear(qpc_mask, QPC_SL);
+	ret = hns_roce_set_sl(ibqp, attr, context, qpc_mask);
+	if (ret)
+		return ret;
 
 	return 0;
 }
@@ -6730,6 +6778,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
 	.query_cqc = hns_roce_v2_query_cqc,
 	.query_qpc = hns_roce_v2_query_qpc,
 	.query_mpt = hns_roce_v2_query_mpt,
+	.get_dscp = hns_roce_hw_v2_get_dscp,
 	.hns_roce_dev_ops = &hns_roce_v2_dev_ops,
 	.hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops,
 };
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 50177c47b247..cbe7d37430c7 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -1338,6 +1338,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct hns_roce_ib_modify_qp_resp resp = {};
 	enum ib_qp_state cur_state, new_state;
 	int ret = -EINVAL;
 
@@ -1378,6 +1379,18 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 
 	ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state,
 				    new_state, udata);
+	if (ret)
+		goto out;
+
+	if (udata && udata->outlen) {
+		resp.tc_mode = hr_qp->tc_mode;
+		resp.priority = hr_qp->priority;
+		ret = ib_copy_to_udata(udata, &resp,
+				       min(udata->outlen, sizeof(resp)));
+		if (ret)
+			ibdev_err(&hr_dev->ib_dev,
+				  "failed to copy modify qp resp.\n");
+	}
 
 out:
 	mutex_unlock(&hr_qp->mutex);
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index cd334ee5804c..b8a5d65333aa 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -85,6 +85,18 @@ struct hns_roce_ib_create_qp_resp {
 	__aligned_u64 dwqe_mmap_key;
 };
 
+struct hns_roce_ib_create_ah_resp {
+	__u8 priority;
+	__u8 tc_mode;
+	__u8 reserved[6];
+};
+
+struct hns_roce_ib_modify_qp_resp {
+	__u8 tc_mode;
+	__u8 priority;
+	__u8 reserved[6];
+};
+
 enum {
 	HNS_ROCE_EXSGE_FLAGS = 1 << 0,
 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,