From: Kai Ye yekai13@huawei.com
mainline inclusion from v6.1-rc4 commit f5b657e5dbf830cfcb19b588b784b8190a5164a0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The default qos value is not initialized when sriov is repeatedly enabled and disabled. So initialize the VF qos values in the sriov enable process.
Signed-off-by: Kai Ye yekai13@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index af86579bee35..9e6f5004bdaf 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4805,6 +4805,14 @@ void hisi_qm_debug_regs_clear(struct hisi_qm *qm) } EXPORT_SYMBOL_GPL(hisi_qm_debug_regs_clear);
+static void hisi_qm_init_vf_qos(struct hisi_qm *qm, int total_func) +{ + int i; + + for (i = 1; i <= total_func; i++) + qm->factor[i].func_qos = QM_QOS_MAX_VAL; +} + /** * hisi_qm_sriov_enable() - enable virtual functions * @pdev: the PCIe device @@ -4838,6 +4846,10 @@ int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs) }
num_vfs = max_vfs; + + if (test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) + hisi_qm_init_vf_qos(qm, num_vfs); + ret = qm_vf_q_assign(qm, num_vfs); if (ret) { pci_err(pdev, "Can't assign queues for VF!\n"); @@ -4873,7 +4885,6 @@ EXPORT_SYMBOL_GPL(hisi_qm_sriov_enable); int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen) { struct hisi_qm *qm = pci_get_drvdata(pdev); - int total_vfs = pci_sriov_get_totalvfs(qm->pdev); int ret;
if (pci_vfs_assigned(pdev)) { @@ -4888,9 +4899,6 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen) }
pci_disable_sriov(pdev); - /* clear vf function shaper configure array */ - if (test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) - memset(qm->factor + 1, 0, sizeof(struct qm_shaper_factor) * total_vfs);
ret = qm_clear_vft_config(qm); if (ret) @@ -6303,7 +6311,7 @@ static int hisi_qp_alloc_memory(struct hisi_qm *qm) static int hisi_qm_memory_init(struct hisi_qm *qm) { struct device *dev = &qm->pdev->dev; - int ret, total_func, i; + int ret, total_func; size_t off = 0;
if (test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) { @@ -6312,8 +6320,8 @@ static int hisi_qm_memory_init(struct hisi_qm *qm) if (!qm->factor) return -ENOMEM;
- for (i = 0; i < total_func; i++) - qm->factor[i].func_qos = QM_QOS_MAX_VAL; + /* Only the PF value needs to be initialized */ + qm->factor[0].func_qos = QM_QOS_MAX_VAL; }
#define QM_INIT_BUF(qm, type, num) do { \
From: Zhiqi Song songzhiqi1@huawei.com
mainline inclusion from v6.1-rc4 commit 45e6319bd5f2154d8b8c9f1eaa4ac030ba0d330c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
In hpre_remove(), when the disable operation of qm sriov fails, the following logic should still be executed to release the remaining resources that have already been allocated, instead of returning directly; otherwise there will be a resource leak.
Signed-off-by: Zhiqi Song songzhiqi1@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/hpre/hpre_main.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index 029117704133..e7ad00c178aa 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -1437,18 +1437,12 @@ static int hpre_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void hpre_remove(struct pci_dev *pdev) { struct hisi_qm *qm = pci_get_drvdata(pdev); - int ret;
hisi_qm_pm_uninit(qm); hisi_qm_wait_task_finish(qm, &hpre_devices); hisi_qm_alg_unregister(qm, &hpre_devices); - if (qm->fun_type == QM_HW_PF && qm->vfs_num) { - ret = hisi_qm_sriov_disable(pdev, true); - if (ret) { - pci_err(pdev, "Disable SRIOV fail!\n"); - return; - } - } + if (qm->fun_type == QM_HW_PF && qm->vfs_num) + hisi_qm_sriov_disable(pdev, true);
hpre_debugfs_exit(qm); hisi_qm_stop(qm, QM_NORMAL);
From: Yicong Yang yangyicong@hisilicon.com
mainline inclusion from v6.1-rc4 commit 7001141d34e550854425afa76e960513cf150a62 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
dev_to_node() can handle the case when CONFIG_NUMA is not set, so the check of CONFIG_NUMA is redundant and can be removed.
Signed-off-by: Yicong Yang yangyicong@hisilicon.com Signed-off-by: Weili Qian qianweili@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 9e6f5004bdaf..e96308ea45b0 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4281,16 +4281,14 @@ static int hisi_qm_sort_devices(int node, struct list_head *head, struct hisi_qm *qm; struct list_head *n; struct device *dev; - int dev_node = 0; + int dev_node;
list_for_each_entry(qm, &qm_list->list, list) { dev = &qm->pdev->dev;
- if (IS_ENABLED(CONFIG_NUMA)) { - dev_node = dev_to_node(dev); - if (dev_node < 0) - dev_node = 0; - } + dev_node = dev_to_node(dev); + if (dev_node < 0) + dev_node = 0;
res = kzalloc(sizeof(*res), GFP_KERNEL); if (!res)
From: Weili Qian qianweili@huawei.com
mainline inclusion from v6.1-rc4 commit f57e292897cac13b6ddee078aea21173b234ecb7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
In qm_get_xqc_depth(), the parameters low_bits and high_bits save the values of the corresponding bit fields. However, the values saved to the two parameters are swapped. As a result, the values returned to the callers are incorrect.
Fixes: 129a9f340172 ("crypto: hisilicon/qm - get qp num and depth from hardware registers") Signed-off-by: Weili Qian qianweili@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index e96308ea45b0..08c5bacc6905 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -909,8 +909,8 @@ static void qm_get_xqc_depth(struct hisi_qm *qm, u16 *low_bits, u32 depth;
depth = hisi_qm_get_hw_info(qm, qm_basic_info, type, qm->cap_ver); - *high_bits = depth & QM_XQ_DEPTH_MASK; - *low_bits = (depth >> QM_XQ_DEPTH_SHIFT) & QM_XQ_DEPTH_MASK; + *low_bits = depth & QM_XQ_DEPTH_MASK; + *high_bits = (depth >> QM_XQ_DEPTH_SHIFT) & QM_XQ_DEPTH_MASK; }
static u32 qm_get_irq_num(struct hisi_qm *qm)
From: Weili Qian qianweili@huawei.com
mainline inclusion from v6.1-rc4 commit 94adb03fd58bbe355e3d7a9d0f701889313e4a51 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Change the value of clock gating register to 0x7fff to enable clock gating of the address prefetch module. When the device is idle, the clock is turned off to save power.
Signed-off-by: Weili Qian qianweili@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/sec2/sec_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 92aa00825bf0..c83db5d4c825 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -55,7 +55,7 @@ #define SEC_CONTROL_REG 0x301200 #define SEC_DYNAMIC_GATE_REG 0x30121c #define SEC_CORE_AUTO_GATE 0x30212c -#define SEC_DYNAMIC_GATE_EN 0x7bff +#define SEC_DYNAMIC_GATE_EN 0x7fff #define SEC_CORE_AUTO_GATE_EN GENMASK(3, 0) #define SEC_CLK_GATE_ENABLE BIT(3) #define SEC_CLK_GATE_DISABLE (~BIT(3))
From: Weili Qian qianweili@huawei.com
mainline inclusion from v6.1-rc4 commit ee1537fe3dd89860d0336563891f6cac707d0cb5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
After the device is reset, the VF needs to re-enable the communication interrupt before it sends the restart-complete message to the PF. If the interrupt is re-enabled only after the VF notifies the PF, the PF may fail to send messages to the VF after receiving the VF's restart-complete message.
Fixes: 760fe22cf5e9 ("crypto: hisilicon/qm - update reset flow") Signed-off-by: Weili Qian qianweili@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 08c5bacc6905..249e63200e4e 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -5727,6 +5727,7 @@ static void qm_pf_reset_vf_done(struct hisi_qm *qm) cmd = QM_VF_START_FAIL; }
+ qm_cmd_init(qm); ret = qm_ping_pf(qm, cmd); if (ret) dev_warn(&pdev->dev, "PF responds timeout in reset done!\n"); @@ -5788,7 +5789,6 @@ static void qm_pf_reset_vf_process(struct hisi_qm *qm, goto err_get_status;
qm_pf_reset_vf_done(qm); - qm_cmd_init(qm);
dev_info(dev, "device reset done.\n");
From: Kai Ye yekai13@huawei.com
mainline inclusion from v6.1-rc4 commit 3efe90af4c0c46c58dba1b306de142827153d9c0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Increase the buffer size to prevent a stack overflow found by fuzz testing. The maximum length of the qos configuration buffer is 256 bytes, but the 'val_buf' buffer is currently only 32 bytes. Since sscanf does not check the destination memory length, writing the parsed token into 'val_buf' may overflow the stack.
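To illustrate why sizing the buffer to match the bounded input is sufficient, here is a minimal user-space sketch (the DBG_READ_LEN constant and the sample input below are assumptions for illustration, not the kernel definitions): a token parsed by "%s" can never be longer than the whole input string, so destinations at least as large as the input cannot overflow.

#include <stdio.h>

#define DBG_READ_LEN 256	/* assumed to mirror the 256-byte input limit */

int main(void)
{
	char input[DBG_READ_LEN] = "0000:7d:00.1 500";
	char tbuf_bdf[DBG_READ_LEN] = {0};
	char val_buf[DBG_READ_LEN] = {0};

	/* both destinations match the input bound, so %s cannot overrun them */
	if (sscanf(input, "%s %s", tbuf_bdf, val_buf) != 2)
		return 1;

	printf("bdf=%s qos=%s\n", tbuf_bdf, val_buf);
	return 0;
}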
Signed-off-by: Kai Ye yekai13@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 249e63200e4e..2331ccf025f8 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -250,7 +250,6 @@ #define QM_QOS_MIN_CIR_B 100 #define QM_QOS_MAX_CIR_U 6 #define QM_QOS_MAX_CIR_S 11 -#define QM_QOS_VAL_MAX_LEN 32 #define QM_DFX_BASE 0x0100000 #define QM_DFX_STATE1 0x0104000 #define QM_DFX_STATE2 0x01040C8 @@ -4616,7 +4615,7 @@ static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf, unsigned int *fun_index) { char tbuf_bdf[QM_DBG_READ_LEN] = {0}; - char val_buf[QM_QOS_VAL_MAX_LEN] = {0}; + char val_buf[QM_DBG_READ_LEN] = {0}; u32 tmp1, device, function; int ret, bus;
From: Kai Ye yekai13@huawei.com
mainline inclusion from v6.1-rc4 commit 22d7a6c39cabab811f42cb2daed2343c87b0aca5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Add a pci bdf number check for the qos write by using the pci api. Get the devfn directly from the pci_dev, which allows some redundant code to be deleted. Also use kstrtoul instead of sscanf to simplify the code.
Signed-off-by: Kai Ye yekai13@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 37 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 25 deletions(-)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 2331ccf025f8..d7a1b0043aa1 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4593,49 +4593,36 @@ static ssize_t qm_algqos_read(struct file *filp, char __user *buf, return ret; }
-static ssize_t qm_qos_value_init(const char *buf, unsigned long *val) -{ - int buflen = strlen(buf); - int ret, i; - - for (i = 0; i < buflen; i++) { - if (!isdigit(buf[i])) - return -EINVAL; - } - - ret = sscanf(buf, "%lu", val); - if (ret != QM_QOS_VAL_NUM) - return -EINVAL; - - return 0; -} - static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf, unsigned long *val, unsigned int *fun_index) { + struct bus_type *bus_type = qm->pdev->dev.bus; char tbuf_bdf[QM_DBG_READ_LEN] = {0}; char val_buf[QM_DBG_READ_LEN] = {0}; - u32 tmp1, device, function; - int ret, bus; + struct pci_dev *pdev; + struct device *dev; + int ret;
ret = sscanf(buf, "%s %s", tbuf_bdf, val_buf); if (ret != QM_QOS_PARAM_NUM) return -EINVAL;
- ret = qm_qos_value_init(val_buf, val); + ret = kstrtoul(val_buf, 10, val); if (ret || *val == 0 || *val > QM_QOS_MAX_VAL) { pci_err(qm->pdev, "input qos value is error, please set 1~1000!\n"); return -EINVAL; }
- ret = sscanf(tbuf_bdf, "%u:%x:%u.%u", &tmp1, &bus, &device, &function); - if (ret != QM_QOS_BDF_PARAM_NUM) { - pci_err(qm->pdev, "input pci bdf value is error!\n"); - return -EINVAL; + dev = bus_find_device_by_name(bus_type, NULL, tbuf_bdf); + if (!dev) { + pci_err(qm->pdev, "input pci bdf number is error!\n"); + return -ENODEV; }
- *fun_index = PCI_DEVFN(device, function); + pdev = container_of(dev, struct pci_dev, dev); + + *fun_index = pdev->devfn;
return 0; }
From: Kai Ye yekai13@huawei.com
mainline inclusion from mainline-crypto commit 8f82f4ae8946d665f1e38da8e2b39b929d2435b1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5ZHPY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Because the permission on the VF debugfs file is 0444 (read-only), the VF function check in the qos write api is redundant.
Signed-off-by: Kai Ye yekai13@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 3 --- 1 file changed, 3 deletions(-)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index d7a1b0043aa1..382e54406510 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4636,9 +4636,6 @@ static ssize_t qm_algqos_write(struct file *filp, const char __user *buf, unsigned long val; int len, ret;
- if (qm->fun_type == QM_HW_VF) - return -EINVAL; - if (*pos != 0) return 0;
From: Yu Kuai yukuai3@huawei.com
mainline inclusion from mainline-v5.15-rc1 commit 89f871af1b26d98d983cba7ed0e86effa45ba5f8 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I60HCD CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If blk_mq_request_issue_directly() fails in blk_insert_cloned_request(), the request has already been accounted as started. Currently, blk_insert_cloned_request() is only called by dm, and such a request won't be accounted as done by dm.
In normal path, io will be accounted start from blk_mq_bio_to_request(), when the request is allocated, and such io will be accounted done from __blk_mq_end_request_acct() whether it succeeded or failed. Thus add blk_account_io_done() to fix the problem.
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Christoph Hellwig hch@lst.de Link: https://lore.kernel.org/r/20220126012132.3111551-1-yukuai3@huawei.com Signed-off-by: Jens Axboe axboe@kernel.dk
Conflict: block/blk-core.c Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c index a4ec5e168312..a18cfc467d41 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1223,7 +1223,10 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - return blk_mq_request_issue_directly(rq, true); + ret = blk_mq_request_issue_directly(rq, true); + if (ret) + blk_account_io_done(rq, ktime_get_ns()); + return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
From: Lei Chen lennychen@tencent.com
stable inclusion from stable-v5.10.152 commit 392536023da18086d57565e716ed50193869b8e7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60HVY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
commit 5a20d073ec54a72d9a732fa44bfe14954eb6332f upstream.
It's unnecessary to call wbt_update_limits explicitly within wbt_init, because it will be called from wbt_queue_depth_changed, which is invoked right afterwards.
Signed-off-by: Lei Chen lennychen@tencent.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-wbt.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 35d81b5deae1..4ec0a018a2ad 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -840,7 +840,6 @@ int wbt_init(struct request_queue *q) rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; - wbt_update_limits(rwb);
/* * Assign rwb and add the stats callback.
From: Yu Kuai yukuai3@huawei.com
stable inclusion from stable-v5.10.152 commit 910ba49b33450a878128adc7d9c419dd97efd923 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60HVY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
commit 8c5035dfbb9475b67c82b3fdb7351236525bf52b upstream.
Our test found a problem that the wbt inflight counter is negative, which will cause an io hang (note that this problem doesn't exist in mainline):
t1: device create			t2: issue io
add_disk
 blk_register_queue
  wbt_enable_default
   wbt_init
    rq_qos_add
    // wb_normal is still 0
					/*
					 * in mainline, disk can't be opened before
					 * bdev_add(), however, in old kernels, disk
					 * can be opened before blk_register_queue().
					 */
					blkdev_issue_flush
					// disk size is 0, however, it's not checked
					 submit_bio_wait
					  submit_bio
					   blk_mq_submit_bio
					    rq_qos_throttle
					     wbt_wait
					      bio_to_wbt_flags
					       rwb_enabled
					       // wb_normal is 0, inflight is not increased

    wbt_queue_depth_changed(&rwb->rqos);
    wbt_update_limits
    // wb_normal is initialized
					    rq_qos_track
					     wbt_track
					      rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
					      // wb_normal is not 0, wbt_flags will be set

t3: io completion
blk_mq_free_request
 rq_qos_done
  wbt_done
   wbt_is_tracked
   // return true
   __wbt_done
    wbt_rqw_done
     atomic_dec_return(&rqw->inflight);
     // inflight is decreased
commit 8235b5c1e8c1 ("block: call bdev_add later in device_add_disk") can avoid this problem, however it's better to fix this problem in wbt:
1) Lower kernels can't backport this patch due to lots of refactoring. 2) The root cause is that wbt calls rq_qos_add() before wb_normal is initialized.
Fixes: e34cbd307477 ("blk-wbt: add general throttling mechanism") Cc: stable@vger.kernel.org Signed-off-by: Yu Kuai yukuai3@huawei.com Link: https://lore.kernel.org/r/20220913105749.3086243-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-wbt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 4ec0a018a2ad..bafdb8098893 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -840,6 +840,10 @@ int wbt_init(struct request_queue *q) rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + + wbt_queue_depth_changed(&rwb->rqos); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
/* * Assign rwb and add the stats callback. @@ -847,10 +851,5 @@ int wbt_init(struct request_queue *q) rq_qos_add(q, &rwb->rqos); blk_stat_add_callback(q, rwb->cb);
- rwb->min_lat_nsec = wbt_default_latency_nsec(q); - - wbt_queue_depth_changed(&rwb->rqos); - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); - return 0; }
From: Yu Kuai yukuai3@huawei.com
stable inclusion from stable-v5.10.152 commit 31b1570677e8bf85f48be8eb95e21804399b8295 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60HVY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------
commit 285febabac4a16655372d23ff43e89ff6f216691 upstream.
commit 8c5035dfbb94 ("blk-wbt: call rq_qos_add() after wb_normal is initialized") moves wbt_set_write_cache() before rq_qos_add(), which is wrong because wbt_rq_qos() is still NULL.
Fix the problem by removing wbt_set_write_cache() and setting 'rwb->wc' directly. Note that this patch also removes the redundant setting of 'rwb->wc'.
Fixes: 8c5035dfbb94 ("blk-wbt: call rq_qos_add() after wb_normal is initialized") Reported-by: kernel test robot yujie.liu@intel.com Link: https://lore.kernel.org/r/202210081045.77ddf59b-yujie.liu@intel.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/20221009101038.1692875-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-wbt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index bafdb8098893..6f63920f073c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -838,12 +838,11 @@ int wbt_init(struct request_queue *q) rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - rwb->wc = 1; + rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_queue_depth_changed(&rwb->rqos); - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
/* * Assign rwb and add the stats callback.
From: Yu Kuai yukuai3@huawei.com
mainline inclusion from mainline-v5.16-rc2 commit 76dd298094f484c6250ebd076fa53287477b2328 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5VGU9 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Our syzkaller reported a null pointer dereference; the root cause is the following:
__blk_mq_alloc_map_and_rqs
 set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs
  blk_mq_alloc_map_and_rqs
   blk_mq_alloc_rqs
    // failed due to oom
    alloc_pages_node
    // set->tags[hctx_idx] is still NULL
    blk_mq_free_rqs
     drv_tags = set->tags[hctx_idx];
     // null pointer dereference is triggered
     blk_mq_clear_rq_mapping(drv_tags, ...)
This is because commit 63064be150e4 ("blk-mq: Add blk_mq_alloc_map_and_rqs()") merged the two steps:
1) set->tags[hctx_idx] = blk_mq_alloc_rq_map() 2) blk_mq_alloc_rqs(..., set->tags[hctx_idx])
into one step:
set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs()
Since tags is not initialized yet in this case, fix the problem by checking if tags is NULL pointer in blk_mq_clear_rq_mapping().
Fixes: 63064be150e4 ("blk-mq: Add blk_mq_alloc_map_and_rqs()") Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: John Garry john.garry@huawei.com Link: https://lore.kernel.org/r/20221011142253.4015966-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe axboe@kernel.dk Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 7fbbad7b08b3..b9b2d9412b02 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2450,8 +2450,11 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct page *page; unsigned long flags;
- /* There is no need to clear a driver tags own mapping */ - if (drv_tags == tags) + /* + * There is no need to clear mapping if driver tags is not initialized + * or the mapping belongs to the driver tags. + */ + if (!drv_tags || drv_tags == tags) return;
list_for_each_entry(page, &tags->page_list, lru) {
From: Michal Simek michal.simek@xilinx.com
mainline inclusion from mainline-v5.13-rc1 commit b991f8c3622c8c9d01a1ada382682a731932e651 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60OLE CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
Right now the handling order depends on how the entries come in, which corresponds to their order in the DT. We have hit a case with DT overlays where the conf and mux descriptions are exchanged, which ends up in a sequence where the firmware is asked to perform pin configuration before the pin has been requested.
The patch enforces the order that the pin is always requested first, followed by the pin configuration. This change ensures that the firmware gets requests in the right order.
Signed-off-by: Michal Simek michal.simek@xilinx.com Link: https://lore.kernel.org/r/cfbe01f791c2dd42a596cbda57e15599969b57aa.161536421... Signed-off-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Yuyao Lin linyuyao1@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pinctrl/core.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 840000870d5a..48ee9d1622b5 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -1258,13 +1258,34 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state)
p->state = NULL;
- /* Apply all the settings for the new state */ + /* Apply all the settings for the new state - pinmux first */ list_for_each_entry(setting, &state->settings, node) { switch (setting->type) { case PIN_MAP_TYPE_MUX_GROUP: ret = pinmux_enable_setting(setting); break; case PIN_MAP_TYPE_CONFIGS_PIN: + case PIN_MAP_TYPE_CONFIGS_GROUP: + break; + default: + ret = -EINVAL; + break; + } + + if (ret < 0) + goto unapply_new_state; + + /* Do not link hogs (circular dependency) */ + if (p != setting->pctldev->p) + pinctrl_link_add(setting->pctldev, p->dev); + } + + /* Apply all the settings for the new state - pinconf after */ + list_for_each_entry(setting, &state->settings, node) { + switch (setting->type) { + case PIN_MAP_TYPE_MUX_GROUP: + break; + case PIN_MAP_TYPE_CONFIGS_PIN: case PIN_MAP_TYPE_CONFIGS_GROUP: ret = pinconf_apply_setting(setting); break;
From: Michal Simek michal.simek@xilinx.com
mainline inclusion from mainline-v5.13-rc1 commit 6a37d750037827d385672acdebf5788fc2ffa633 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60OLE CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
A static analyzer tool found that the ret variable may be used uninitialized: the code expects ret >= 0 when pinconf is skipped in the first (pinmux) loop, and the same expectation holds for pinmux in the pinconf loop. That's why ret is initialized to 0, to avoid an uninitialized value in the first loop or reusing the first loop's value in the second.
Addresses-Coverity: ("Uninitialized variables") Signed-off-by: Michal Simek michal.simek@xilinx.com Cc: Dan Carpenter dan.carpenter@oracle.com Reviewed-by: Colin Ian King colin.king@canonical.com Link: https://lore.kernel.org/r/e5203bae68eb94b4b8b4e67e5e7b4d86bb989724.161553429... Signed-off-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Yuyao Lin linyuyao1@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pinctrl/core.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 48ee9d1622b5..a3097f0075aa 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -1266,6 +1266,7 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state) break; case PIN_MAP_TYPE_CONFIGS_PIN: case PIN_MAP_TYPE_CONFIGS_GROUP: + ret = 0; break; default: ret = -EINVAL; @@ -1284,6 +1285,7 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state) list_for_each_entry(setting, &state->settings, node) { switch (setting->type) { case PIN_MAP_TYPE_MUX_GROUP: + ret = 0; break; case PIN_MAP_TYPE_CONFIGS_PIN: case PIN_MAP_TYPE_CONFIGS_GROUP:
From: Li Lingfeng lilingfeng3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60QE9 CVE: NA
--------------------------------
As explained in 32c39e8a7613 ("block: fix use after free for bd_holder_dir"), we should make sure the "disk" is still live and then grab a reference to 'bd_holder_dir'. However, the "disk" should be "the claimed slave bdev" rather than "the holding disk".
Fixes: 32c39e8a7613 ("block: fix use after free for bd_holder_dir") Signed-off-by: Li Lingfeng lilingfeng3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c index c8aa41edc9bd..22d3a0f5152d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1269,7 +1269,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) * the holder directory. Hold on to it. */ down_read(&bdev->bd_disk->lookup_sem); - if (!(disk->flags & GENHD_FL_UP)) { + if (!(bdev->bd_disk->flags & GENHD_FL_UP)) { up_read(&bdev->bd_disk->lookup_sem); return -ENODEV; }
From: Chen Yu yu.c.chen@intel.com
mainline inclusion from mainline-v6.0-rc1 commit 70fb5ccf2ebb09a0c8ebba775041567812d45f86 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I61E4M
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
[Problem Statement] select_idle_cpu() might spend too much time searching for an idle CPU, when the system is overloaded.
The following histogram is the time spent in select_idle_cpu(), when running 224 instances of netperf on a system with 112 CPUs per LLC domain:
@usecs:
[0]                  533 |                                                    |
[1]                 5495 |                                                    |
[2, 4)             12008 |                                                    |
[4, 8)            239252 |                                                    |
[8, 16)          4041924 |@@@@@@@@@@@@@@                                      |
[16, 32)        12357398 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@        |
[32, 64)        14820255 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[64, 128)       13047682 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@       |
[128, 256)       8235013 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@                        |
[256, 512)       4507667 |@@@@@@@@@@@@@@@                                     |
[512, 1K)        2600472 |@@@@@@@@@                                           |
[1K, 2K)          927912 |@@@                                                 |
[2K, 4K)          218720 |                                                    |
[4K, 8K)           98161 |                                                    |
[8K, 16K)          37722 |                                                    |
[16K, 32K)          6715 |                                                    |
[32K, 64K)           477 |                                                    |
[64K, 128K)            7 |                                                    |
netperf latency usecs:
=======
case            load            Lat_99th        std%
TCP_RR          thread-224        257.39      (  0.21)
The time spent in select_idle_cpu() is visible to netperf and might have a negative impact.
[Symptom analysis] The patch [1] from Mel Gorman has been applied to track the efficiency of select_idle_sibling. Copy the indicators here:
SIS Search Efficiency(se_eff%): A ratio expressed as a percentage of runqueues scanned versus idle CPUs found. A 100% efficiency indicates that the target, prev or recent CPU of a task was idle at wakeup. The lower the efficiency, the more runqueues were scanned before an idle CPU was found.
SIS Domain Search Efficiency(dom_eff%): Similar, except only for the slower SIS patch.
SIS Fast Success Rate(fast_rate%): Percentage of SIS that used target, prev or recent CPUs.
SIS Success rate(success_rate%): Percentage of scans that found an idle CPU.
The test is based on Aubrey's schedtests tool, including netperf, hackbench, schbench and tbench.
Test on vanilla kernel:
schedstat_parse.py -f netperf_vanilla.log
case            load            se_eff%     dom_eff%   fast_rate%  success_rate%
TCP_RR          28 threads       99.978       18.535       99.995        100.000
TCP_RR          56 threads       99.397        5.671       99.964        100.000
TCP_RR          84 threads       21.721        6.818       73.632        100.000
TCP_RR          112 threads      12.500        5.533       59.000        100.000
TCP_RR          140 threads       8.524        4.535       49.020        100.000
TCP_RR          168 threads       6.438        3.945       40.309         99.999
TCP_RR          196 threads       5.397        3.718       32.320         99.982
TCP_RR          224 threads       4.874        3.661       25.775         99.767
UDP_RR          28 threads       99.988       17.704       99.997        100.000
UDP_RR          56 threads       99.528        5.977       99.970        100.000
UDP_RR          84 threads       24.219        6.992       76.479        100.000
UDP_RR          112 threads      13.907        5.706       62.538        100.000
UDP_RR          140 threads       9.408        4.699       52.519        100.000
UDP_RR          168 threads       7.095        4.077       44.352        100.000
UDP_RR          196 threads       5.757        3.775       35.764         99.991
UDP_RR          224 threads       5.124        3.704       28.748         99.860
schedstat_parse.py -f schbench_vanilla.log   (each group has 28 tasks)
case            load            se_eff%     dom_eff%   fast_rate%  success_rate%
normal          1 mthread        99.152        6.400       99.941        100.000
normal          2 mthreads       97.844        4.003       99.908        100.000
normal          3 mthreads       96.395        2.118       99.917         99.998
normal          4 mthreads       55.288        1.451       98.615         99.804
normal          5 mthreads        7.004        1.870       45.597         61.036
normal          6 mthreads        3.354        1.346       20.777         34.230
normal          7 mthreads        2.183        1.028       11.257         21.055
normal          8 mthreads        1.653        0.825        7.849         15.549
schedstat_parse.py -f hackbench_vanilla.log   (each group has 28 tasks)
case                load        se_eff%     dom_eff%   fast_rate%  success_rate%
process-pipe        1 group      99.991        7.692       99.999        100.000
process-pipe        2 groups     99.934        4.615       99.997        100.000
process-pipe        3 groups     99.597        3.198       99.987        100.000
process-pipe        4 groups     98.378        2.464       99.958        100.000
process-pipe        5 groups     27.474        3.653       89.811         99.800
process-pipe        6 groups     20.201        4.098       82.763         99.570
process-pipe        7 groups     16.423        4.156       77.398         99.316
process-pipe        8 groups     13.165        3.920       72.232         98.828
process-sockets     1 group      99.977        5.882       99.999        100.000
process-sockets     2 groups     99.927        5.505       99.996        100.000
process-sockets     3 groups     99.397        3.250       99.980        100.000
process-sockets     4 groups     79.680        4.258       98.864         99.998
process-sockets     5 groups      7.673        2.503       63.659         92.115
process-sockets     6 groups      4.642        1.584       58.946         88.048
process-sockets     7 groups      3.493        1.379       49.816         81.164
process-sockets     8 groups      3.015        1.407       40.845         75.500
threads-pipe        1 group      99.997        0.000      100.000        100.000
threads-pipe        2 groups     99.894        2.932       99.997        100.000
threads-pipe        3 groups     99.611        4.117       99.983        100.000
threads-pipe        4 groups     97.703        2.624       99.937        100.000
threads-pipe        5 groups     22.919        3.623       87.150         99.764
threads-pipe        6 groups     18.016        4.038       80.491         99.557
threads-pipe        7 groups     14.663        3.991       75.239         99.247
threads-pipe        8 groups     12.242        3.808       70.651         98.644
threads-sockets     1 group      99.990        6.667       99.999        100.000
threads-sockets     2 groups     99.940        5.114       99.997        100.000
threads-sockets     3 groups     99.469        4.115       99.977        100.000
threads-sockets     4 groups     87.528        4.038       99.400        100.000
threads-sockets     5 groups      6.942        2.398       59.244         88.337
threads-sockets     6 groups      4.359        1.954       49.448         87.860
threads-sockets     7 groups      2.845        1.345       41.198         77.102
threads-sockets     8 groups      2.871        1.404       38.512         74.312
schedstat_parse.py -f tbench_vanilla.log
case            load            se_eff%     dom_eff%   fast_rate%  success_rate%
loopback        28 threads       99.976       18.369       99.995        100.000
loopback        56 threads       99.222        7.799       99.934        100.000
loopback        84 threads       19.723        6.819       70.215        100.000
loopback        112 threads      11.283        5.371       55.371         99.999
loopback        140 threads       0.000        0.000        0.000          0.000
loopback        168 threads       0.000        0.000        0.000          0.000
loopback        196 threads       0.000        0.000        0.000          0.000
loopback        224 threads       0.000        0.000        0.000          0.000
According to the test above, if the system becomes busy, the SIS Search Efficiency(se_eff%) drops significantly. Although some benchmarks would finally find an idle CPU(success_rate% = 100%), it is doubtful whether it is worth it to search the whole LLC domain.
[Proposal] It would be ideal to have a crystal ball to answer this question: How many CPUs must a wakeup path walk down, before it can find an idle CPU? Many potential metrics could be used to predict the number. One candidate is the sum of util_avg in this LLC domain. The benefit of choosing util_avg is that it is a metric of accumulated historic activity, which seems to be smoother than instantaneous metrics (such as rq->nr_running). Besides, choosing the sum of util_avg would help predict the load of the LLC domain more precisely, because SIS_PROP uses one CPU's idle time to estimate the total LLC domain idle time.
In summary, the lower the util_avg is, the more select_idle_cpu() should scan for idle CPU, and vice versa. When the sum of util_avg in this LLC domain hits 85% or above, the scan stops. The reason to choose 85% as the threshold is that this is the imbalance_pct(117) when a LLC sched group is overloaded.
Introduce the quadratic function:
y = SCHED_CAPACITY_SCALE - p * x^2
and y'= y / SCHED_CAPACITY_SCALE

x is the ratio of sum_util compared to the CPU capacity:
x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)

y' is the ratio of CPUs to be scanned in the LLC domain, and the number of CPUs to scan is calculated by:
nr_scan = llc_weight * y'
Choosing quadratic function is because: [1] Compared to the linear function, it scans more aggressively when the sum_util is low. [2] Compared to the exponential function, it is easier to calculate. [3] It seems that there is no accurate mapping between the sum of util_avg and the number of CPUs to be scanned. Use heuristic scan for now.
For a platform with 112 CPUs per LLC, the number of CPUs to scan is:
sum_util%     0    5   15   25   35   45   55   65   75   85   86 ...
scan_nr     112  111  108  102   93   81   65   47   25    1    0 ...
For a platform with 16 CPUs per LLC, the number of CPUs to scan is:
sum_util%     0    5   15   25   35   45   55   65   75   85   86 ...
scan_nr      16   15   15   14   13   11    9    6    3    0    0 ...
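As a worked check of the formula, the stand-alone user-space sketch below only mirrors the arithmetic described above (it is not the kernel code); the 112-CPU LLC and imbalance_pct = 117 are the example values from the text. Compiled and run, it reproduces the 112-CPU row of the table: 112, 102, 81, 47 and 1 CPUs to scan at 0%, 25%, 45%, 65% and 85% sum_util.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024ULL

static unsigned long long nr_idle_scan(unsigned long long sum_util,
				       unsigned long long llc_weight,
				       unsigned long long pct)
{
	unsigned long long x, tmp, y;

	x = sum_util / llc_weight;			/* equation [3] */
	tmp = x * x * pct * pct / (10000 * SCHED_CAPACITY_SCALE);
	if (tmp > SCHED_CAPACITY_SCALE)
		tmp = SCHED_CAPACITY_SCALE;
	y = SCHED_CAPACITY_SCALE - tmp;			/* equation [4] */

	return y * llc_weight / SCHED_CAPACITY_SCALE;	/* equation [2] */
}

int main(void)
{
	unsigned long long util_pct[] = { 0, 25, 45, 65, 85 };
	int i;

	for (i = 0; i < 5; i++) {
		/* sum_util at util_pct[i]% of the LLC's total capacity */
		unsigned long long sum_util =
			util_pct[i] * 112 * SCHED_CAPACITY_SCALE / 100;

		printf("sum_util%%=%llu -> nr_scan=%llu\n",
		       util_pct[i], nr_idle_scan(sum_util, 112, 117));
	}
	return 0;
}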
Furthermore, to minimize the overhead of calculating the metrics in select_idle_cpu(), borrow the statistics from periodic load balance. As mentioned by Abel, on a platform with 112 CPUs per LLC, the sum_util calculated by periodic load balance after 112 ms would decay to about 0.5 * 0.5 * 0.5 * 0.7 = 8.75%, thus bringing a delay in reflecting the latest utilization. But it is a trade-off. Checking the util_avg in newidle load balance would be more frequent, but it brings overhead - multiple CPUs write/read the per-LLC shared variable and introduces cache contention. Tim also mentioned that, it is allowed to be non-optimal in terms of scheduling for the short-term variations, but if there is a long-term trend in the load behavior, the scheduler can adjust for that.
When SIS_UTIL is enabled, the select_idle_cpu() uses the nr_scan calculated by SIS_UTIL instead of the one from SIS_PROP. As Peter and Mel suggested, SIS_UTIL should be enabled by default.
This patch is based on the util_avg, which is very sensitive to the CPU frequency invariance. There is an issue that, when the max frequency has been clamp, the util_avg would decay insanely fast when the CPU is idle. Commit addca285120b ("cpufreq: intel_pstate: Handle no_turbo in frequency invariance") could be used to mitigate this symptom, by adjusting the arch_max_freq_ratio when turbo is disabled. But this issue is still not thoroughly fixed, because the current code is unaware of the user-specified max CPU frequency.
[Test result]
netperf and tbench were launched with 25% 50% 75% 100% 125% 150% 175% 200% of CPU number respectively. Hackbench and schbench were launched by 1, 2 ,4, 8 groups. Each test lasts for 100 seconds and repeats 3 times.
The following is the benchmark result comparison between baseline:vanilla v5.19-rc1 and compare:patched kernel. Positive compare% indicates better performance.
Each netperf test is a:
netperf -4 -H 127.0.1 -t TCP/UDP_RR -c -C -l 100
netperf.throughput
=======
case            load            baseline(std%)  compare%( std%)
TCP_RR          28 threads       1.00 (  0.34)    -0.16 (  0.40)
TCP_RR          56 threads       1.00 (  0.19)    -0.02 (  0.20)
TCP_RR          84 threads       1.00 (  0.39)    -0.47 (  0.40)
TCP_RR          112 threads      1.00 (  0.21)    -0.66 (  0.22)
TCP_RR          140 threads      1.00 (  0.19)    -0.69 (  0.19)
TCP_RR          168 threads      1.00 (  0.18)    -0.48 (  0.18)
TCP_RR          196 threads      1.00 (  0.16)  +194.70 ( 16.43)
TCP_RR          224 threads      1.00 (  0.16)  +197.30 (  7.85)
UDP_RR          28 threads       1.00 (  0.37)    +0.35 (  0.33)
UDP_RR          56 threads       1.00 ( 11.18)    -0.32 (  0.21)
UDP_RR          84 threads       1.00 (  1.46)    -0.98 (  0.32)
UDP_RR          112 threads      1.00 ( 28.85)    -2.48 ( 19.61)
UDP_RR          140 threads      1.00 (  0.70)    -0.71 ( 14.04)
UDP_RR          168 threads      1.00 ( 14.33)    -0.26 ( 11.16)
UDP_RR          196 threads      1.00 ( 12.92)  +186.92 ( 20.93)
UDP_RR          224 threads      1.00 ( 11.74)  +196.79 ( 18.62)
Take the 224 threads as an example, the SIS search metrics changes are illustrated below:
    vanilla                 patched
    4544492    +237.5%     15338634    sched_debug.cpu.sis_domain_search.avg
      38539  +39686.8%     15333634    sched_debug.cpu.sis_failed.avg
  128300000     -87.9%     15551326    sched_debug.cpu.sis_scanned.avg
    5842896    +162.7%     15347978    sched_debug.cpu.sis_search.avg
There is -87.9% less CPU scans after patched, which indicates lower overhead. Besides, with this patch applied, there is -13% less rq lock contention in perf-profile.calltrace.cycles-pp._raw_spin_lock.raw_spin_rq_lock_nested .try_to_wake_up.default_wake_function.woken_wake_function. This might help explain the performance improvement - Because this patch allows the waking task to remain on the previous CPU, rather than grabbing other CPUs' lock.
Each hackbench test is a:
hackbench -g $job --process/threads --pipe/sockets -l 1000000 -s 100
hackbench.throughput
=========
case                load        baseline(std%)  compare%( std%)
process-pipe        1 group      1.00 (  1.29)    +0.57 (  0.47)
process-pipe        2 groups     1.00 (  0.27)    +0.77 (  0.81)
process-pipe        4 groups     1.00 (  0.26)    +1.17 (  0.02)
process-pipe        8 groups     1.00 (  0.15)    -4.79 (  0.02)
process-sockets     1 group      1.00 (  0.63)    -0.92 (  0.13)
process-sockets     2 groups     1.00 (  0.03)    -0.83 (  0.14)
process-sockets     4 groups     1.00 (  0.40)    +5.20 (  0.26)
process-sockets     8 groups     1.00 (  0.04)    +3.52 (  0.03)
threads-pipe        1 group      1.00 (  1.28)    +0.07 (  0.14)
threads-pipe        2 groups     1.00 (  0.22)    -0.49 (  0.74)
threads-pipe        4 groups     1.00 (  0.05)    +1.88 (  0.13)
threads-pipe        8 groups     1.00 (  0.09)    -4.90 (  0.06)
threads-sockets     1 group      1.00 (  0.25)    -0.70 (  0.53)
threads-sockets     2 groups     1.00 (  0.10)    -0.63 (  0.26)
threads-sockets     4 groups     1.00 (  0.19)   +11.92 (  0.24)
threads-sockets     8 groups     1.00 (  0.08)    +4.31 (  0.11)
Each tbench test is a:
tbench -t 100 $job 127.0.0.1
tbench.throughput
======
case            load            baseline(std%)  compare%( std%)
loopback        28 threads       1.00 (  0.06)    -0.14 (  0.09)
loopback        56 threads       1.00 (  0.03)    -0.04 (  0.17)
loopback        84 threads       1.00 (  0.05)    +0.36 (  0.13)
loopback        112 threads      1.00 (  0.03)    +0.51 (  0.03)
loopback        140 threads      1.00 (  0.02)    -1.67 (  0.19)
loopback        168 threads      1.00 (  0.38)    +1.27 (  0.27)
loopback        196 threads      1.00 (  0.11)    +1.34 (  0.17)
loopback        224 threads      1.00 (  0.11)    +1.67 (  0.22)
Each schbench test is a:
schbench -m $job -t 28 -r 100 -s 30000 -c 30000
schbench.latency_90%_us
========
case            load            baseline(std%)  compare%( std%)
normal          1 mthread        1.00 ( 31.22)    -7.36 ( 20.25)*
normal          2 mthreads       1.00 (  2.45)    -0.48 (  1.79)
normal          4 mthreads       1.00 (  1.69)    +0.45 (  0.64)
normal          8 mthreads       1.00 (  5.47)    +9.81 ( 14.28)
*Considering the standard deviation, this -7.36% regression might not be valid.
Also, a OLTP workload with a commercial RDBMS has been tested, and there is no significant change.
There were concerns that unbalanced tasks among CPUs would cause problems. For example, suppose the LLC domain is composed of 8 CPUs, and 7 tasks are bound to CPU0~CPU6, while CPU7 is idle:
          CPU0  CPU1  CPU2  CPU3  CPU4  CPU5  CPU6  CPU7
util_avg  1024  1024  1024  1024  1024  1024  1024     0
Since the util_avg ratio is 87.5%( = 7/8 ), which is higher than 85%, select_idle_cpu() will not scan, thus CPU7 is undetected during scan. But according to Mel, it is unlikely the CPU7 will be idle all the time because CPU7 could pull some tasks via CPU_NEWLY_IDLE.
lkp(kernel test robot) has reported a regression on stress-ng.sock on a very busy system. According to the sched_debug statistics, it might be caused by SIS_UTIL terminates the scan and chooses a previous CPU earlier, and this might introduce more context switch, especially involuntary preemption, which impacts a busy stress-ng. This regression has shown that, not all benchmarks in every scenario benefit from idle CPU scan limit, and it needs further investigation.
Besides, there is slight regression in hackbench's 16 groups case when the LLC domain has 16 CPUs. Prateek mentioned that we should scan aggressively in an LLC domain with 16 CPUs. Because the cost to search for an idle one among 16 CPUs is negligible. The current patch aims to propose a generic solution and only considers the util_avg. Something like the below could be applied on top of the current patch to fulfill the requirement:
	if (llc_weight <= 16)
		nr_scan = nr_scan * 32 / llc_weight;
For LLC domain with 16 CPUs, the nr_scan will be expanded to 2 times large. The smaller the CPU number this LLC domain has, the larger nr_scan will be expanded. This needs further investigation.
There is also ongoing work[2] from Abel to filter out the busy CPUs during wakeup, to further speed up the idle CPU scan. And it could be a following-up optimization on top of this change.
Suggested-by: Tim Chen tim.c.chen@intel.com Suggested-by: Peter Zijlstra peterz@infradead.org Signed-off-by: Chen Yu yu.c.chen@intel.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Yicong Yang yangyicong@hisilicon.com Tested-by: Mohini Narkhede mohini.narkhede@intel.com Tested-by: K Prateek Nayak kprateek.nayak@amd.com Link: https://lore.kernel.org/r/20220612163428.849378-1-yu.c.chen@intel.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com Reviewed-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched/topology.h | 1 + kernel/sched/fair.c | 87 ++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 3 +- 3 files changed, 90 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index bff39305271d..fe4e222d5d2d 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -82,6 +82,7 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; + int nr_idle_scan; #ifdef CONFIG_SCHED_STEAL struct sparsemask *cfs_overload_cpus; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe2f527c71ed..56a25d73beea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6223,6 +6223,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; bool smt = test_idle_cores(target, false); int this = smp_processor_id(); struct sched_domain *this_sd; @@ -6253,6 +6254,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); }
+ if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { + /* because !--nr is the condition to stop scan */ + nr = READ_ONCE(sd_share->nr_idle_scan) + 1; + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; + } + } + for_each_cpu_wrap(cpu, cpus, target) { if (smt) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -9609,6 +9621,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; }
+static void update_idle_cpu_scan(struct lb_env *env, + unsigned long sum_util) +{ + struct sched_domain_shared *sd_share; + int llc_weight, pct; + u64 x, y, tmp; + /* + * Update the number of CPUs to scan in LLC domain, which could + * be used as a hint in select_idle_cpu(). The update of sd_share + * could be expensive because it is within a shared cache line. + * So the write of this hint only occurs during periodic load + * balancing, rather than CPU_NEWLY_IDLE, because the latter + * can fire way more frequently than the former. + */ + if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) + return; + + llc_weight = per_cpu(sd_llc_size, env->dst_cpu); + if (env->sd->span_weight != llc_weight) + return; + + sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + if (!sd_share) + return; + + /* + * The number of CPUs to search drops as sum_util increases, when + * sum_util hits 85% or above, the scan stops. + * The reason to choose 85% as the threshold is because this is the + * imbalance_pct(117) when a LLC sched group is overloaded. + * + * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] + * and y'= y / SCHED_CAPACITY_SCALE + * + * x is the ratio of sum_util compared to the CPU capacity: + * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) + * y' is the ratio of CPUs to be scanned in the LLC domain, + * and the number of CPUs to scan is calculated by: + * + * nr_scan = llc_weight * y' [2] + * + * When x hits the threshold of overloaded, AKA, when + * x = 100 / pct, y drops to 0. According to [1], + * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 + * + * Scale x by SCHED_CAPACITY_SCALE: + * x' = sum_util / llc_weight; [3] + * + * and finally [1] becomes: + * y = SCHED_CAPACITY_SCALE - + * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] + * + */ + /* equation [3] */ + x = sum_util; + do_div(x, llc_weight); + + /* equation [4] */ + pct = env->sd->imbalance_pct; + tmp = x * x * pct * pct; + do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); + tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); + y = SCHED_CAPACITY_SCALE - tmp; + + /* equation [2] */ + y *= llc_weight; + do_div(y, SCHED_CAPACITY_SCALE); + if ((int)y != sd_share->nr_idle_scan) + WRITE_ONCE(sd_share->nr_idle_scan, (int)y); +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -9621,6 +9704,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; + unsigned long sum_util = 0; int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON @@ -9658,6 +9742,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity;
+ sum_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups);
@@ -9691,6 +9776,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); } + + update_idle_cpu_scan(env, sum_util); }
#define NUMA_IMBALANCE_MIN 2 diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 97ed11bd25e7..7783da2a021d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -54,7 +54,8 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_PROP, true) +SCHED_FEAT(SIS_PROP, false) +SCHED_FEAT(SIS_UTIL, true)
#ifdef CONFIG_SCHED_STEAL /*
From: Guan Jing guanjing6@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61E4M CVE: NA
--------------------------------
The sched_domain_shared structure is only used as a pointer, and other drivers don't use it directly.
Signed-off-by: Guan Jing guanjing6@huawei.com Reviewed-by: zhangjialin zhangjialin11@huawei.com Reviewed-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index fe4e222d5d2d..249c98aef083 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -82,10 +82,10 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; - int nr_idle_scan; #ifdef CONFIG_SCHED_STEAL struct sparsemask *cfs_overload_cpus; #endif + KABI_EXTEND(int nr_idle_scan) };
struct sched_domain {
From: Guan Jing guanjing6@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I61E4M CVE: NA
--------------------------------
When doing wakeups, attempt to limit superfluous scans of the LLC domain. On ARM64, enable SIS_UTIL and disable SIS_PROP so that the idle CPU search is based on the sum of util_avg.
Signed-off-by: Guan Jing guanjing6@huawei.com Reviewed-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/features.h | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7783da2a021d..fef48f5be2fa 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -54,8 +54,13 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ +#ifdef CONFIG_ARM64 SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) +#else +SCHED_FEAT(SIS_PROP, true) +SCHED_FEAT(SIS_UTIL, false) +#endif
#ifdef CONFIG_SCHED_STEAL /*
From: Ziyang Xuan william.xuanziyang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61PL4 CVE: NA
--------------------------------
In the sockmap redirect scenario, destroying a sock while psock->ingress_msg is not empty triggers a warning like the following:
=================================================
WARNING: CPU: 0 PID: 0 at net/ipv4/af_inet.c:154 inet_sock_destruct+0x408/0x430
...
Call Trace:
 <IRQ>
 __sk_destruct+0x3d/0x590 net/core/sock.c:1784
 sk_destruct net/core/sock.c:1829 [inline]
 __sk_free+0x106/0x2a0 net/core/sock.c:1840
 sk_free+0x7d/0xb0 net/core/sock.c:1851
 sock_put include/net/sock.h:1813 [inline]
 tcp_v4_rcv+0x23af/0x26e0 net/ipv4/tcp_ipv4.c:2085
 ip_protocol_deliver_rcu+0xe5/0x440 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0xd2/0x110 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:304 [inline]
 ip_local_deliver+0x10a/0x260 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:459 [inline]
 ip_rcv_finish+0x126/0x160 net/ipv4/ip_input.c:428
 NF_HOOK include/linux/netfilter.h:304 [inline]
 ip_rcv+0xbf/0x1d0 net/ipv4/ip_input.c:539
 __netif_receive_skb_one_core+0x15f/0x190 net/core/dev.c:5366
 __netif_receive_skb+0x2e/0xe0 net/core/dev.c:5480
 process_backlog+0x132/0x2c0 net/core/dev.c:6386
 napi_poll+0x17e/0x4f0 net/core/dev.c:6837
 net_rx_action+0x183/0x3c0 net/core/dev.c:6907
That is because commit 7e41dfae18b1 ("[Huawei] bpf, sockmap: Add sk_rmem_alloc check for sockmap") does not consider the redirect scenario: it reduces sk_rmem_alloc without having increased it, which results in an sk_rmem_alloc underflow.
Fixes: 8818e269f18d ("bpf, sockmap: Add sk_rmem_alloc check for sockmap") Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/skmsg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9dec3d35af79..448a0d24a734 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -659,7 +659,8 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); - atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); }
From: Luís Henriques lhenriques@suse.de
stable inclusion from stable-v5.10.146 commit 958b0ee23f5ac106e7cc11472b71aa2ea9a033bc category: bugfix bugzilla: 187444, https://gitee.com/openeuler/kernel/issues/I6261Z CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 29a5b8a137ac8eb410cc823653a29ac0e7b7e1b0 upstream.
When walking through an inode's extents, the ext4_ext_binsearch_idx() function assumes that the extent header has been previously validated. However, there are no checks that verify that the number of entries (eh->eh_entries) is non-zero when depth is > 0. This leads to problems because EXT_FIRST_INDEX() and EXT_LAST_INDEX() will return garbage and result in this:
[ 135.245946] ------------[ cut here ]------------ [ 135.247579] kernel BUG at fs/ext4/extents.c:2258! [ 135.249045] invalid opcode: 0000 [#1] PREEMPT SMP [ 135.250320] CPU: 2 PID: 238 Comm: tmp118 Not tainted 5.19.0-rc8+ #4 [ 135.252067] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 [ 135.255065] RIP: 0010:ext4_ext_map_blocks+0xc20/0xcb0 [ 135.256475] Code: [ 135.261433] RSP: 0018:ffffc900005939f8 EFLAGS: 00010246 [ 135.262847] RAX: 0000000000000024 RBX: ffffc90000593b70 RCX: 0000000000000023 [ 135.264765] RDX: ffff8880038e5f10 RSI: 0000000000000003 RDI: ffff8880046e922c [ 135.266670] RBP: ffff8880046e9348 R08: 0000000000000001 R09: ffff888002ca580c [ 135.268576] R10: 0000000000002602 R11: 0000000000000000 R12: 0000000000000024 [ 135.270477] R13: 0000000000000000 R14: 0000000000000024 R15: 0000000000000000 [ 135.272394] FS: 00007fdabdc56740(0000) GS:ffff88807dd00000(0000) knlGS:0000000000000000 [ 135.274510] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 135.276075] CR2: 00007ffc26bd4f00 CR3: 0000000006261004 CR4: 0000000000170ea0 [ 135.277952] Call Trace: [ 135.278635] <TASK> [ 135.279247] ? preempt_count_add+0x6d/0xa0 [ 135.280358] ? percpu_counter_add_batch+0x55/0xb0 [ 135.281612] ? _raw_read_unlock+0x18/0x30 [ 135.282704] ext4_map_blocks+0x294/0x5a0 [ 135.283745] ? xa_load+0x6f/0xa0 [ 135.284562] ext4_mpage_readpages+0x3d6/0x770 [ 135.285646] read_pages+0x67/0x1d0 [ 135.286492] ? folio_add_lru+0x51/0x80 [ 135.287441] page_cache_ra_unbounded+0x124/0x170 [ 135.288510] filemap_get_pages+0x23d/0x5a0 [ 135.289457] ? path_openat+0xa72/0xdd0 [ 135.290332] filemap_read+0xbf/0x300 [ 135.291158] ? _raw_spin_lock_irqsave+0x17/0x40 [ 135.292192] new_sync_read+0x103/0x170 [ 135.293014] vfs_read+0x15d/0x180 [ 135.293745] ksys_read+0xa1/0xe0 [ 135.294461] do_syscall_64+0x3c/0x80 [ 135.295284] entry_SYSCALL_64_after_hwframe+0x46/0xb0
This patch simply adds an extra check in __ext4_ext_check(), verifying that eh_entries is not 0 when eh_depth is > 0.
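To make the failure mode concrete, the sketch below mirrors the validation order with simplified, made-up types (not the on-disk ext4 structures): when the header claims a depth greater than zero it must also describe at least one index entry, otherwise the first/last index macros point at uninitialized data.

/* Simplified model of the extent-header sanity check. */
struct eh_sketch {
	unsigned short eh_entries;
	unsigned short eh_max;
	unsigned short eh_depth;
};

static const char *check_header_sketch(const struct eh_sketch *eh)
{
	if (eh->eh_entries > eh->eh_max)
		return "invalid eh_entries";
	if (eh->eh_entries == 0 && eh->eh_depth > 0)
		return "eh_entries is 0 but eh_depth is > 0";
	return NULL;	/* header looks sane */
}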
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215941 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216283 Cc: Baokun Li libaokun1@huawei.com Cc: stable@kernel.org Signed-off-by: Luís Henriques lhenriques@suse.de Reviewed-by: Jan Kara jack@suse.cz Reviewed-by: Baokun Li libaokun1@huawei.com Link: https://lore.kernel.org/r/20220822094235.2690-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o tytso@mit.edu Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/extents.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6202bd153934..e42a78170109 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -459,6 +459,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } + if (unlikely((eh->eh_entries == 0) && (depth > 0))) { + error_msg = "eh_entries is 0 but eh_depth is > 0"; + goto corrupted; + } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted;
From: Yuyao Lin linyuyao1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61XP8
--------------------------------
This reverts commit 098b0e01a91c42aaaf0425605cd126b03fcb0bcf.
Commit cb47755725da ("time: Prevent undefined behaviour in timespec64_to_ns()") added upper and lower limit checks to timespec64_to_ns(), while timespec64_to_ktime() only checks the upper limit, so reverting this patch fixes the overflow.
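The difference between the two conversions can be shown with a standalone sketch (simplified stand-ins, not the kernel implementations): the timespec64_to_ns() path saturates at both ends, while a conversion that only checks the upper bound lets a large negative tv_sec overflow the multiplication.

#include <stdint.h>

#define NSEC_PER_SEC	1000000000LL
#define KTIME_MAX	INT64_MAX
#define KTIME_MIN	(-KTIME_MAX - 1)
#define KTIME_SEC_MAX	(KTIME_MAX / NSEC_PER_SEC)
#define KTIME_SEC_MIN	(KTIME_MIN / NSEC_PER_SEC)

struct ts64_sketch { int64_t tv_sec; long tv_nsec; };

/* clamps both ends, like timespec64_to_ns() after cb47755725da */
static int64_t ts_to_ns_clamped(const struct ts64_sketch *ts)
{
	if (ts->tv_sec >= KTIME_SEC_MAX)
		return KTIME_MAX;
	if (ts->tv_sec <= KTIME_SEC_MIN)
		return KTIME_MIN;
	return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
}

/* clamps only the upper end: a hugely negative tv_sec can overflow */
static int64_t ts_to_ns_upper_only(const struct ts64_sketch *ts)
{
	if (ts->tv_sec >= KTIME_SEC_MAX)
		return KTIME_MAX;
	return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
}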
Signed-off-by: Yuyao Lin linyuyao1@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/time/posix-cpu-timers.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index d6b46346be1f..c97bc3e3b210 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -591,11 +591,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, return -ESRCH; }
- /* - * Use the to_ktime conversion because that clamps the maximum - * value to KTIME_MAX and avoid multiplication overflows. - */ - new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value)); + new_expires = timespec64_to_ns(&new->it_value);
/* * Protect against sighand release/switch in exit/exec and p->cpu_timers
From: GUO Zihua guozihua@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I62DVN CVE: NA
--------------------------------
Syzkaller reported a UAF in mpi_key_length().
BUG: KASAN: use-after-free in mpi_key_length+0x34/0xb0 Read of size 2 at addr ffff888005737e14 by task syz-executor.15/6236
CPU: 1 PID: 6236 Comm: syz-executor.15 Kdump: loaded Tainted: GF OE 5.10.0.kasan.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-20220525_182517-szxrtosci10000 04/01/2014 Call Trace: dump_stack+0x9c/0xd3 print_address_description.constprop.0+0x19/0x170 __kasan_report.cold+0x6c/0x84 kasan_report+0x3a/0x50 check_memory_region+0xfd/0x1f0 mpi_key_length+0x34/0xb0 pgp_calc_pkey_keyid.isra.0+0x100/0x5a0 pgp_generate_fingerprint+0x159/0x330 pgp_process_public_key+0x1c5/0x330 pgp_parse_packets+0xf4/0x200 pgp_key_parse+0xb6/0x340 asymmetric_key_preparse+0x8a/0x120 key_create_or_update+0x31f/0x8c0 __se_sys_add_key+0x23e/0x400 do_syscall_64+0x30/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6
The root cause of the issue is that pgp_calc_pkey_keyid() would call mpi_key_length() to get the length of the public key. The length was then deducted from keylen, which is an unsigned value. However, the returned byte count is not checked for legitimacy in mpi_key_length(), so keylen can wrap around, hence the read overflow.
It turns out that the byte count check was mistakenly left in mpi_read_from_buffer() when commit 94479061ec5b ("mpi: introduce mpi_key_length()") extracted mpi_key_length() out of mpi_read_from_buffer(). This patch moves the check into mpi_key_length().
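The relocated check reduces to the following sketch (illustrative names, not the lib/mpi code): derive the byte count from the announced bit length and refuse it when the two header bytes plus the payload would not fit in the caller's buffer.

static int mpi_len_check_sketch(unsigned int nbits, unsigned int buflen,
				unsigned int *nbytes_out)
{
	unsigned int nbytes = (nbits + 7) / 8;	/* DIV_ROUND_UP(nbits, 8) */

	if (buflen < 2 || nbytes + 2 > buflen)
		return -1;	/* would read past the supplied buffer */

	*nbytes_out = nbytes;
	return 0;
}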
Fixes: commit 94479061ec5b ("mpi: introduce mpi_key_length()") Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- lib/mpi/mpicoder.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index 51a8fc758021..19b8ce9aa5e3 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -83,7 +83,7 @@ int mpi_key_length(const void *xbuffer, unsigned int ret_nread, unsigned int *nbits_arg, unsigned int *nbytes_arg) { const uint8_t *buffer = xbuffer; - unsigned int nbits; + unsigned int nbits, nbytes;
if (ret_nread < 2) return -EINVAL; @@ -94,10 +94,17 @@ int mpi_key_length(const void *xbuffer, unsigned int ret_nread, return -EINVAL; }
+ nbytes = DIV_ROUND_UP(nbits, 8); + if (nbytes + 2 > ret_nread) { + pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n", + nbytes, ret_nread); + return -EINVAL; + } + if (nbits_arg) *nbits_arg = nbits; if (nbytes_arg) - *nbytes_arg = DIV_ROUND_UP(nbits, 8); + *nbytes_arg = nbytes;
return 0; } @@ -114,12 +121,6 @@ MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) if (ret < 0) return ERR_PTR(ret);
- if (nbytes + 2 > *ret_nread) { - pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n", - nbytes, *ret_nread); - return ERR_PTR(-EINVAL); - } - val = mpi_read_raw_data(buffer + 2, nbytes); if (!val) return ERR_PTR(-ENOMEM);
From: Jian Shen shenjian15@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62HX2
----------------------------------------------------------------------
Currently, the PF checks whether a VF is alive by the KEEP_ALIVE mailbox from the VF. The VF keeps sending the mailbox every 2 seconds. Once the PF has lost the mailbox for more than 8 seconds, it regards the VF as abnormal and stops notifying state changes to the VF, including link state, VF MAC and reset, even if it receives the KEEP_ALIVE mailbox again. That is unreasonable.
This patch fixes it. The PF records the state changes that need to be notified to the VF while the VF's KEEP_ALIVE mailbox is lost, and notifies the VF when the mailbox is received again. Introduce a new flag HCLGE_VPORT_STATE_INITED to distinguish whether the VF driver is loaded or not. Since the VF queries these states when initializing, it is unnecessary to notify it in that case.
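In outline (a sketch with invented names, not the hclge code), the keep-alive handler only has to replay the deferred notifications when a VF whose driver has already been initialized comes back to life:

#define VPORT_INITED	0x1	/* VF driver has been loaded */
#define VPORT_ALIVE	0x2	/* heartbeat seen within the last 8s */

struct vport_sketch {
	unsigned int state;
	unsigned int need_notify;	/* bitmask of deferred events */
};

static void keep_alive_sketch(struct vport_sketch *v)
{
	if ((v->state & VPORT_INITED) && !(v->state & VPORT_ALIVE)) {
		v->state |= VPORT_ALIVE;
		if (v->need_notify) {
			/* replay reset / vlan / link events recorded while
			 * the heartbeat was lost, then clear the record */
			v->need_notify = 0;
		}
	}
}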
Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Jian Shen shenjian15@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 51 +++++++++---- .../hisilicon/hns3/hns3pf/hclge_main.h | 7 ++ .../hisilicon/hns3/hns3pf/hclge_mbx.c | 71 ++++++++++++++++--- 3 files changed, 106 insertions(+), 23 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c14e52ff154b..b444dbb42c48 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3816,9 +3816,17 @@ static int hclge_set_all_vf_rst(struct hclge_dev *hdev, bool reset) return ret; }
- if (!reset || !test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) + if (!reset || + !test_bit(HCLGE_VPORT_STATE_INITED, &vport->state)) continue;
+ if (!test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state) && + hdev->reset_type == HNAE3_FUNC_RESET) { + set_bit(HCLGE_VPORT_NEED_NOTIFY_RESET, + &vport->need_notify); + continue; + } + /* Inform VF to process the reset. * hclge_inform_reset_assert_to_vf may fail if VF * driver is not loaded. @@ -4541,12 +4549,15 @@ static void hclge_update_vport_alive(struct hclge_dev *hdev) for (i = 1; i < hdev->num_alloc_vport; i++) { struct hclge_vport *vport = &hdev->vport[i];
- if (time_after(jiffies, vport->last_active_jiffies + 8 * HZ)) + if (!test_bit(HCLGE_VPORT_STATE_INITED, &vport->state) || + !test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) + continue; + if (time_after(jiffies, vport->last_active_jiffies + 8 * HZ)) { clear_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); - - /* If vf is not alive, set to default value */ - if (!test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) - vport->mps = HCLGE_MAC_DEFAULT_FRAME; + dev_warn(&hdev->pdev->dev, + "vf %u heartbeat timeout\n", + i - HCLGE_VF_VPORT_START_NUM); + } } }
@@ -8256,9 +8267,11 @@ int hclge_vport_start(struct hclge_vport *vport) { struct hclge_dev *hdev = vport->back;
+ set_bit(HCLGE_VPORT_STATE_INITED, &vport->state); set_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, &vport->state); vport->last_active_jiffies = jiffies; + vport->need_notify = 0;
if (test_bit(vport->vport_id, hdev->vport_config_block)) { if (vport->vport_id) { @@ -8276,7 +8289,9 @@ int hclge_vport_start(struct hclge_vport *vport)
void hclge_vport_stop(struct hclge_vport *vport) { + clear_bit(HCLGE_VPORT_STATE_INITED, &vport->state); clear_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); + vport->need_notify = 0; }
static int hclge_client_start(struct hnae3_handle *handle) @@ -9429,7 +9444,8 @@ static int hclge_set_vf_mac(struct hnae3_handle *handle, int vf, return 0; }
- dev_info(&hdev->pdev->dev, "MAC of VF %d has been set to %s\n", + dev_info(&hdev->pdev->dev, + "MAC of VF %d has been set to %s, will be active after vf reset\n", vf, format_mac_addr); return 0; } @@ -10692,12 +10708,16 @@ static int hclge_set_vf_vlan_filter(struct hnae3_handle *handle, int vfid, * for DEVICE_VERSION_V3, vf doesn't need to know about the port based * VLAN state. */ - if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3 && - test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) - (void)hclge_push_vf_port_base_vlan_info(&hdev->vport[0], - vport->vport_id, - state, &vlan_info); - + if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3) { + if (test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) + (void)hclge_push_vf_port_base_vlan_info(&hdev->vport[0], + vport->vport_id, + state, + &vlan_info); + else + set_bit(HCLGE_VPORT_NEED_NOTIFY_VF_VLAN, + &vport->need_notify); + } return 0; }
@@ -12237,7 +12257,7 @@ static void hclge_reset_vport_state(struct hclge_dev *hdev) int i;
for (i = 0; i < hdev->num_alloc_vport; i++) { - hclge_vport_stop(vport); + clear_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); vport++; } } @@ -13240,6 +13260,9 @@ static void hclge_clear_vport_vf_info(struct hclge_vport *vport, int vfid) struct hclge_vlan_info vlan_info; int ret;
+ hclge_vport_stop(vport); + vport->mps = 0; + /* after disable sriov, clean VF rate configured by PF */ ret = hclge_tm_qs_shaper_cfg(vport, 0); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 42e2d0c77d33..0e3c3382a5eb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -977,9 +977,15 @@ enum HCLGE_VPORT_STATE { HCLGE_VPORT_STATE_MAC_TBL_CHANGE, HCLGE_VPORT_STATE_PROMISC_CHANGE, HCLGE_VPORT_STATE_VLAN_FLTR_CHANGE, + HCLGE_VPORT_STATE_INITED, HCLGE_VPORT_STATE_MAX };
+enum HCLGE_VPORT_NEED_NOTIFY { + HCLGE_VPORT_NEED_NOTIFY_RESET, + HCLGE_VPORT_NEED_NOTIFY_VF_VLAN, +}; + struct hclge_vlan_info { u16 vlan_proto; /* so far support 802.1Q only */ u16 qos; @@ -1027,6 +1033,7 @@ struct hclge_vport { struct hnae3_handle roh;
unsigned long state; + unsigned long need_notify; unsigned long last_active_jiffies; u32 mps; /* Max packet size */ struct hclge_vf_info vf_info; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index a7b06c63143c..04ff9bf12185 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -124,17 +124,26 @@ static int hclge_send_mbx_msg(struct hclge_vport *vport, u8 *msg, u16 msg_len, return status; }
+static int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type) +{ + __le16 msg_data; + u8 dest_vfid; + + dest_vfid = (u8)vport->vport_id; + msg_data = cpu_to_le16(reset_type); + + /* send this requested info to VF */ + return hclge_send_mbx_msg(vport, (u8 *)&msg_data, sizeof(msg_data), + HCLGE_MBX_ASSERTING_RESET, dest_vfid); +} + int hclge_inform_reset_assert_to_vf(struct hclge_vport *vport) { struct hclge_dev *hdev = vport->back; - __le16 msg_data; u16 reset_type; - u8 dest_vfid;
BUILD_BUG_ON(HNAE3_MAX_RESET > U16_MAX);
- dest_vfid = (u8)vport->vport_id; - if (hdev->reset_type == HNAE3_FUNC_RESET) reset_type = HNAE3_VF_PF_FUNC_RESET; else if (hdev->reset_type == HNAE3_FLR_RESET) @@ -142,11 +151,7 @@ int hclge_inform_reset_assert_to_vf(struct hclge_vport *vport) else reset_type = HNAE3_VF_FUNC_RESET;
- msg_data = cpu_to_le16(reset_type); - - /* send this requested info to VF */ - return hclge_send_mbx_msg(vport, (u8 *)&msg_data, sizeof(msg_data), - HCLGE_MBX_ASSERTING_RESET, dest_vfid); + return hclge_inform_vf_reset(vport, reset_type); }
static void hclge_free_vector_ring_chain(struct hnae3_ring_chain_node *head) @@ -652,9 +657,56 @@ static int hclge_reset_vf(struct hclge_vport *vport) return hclge_func_reset_cmd(hdev, vport->vport_id); }
+static void hclge_notify_vf_config(struct hclge_vport *vport) +{ + struct hclge_dev *hdev = vport->back; + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + struct hclge_port_base_vlan_config *vlan_cfg; + int ret; + + hclge_push_vf_link_status(vport); + if (test_bit(HCLGE_VPORT_NEED_NOTIFY_RESET, &vport->need_notify)) { + ret = hclge_inform_vf_reset(vport, HNAE3_VF_PF_FUNC_RESET); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to inform VF %u reset!", + vport->vport_id - HCLGE_VF_VPORT_START_NUM); + return; + } + vport->need_notify = 0; + return; + } + + if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3 && + test_bit(HCLGE_VPORT_NEED_NOTIFY_VF_VLAN, &vport->need_notify)) { + vlan_cfg = &vport->port_base_vlan_cfg; + ret = hclge_push_vf_port_base_vlan_info(&hdev->vport[0], + vport->vport_id, + vlan_cfg->state, + &vlan_cfg->vlan_info); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to inform VF %u port base vlan!", + vport->vport_id - HCLGE_VF_VPORT_START_NUM); + return; + } + clear_bit(HCLGE_VPORT_NEED_NOTIFY_VF_VLAN, &vport->need_notify); + } +} + static void hclge_vf_keep_alive(struct hclge_vport *vport) { + struct hclge_dev *hdev = vport->back; + vport->last_active_jiffies = jiffies; + + if (test_bit(HCLGE_VPORT_STATE_INITED, &vport->state) && + !test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) { + set_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); + dev_info(&hdev->pdev->dev, "VF %u is alive!", + vport->vport_id - HCLGE_VF_VPORT_START_NUM); + hclge_notify_vf_config(vport); + } }
static int hclge_set_vf_mtu(struct hclge_vport *vport, @@ -954,6 +1006,7 @@ static int hclge_mbx_vf_uninit_handler(struct hclge_mbx_ops_param *param) hclge_rm_vport_all_mac_table(param->vport, true, HCLGE_MAC_ADDR_MC); hclge_rm_vport_all_vlan_table(param->vport, true); + param->vport->mps = 0; return 0; }
From: Jian Shen shenjian15@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62HX2
----------------------------------------------------------------------
Device version V3 supports queue bonding, which can identify the tuple information of a TCP stream and create flow director rules automatically, in order to keep the tx and rx packets of the stream in the same queue pair. The driver sets the FD_ADD field of the TX BD for a TCP SYN packet, and sets the FD_DEL field for a TCP FIN or RST packet. The hardware creates or removes an fd rule according to the TX BD, and it also supports aging out a rule if it is not hit for a long time.
Queue bonding mode is disabled by default, and can be enabled/disabled with the ethtool priv-flags command.
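The per-packet decision described above can be sketched as follows (made-up flag values, not the real TX BD encoding): add a rule on SYN, delete it on FIN/RST, and leave it alone for ordinary segments. The driver additionally re-asserts the rule after a number of mid-stream packets as a refresh, which the sketch omits.

enum fd_op_sketch { FD_OP_NONE = 0, FD_OP_ADD, FD_OP_DEL };

struct tcp_flags_sketch { unsigned syn:1, fin:1, rst:1; };

static enum fd_op_sketch qb_fd_op(const struct tcp_flags_sketch *f)
{
	if (f->fin || f->rst)
		return FD_OP_DEL;	/* stream ends: remove the hw rule */
	if (f->syn)
		return FD_OP_ADD;	/* new stream: let hw create a rule */
	return FD_OP_NONE;		/* mid-stream: nothing to change */
}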
Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Jian Shen shenjian15@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 7 ++ .../ethernet/hisilicon/hns3/hns3_debugfs.c | 4 +- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 83 +++++++++++- .../net/ethernet/hisilicon/hns3/hns3_enet.h | 12 +- .../ethernet/hisilicon/hns3/hns3_ethtool.c | 13 +- .../hisilicon/hns3/hns3pf/hclge_cmd.h | 6 + .../hisilicon/hns3/hns3pf/hclge_main.c | 119 +++++++++++++++++- .../hisilicon/hns3/hns3pf/hclge_main.h | 3 + 8 files changed, 238 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index cedcac37145a..0b1d4b72ac14 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -553,6 +553,10 @@ struct hnae3_ae_dev { * Check if any cls flower rule exist * dbg_read_cmd * Execute debugfs read command. + * request_flush_qb_config + * Request to update queue bonding configuration + * query_fd_qb_state + * Query whether hw queue bonding enabled * set_tx_hwts_info * Save information for 1588 tx packet * get_rx_hwts @@ -746,6 +750,8 @@ struct hnae3_ae_ops { struct ethtool_link_ksettings *cmd); int (*set_phy_link_ksettings)(struct hnae3_handle *handle, const struct ethtool_link_ksettings *cmd); + void (*request_flush_qb_config)(struct hnae3_handle *handle); + bool (*query_fd_qb_state)(struct hnae3_handle *handle); bool (*set_tx_hwts_info)(struct hnae3_handle *handle, struct sk_buff *skb); void (*get_rx_hwts)(struct hnae3_handle *handle, struct sk_buff *skb, @@ -862,6 +868,7 @@ struct hnae3_roh_private_info { enum hnae3_pflag { HNAE3_PFLAG_LIMIT_PROMISC, HNAE3_PFLAG_PUSH_ENABLE, + HNAE3_PFLAG_FD_QB_ENABLE, HNAE3_PFLAG_MAX };
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 0f8f5c466871..a31b10748ac1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -927,7 +927,7 @@ static const struct hns3_dbg_item tx_bd_info_items[] = { { "OT_VLAN_TAG", 3 }, { "TV", 5 }, { "OLT_VLAN_LEN", 2 }, - { "PAYLEN_OL4CS", 2 }, + { "PAYLEN_FDOP_OL4CS", 2 }, { "BD_FE_SC_VLD", 2 }, { "MSS_HW_CSUM", 0 }, }; @@ -947,7 +947,7 @@ static void hns3_dump_tx_bd_info(struct hns3_nic_priv *priv, sprintf(result[j++], "%u", le16_to_cpu(desc->tx.tv)); sprintf(result[j++], "%u", le32_to_cpu(desc->tx.ol_type_vlan_len_msec)); - sprintf(result[j++], "%#x", le32_to_cpu(desc->tx.paylen_ol4cs)); + sprintf(result[j++], "%#x", le32_to_cpu(desc->tx.paylen_fdop_ol4cs)); sprintf(result[j++], "%#x", le16_to_cpu(desc->tx.bdtp_fe_sc_vld_ra_ri)); sprintf(result[j++], "%u", le16_to_cpu(desc->tx.mss_hw_csum)); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 119a1600eb94..ea095a238685 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1543,6 +1543,73 @@ static int hns3_handle_vtags(struct hns3_enet_ring *tx_ring, return 0; }
+static bool hns3_query_fd_qb_state(struct hnae3_handle *handle) +{ + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (!test_bit(HNAE3_PFLAG_FD_QB_ENABLE, &handle->priv_flags)) + return false; + + if (!ops->query_fd_qb_state) + return false; + + return ops->query_fd_qb_state(handle); +} + +/* fd_op is the field of tx bd indicates hw whether to add or delete + * a qb rule or do nothing. + */ +static u8 hns3_fd_qb_handle(struct hns3_enet_ring *ring, struct sk_buff *skb) +{ + struct hnae3_handle *handle = ring->tqp->handle; + union l4_hdr_info l4; + union l3_hdr_info l3; + u8 l4_proto_tmp = 0; + __be16 frag_off; + u8 ip_version; + u8 fd_op = 0; + + if (!hns3_query_fd_qb_state(handle)) + return 0; + + if (skb->encapsulation) { + ip_version = inner_ip_hdr(skb)->version; + l3.hdr = skb_inner_network_header(skb); + l4.hdr = skb_inner_transport_header(skb); + } else { + ip_version = ip_hdr(skb)->version; + l3.hdr = skb_network_header(skb); + l4.hdr = skb_transport_header(skb); + } + + if (ip_version == IP_VERSION_IPV6) { + unsigned char *exthdr; + + exthdr = l3.hdr + sizeof(*l3.v6); + l4_proto_tmp = l3.v6->nexthdr; + if (l4.hdr != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, + &l4_proto_tmp, &frag_off); + } else if (ip_version == IP_VERSION_IPV4) { + l4_proto_tmp = l3.v4->protocol; + } + + if (l4_proto_tmp != IPPROTO_TCP) + return 0; + + ring->fd_qb_tx_sample++; + if (l4.tcp->fin || l4.tcp->rst) { + hnae3_set_bit(fd_op, HNS3_TXD_FD_DEL_B, 1); + ring->fd_qb_tx_sample = 0; + } else if (l4.tcp->syn || + ring->fd_qb_tx_sample >= HNS3_FD_QB_FORCE_CNT_MAX) { + hnae3_set_bit(fd_op, HNS3_TXD_FD_ADD_B, 1); + ring->fd_qb_tx_sample = 0; + } + + return fd_op; +} + /* check if the hardware is capable of checksum offloading */ static bool hns3_check_hw_tx_csum(struct sk_buff *skb) { @@ -1560,7 +1627,7 @@ static bool hns3_check_hw_tx_csum(struct sk_buff *skb) }
struct hns3_desc_param { - u32 paylen_ol4cs; + u32 paylen_fdop_ol4cs; u32 ol_type_vlan_len_msec; u32 type_cs_vlan_tso; u16 mss_hw_csum; @@ -1570,7 +1637,7 @@ struct hns3_desc_param {
static void hns3_init_desc_data(struct sk_buff *skb, struct hns3_desc_param *pa) { - pa->paylen_ol4cs = skb->len; + pa->paylen_fdop_ol4cs = skb->len; pa->ol_type_vlan_len_msec = 0; pa->type_cs_vlan_tso = 0; pa->mss_hw_csum = 0; @@ -1638,7 +1705,7 @@ static int hns3_handle_csum_partial(struct hns3_enet_ring *ring, return ret; }
- ret = hns3_set_tso(skb, ¶m->paylen_ol4cs, ¶m->mss_hw_csum, + ret = hns3_set_tso(skb, ¶m->paylen_fdop_ol4cs, ¶m->mss_hw_csum, ¶m->type_cs_vlan_tso, &desc_cb->send_bytes); if (unlikely(ret < 0)) { hns3_ring_stats_update(ring, tx_tso_err); @@ -1652,6 +1719,7 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, struct hns3_desc_cb *desc_cb) { struct hns3_desc_param param; + u8 fd_op; int ret;
hns3_init_desc_data(skb, ¶m); @@ -1667,11 +1735,15 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, return ret; }
+ fd_op = hns3_fd_qb_handle(ring, skb); + hnae3_set_field(param.paylen_fdop_ol4cs, HNS3_TXD_FD_OP_M, + HNS3_TXD_FD_OP_S, fd_op); + /* Set txbd */ desc->tx.ol_type_vlan_len_msec = cpu_to_le32(param.ol_type_vlan_len_msec); desc->tx.type_cs_vlan_tso_len = cpu_to_le32(param.type_cs_vlan_tso); - desc->tx.paylen_ol4cs = cpu_to_le32(param.paylen_ol4cs); + desc->tx.paylen_fdop_ol4cs = cpu_to_le32(param.paylen_fdop_ol4cs); desc->tx.mss_hw_csum = cpu_to_le16(param.mss_hw_csum); desc->tx.vlan_tag = cpu_to_le16(param.inner_vtag); desc->tx.outer_vlan_tag = cpu_to_le16(param.out_vtag); @@ -5365,6 +5437,9 @@ static int hns3_client_init(struct hnae3_handle *handle)
hns3_state_init(handle);
+ if (test_bit(HNAE3_DEV_SUPPORT_QB_B, ae_dev->caps)) + set_bit(HNAE3_PFLAG_FD_QB_ENABLE, &handle->supported_pflags); + ret = register_netdev(netdev); if (ret) { dev_err(priv->dev, "probe register netdev fail!\n"); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 133a054af6b7..a44f26aae2a7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -179,6 +179,11 @@ enum hns3_nic_state { #define HNS3_TXD_DECTTL_S 12 #define HNS3_TXD_DECTTL_M (0xf << HNS3_TXD_DECTTL_S)
+#define HNS3_TXD_FD_ADD_B 1 +#define HNS3_TXD_FD_DEL_B 0 +#define HNS3_TXD_FD_OP_M GENMASK(21, 20) +#define HNS3_TXD_FD_OP_S 20 + #define HNS3_TXD_OL4CS_B 22
#define HNS3_TXD_MSS_S 0 @@ -214,6 +219,8 @@ enum hns3_nic_state { #define HNS3_CQ_MODE_EQE 1U #define HNS3_CQ_MODE_CQE 0U
+#define HNS3_FD_QB_FORCE_CNT_MAX 20 + enum hns3_pkt_l2t_type { HNS3_L2_TYPE_UNICAST, HNS3_L2_TYPE_MULTICAST, @@ -285,7 +292,7 @@ struct __packed hns3_desc { }; };
- __le32 paylen_ol4cs; + __le32 paylen_fdop_ol4cs; __le16 bdtp_fe_sc_vld_ra_ri; __le16 mss_hw_csum; } tx; @@ -398,6 +405,9 @@ enum hns3_pkt_ol4type { HNS3_OL4_TYPE_UNKNOWN };
+#define IP_VERSION_IPV4 0x4 +#define IP_VERSION_IPV6 0x6 + struct hns3_rx_ptype { u32 ptype : 8; u32 csum_level : 2; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 47799457439f..46b20650d09a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -467,6 +467,16 @@ static void hns3_update_limit_promisc_mode(struct net_device *netdev, hns3_request_update_promisc_mode(handle); }
+static void hns3_update_fd_qb_state(struct net_device *netdev, bool enable) +{ + struct hnae3_handle *handle = hns3_get_handle(netdev); + + if (!handle->ae_algo->ops->request_flush_qb_config) + return; + + handle->ae_algo->ops->request_flush_qb_config(handle); +} + static void hns3_update_state(struct net_device *netdev, enum hns3_nic_state state, bool enable) { @@ -485,7 +495,8 @@ static void hns3_update_push_state(struct net_device *netdev, bool enable)
static const struct hns3_pflag_desc hns3_priv_flags[HNAE3_PFLAG_MAX] = { { "limit_promisc", hns3_update_limit_promisc_mode }, - { "tx_push_enable", hns3_update_push_state } + { "tx_push_enable", hns3_update_push_state }, + { "qb_enable", hns3_update_fd_qb_state }, };
static int hns3_get_sset_count(struct net_device *netdev, int stringset) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 88110835e876..84cf0c9dd551 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -749,6 +749,12 @@ struct hclge_fd_ad_cnt_read_cmd { u8 rsv2[8]; };
+struct hclge_fd_qb_cfg_cmd { + u8 en; + u8 vf_id; + u8 rsv[22]; +}; + #define HCLGE_FD_USER_DEF_OFT_S 0 #define HCLGE_FD_USER_DEF_OFT_M GENMASK(14, 0) #define HCLGE_FD_USER_DEF_EN_B 15 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index b444dbb42c48..061a50209caa 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4561,6 +4561,95 @@ static void hclge_update_vport_alive(struct hclge_dev *hdev) } }
+static int hclge_set_fd_qb(struct hclge_dev *hdev, u8 vf_id, bool enable) +{ + struct hclge_fd_qb_cfg_cmd *req; + struct hclge_desc desc; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_FD_QB_CTRL, false); + req = (struct hclge_fd_qb_cfg_cmd *)desc.data; + req->en = enable; + req->vf_id = vf_id; + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to %s qb config for vport %u, ret = %d.\n", + enable ? "enable" : "disable", vf_id, ret); + return ret; +} + +static int hclge_sync_pf_qb_mode(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = &hdev->vport[0]; + struct hnae3_handle *handle = &vport->nic; + bool request_enable = true; + int ret; + + if (!test_and_clear_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state)) + return 0; + + spin_lock_bh(&hdev->fd_rule_lock); + if (hdev->fd_active_type == HCLGE_FD_EP_ACTIVE || + hdev->fd_active_type == HCLGE_FD_TC_FLOWER_ACTIVE || + !test_bit(HNAE3_PFLAG_FD_QB_ENABLE, &handle->priv_flags)) + request_enable = false; + + if (request_enable == + test_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state)) { + spin_unlock_bh(&hdev->fd_rule_lock); + return 0; + } + + if (request_enable) + hclge_clear_arfs_rules(hdev); + + ret = hclge_set_fd_qb(hdev, vport->vport_id, request_enable); + if (!ret) { + if (request_enable) { + set_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state); + hdev->fd_active_type = HCLGE_FD_QB_ACTIVE; + } else { + clear_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state); + hdev->fd_active_type = HCLGE_FD_RULE_NONE; + } + } else { + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); + } + spin_unlock_bh(&hdev->fd_rule_lock); + + return ret; +} + +static int hclge_disable_fd_qb_mode(struct hclge_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = hdev->ae_dev; + int ret; + + if (!test_bit(HNAE3_DEV_SUPPORT_QB_B, ae_dev->caps) || + !test_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state)) + return 0; + + ret = hclge_set_fd_qb(hdev, 0, false); + if (ret) + return ret; + + clear_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state); + + return 0; +} + +static void hclge_sync_fd_qb_mode(struct hclge_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = hdev->ae_dev; + + if (!test_bit(HNAE3_DEV_SUPPORT_QB_B, ae_dev->caps)) + return; + + hclge_sync_pf_qb_mode(hdev); +} + static void hclge_periodic_service_task(struct hclge_dev *hdev) { unsigned long delta = round_jiffies_relative(HZ); @@ -4574,6 +4663,7 @@ static void hclge_periodic_service_task(struct hclge_dev *hdev) hclge_update_link_status(hdev); hclge_sync_mac_table(hdev); hclge_sync_promisc_mode(hdev); + hclge_sync_fd_qb_mode(hdev); hclge_sync_fd_table(hdev);
if (time_is_after_jiffies(hdev->last_serv_processed + HZ)) { @@ -5097,10 +5187,29 @@ static void hclge_request_update_promisc_mode(struct hnae3_handle *handle) set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, &vport->state); }
+static bool hclge_query_fd_qb_state(struct hnae3_handle *handle) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + + return test_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state); +} + +static void hclge_flush_qb_config(struct hnae3_handle *handle) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); +} + static void hclge_sync_fd_state(struct hclge_dev *hdev) { - if (hlist_empty(&hdev->fd_rule_list)) + struct hclge_vport *vport = &hdev->vport[0]; + + if (hlist_empty(&hdev->fd_rule_list)) { hdev->fd_active_type = HCLGE_FD_RULE_NONE; + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); + } }
static void hclge_fd_inc_rule_cnt(struct hclge_dev *hdev, u16 location) @@ -6546,6 +6655,10 @@ static int hclge_add_fd_entry_common(struct hclge_dev *hdev, { int ret;
+ ret = hclge_disable_fd_qb_mode(hdev); + if (ret) + return ret; + spin_lock_bh(&hdev->fd_rule_lock);
if (hdev->fd_active_type != rule->rule_type && @@ -8269,6 +8382,7 @@ int hclge_vport_start(struct hclge_vport *vport)
set_bit(HCLGE_VPORT_STATE_INITED, &vport->state); set_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, &vport->state); vport->last_active_jiffies = jiffies; vport->need_notify = 0; @@ -10486,6 +10600,7 @@ static void hclge_restore_hw_table(struct hclge_dev *hdev) hclge_restore_vport_port_base_vlan_config(hdev); hclge_restore_vport_vlan_table(vport); set_bit(HCLGE_STATE_FD_USER_DEF_CHANGED, &hdev->state); + clear_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state); hclge_restore_fd_entries(handle); }
@@ -13333,6 +13448,8 @@ static const struct hnae3_ae_ops hclge_ops = { .put_vector = hclge_put_vector, .set_promisc_mode = hclge_set_promisc_mode, .request_update_promisc_mode = hclge_request_update_promisc_mode, + .request_flush_qb_config = hclge_flush_qb_config, + .query_fd_qb_state = hclge_query_fd_qb_state, .set_loopback = hclge_set_loopback, .start = hclge_ae_start, .stop = hclge_ae_stop, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 0e3c3382a5eb..41ebf71b8258 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -215,6 +215,7 @@ enum HCLGE_DEV_STATE { HCLGE_STATE_FD_TBL_CHANGED, HCLGE_STATE_FD_CLEAR_ALL, HCLGE_STATE_FD_USER_DEF_CHANGED, + HCLGE_STATE_HW_QB_ENABLE, HCLGE_STATE_PTP_EN, HCLGE_STATE_PTP_TX_HANDLING, HCLGE_STATE_MAX @@ -611,6 +612,7 @@ enum HCLGE_FD_ACTIVE_RULE_TYPE { HCLGE_FD_ARFS_ACTIVE, HCLGE_FD_EP_ACTIVE, HCLGE_FD_TC_FLOWER_ACTIVE, + HCLGE_FD_QB_ACTIVE, };
enum HCLGE_FD_PACKET_TYPE { @@ -975,6 +977,7 @@ struct hclge_rx_vtag_cfg { enum HCLGE_VPORT_STATE { HCLGE_VPORT_STATE_ALIVE, HCLGE_VPORT_STATE_MAC_TBL_CHANGE, + HCLGE_VPORT_STATE_QB_CHANGE, HCLGE_VPORT_STATE_PROMISC_CHANGE, HCLGE_VPORT_STATE_VLAN_FLTR_CHANGE, HCLGE_VPORT_STATE_INITED,
From: Jian Shen shenjian15@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62HX2
----------------------------------------------------------------------
For device version V3, the hardware supports queue bonding mode. A VF cannot enable queue bonding mode unless the PF enables it. So the VF needs to query whether the PF supports queue bonding mode when initializing, and periodically query whether the PF has enabled it. Because the FD rule resource is limited, to avoid a VF occupying too much FD rule space, only a trusted VF is allowed to enable queue bonding mode.
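The effective per-VF decision reduces to a small predicate, sketched below with assumed field names (not the driver structures): queue bonding is only switched on for a VF that is trusted, has requested it over the mailbox, and whose PF already has the feature enabled in hardware.

struct vf_qb_sketch {
	int trusted;		/* PF marked this VF as trusted */
	int requested_qb;	/* VF asked for qb via mailbox */
	int pf_hw_qb_on;	/* PF has qb enabled in hardware */
};

static int vf_qb_should_enable(const struct vf_qb_sketch *vf)
{
	return vf->trusted && vf->requested_qb && vf->pf_hw_qb_on;
}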
Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Jian Shen shenjian15@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hclge_mbx.h | 8 ++ .../hisilicon/hns3/hns3pf/hclge_main.c | 51 ++++++++++++- .../hisilicon/hns3/hns3pf/hclge_main.h | 2 + .../hisilicon/hns3/hns3pf/hclge_mbx.c | 37 ++++++++++ .../hisilicon/hns3/hns3vf/hclgevf_main.c | 74 +++++++++++++++++++ .../hisilicon/hns3/hns3vf/hclgevf_main.h | 6 ++ .../hisilicon/hns3/hns3vf/hclgevf_mbx.c | 17 +++++ 7 files changed, 194 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h index abcd7877f7d2..0de9b83c9d4e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h +++ b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h @@ -47,6 +47,8 @@ enum HCLGE_MBX_OPCODE { HCLGE_MBX_VF_UNINIT, /* (VF -> PF) vf is unintializing */ HCLGE_MBX_HANDLE_VF_TBL, /* (VF -> PF) store/clear hw table */ HCLGE_MBX_GET_RING_VECTOR_MAP, /* (VF -> PF) get ring-to-vector map */ + HCLGE_MBX_SET_QB = 0x28, /* (VF -> PF) set queue bonding */ + HCLGE_MBX_PUSH_QB_STATE, /* (PF -> VF) push qb state */
HCLGE_MBX_GET_VF_FLR_STATUS = 200, /* (M7 -> PF) get vf flr status */ HCLGE_MBX_PUSH_LINK_STATUS, /* (M7 -> PF) get port link status */ @@ -77,6 +79,12 @@ enum hclge_mbx_tbl_cfg_subcode { HCLGE_MBX_VPORT_LIST_CLEAR, };
+enum hclge_mbx_qb_cfg_subcode { + HCLGE_MBX_QB_CHECK_CAPS = 0, /* query whether support qb */ + HCLGE_MBX_QB_ENABLE, /* request pf enable qb */ + HCLGE_MBX_QB_GET_STATE /* query whether qb enabled */ +}; + #define HCLGE_MBX_MAX_MSG_SIZE 14 #define HCLGE_MBX_MAX_RESP_DATA_SIZE 8U #define HCLGE_MBX_MAX_RING_CHAIN_PARAM_NUM 4 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 061a50209caa..fa15471737fe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4586,6 +4586,7 @@ static int hclge_sync_pf_qb_mode(struct hclge_dev *hdev) struct hnae3_handle *handle = &vport->nic; bool request_enable = true; int ret; + u16 i;
if (!test_and_clear_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state)) return 0; @@ -4614,6 +4615,11 @@ static int hclge_sync_pf_qb_mode(struct hclge_dev *hdev) clear_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state); hdev->fd_active_type = HCLGE_FD_RULE_NONE; } + + for (i = 1; i < hdev->num_alloc_vport; i++) { + vport = &hdev->vport[i]; + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); + } } else { set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); } @@ -4622,10 +4628,33 @@ static int hclge_sync_pf_qb_mode(struct hclge_dev *hdev) return ret; }
+static int hclge_sync_vf_qb_mode(struct hclge_vport *vport) +{ + struct hclge_dev *hdev = vport->back; + bool request_enable = false; + int ret; + + if (!test_and_clear_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state)) + return 0; + + if (vport->vf_info.trusted && vport->vf_info.request_qb_en && + test_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state)) + request_enable = true; + + ret = hclge_set_fd_qb(hdev, vport->vport_id, request_enable); + if (ret) + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); + vport->vf_info.qb_en = request_enable ? 1 : 0; + + return ret; +} + static int hclge_disable_fd_qb_mode(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = hdev->ae_dev; + struct hclge_vport *vport; int ret; + u16 i;
if (!test_bit(HNAE3_DEV_SUPPORT_QB_B, ae_dev->caps) || !test_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state)) @@ -4637,17 +4666,35 @@ static int hclge_disable_fd_qb_mode(struct hclge_dev *hdev)
clear_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state);
+ for (i = 1; i < hdev->num_alloc_vport; i++) { + vport = &hdev->vport[i]; + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); + } + return 0; }
static void hclge_sync_fd_qb_mode(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = hdev->ae_dev; + struct hclge_vport *vport; + int ret; + u16 i;
if (!test_bit(HNAE3_DEV_SUPPORT_QB_B, ae_dev->caps)) return;
- hclge_sync_pf_qb_mode(hdev); + ret = hclge_sync_pf_qb_mode(hdev); + if (ret) + return; + + for (i = 1; i < hdev->num_alloc_vport; i++) { + vport = &hdev->vport[i]; + + ret = hclge_sync_vf_qb_mode(vport); + if (ret) + return; + } }
static void hclge_periodic_service_task(struct hclge_dev *hdev) @@ -12269,6 +12316,8 @@ static int hclge_set_vf_trust(struct hnae3_handle *handle, int vf, bool enable) return 0;
vport->vf_info.trusted = new_trusted; + + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, &vport->state); hclge_task_schedule(hdev, 0);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 41ebf71b8258..1399f519b5f5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -1008,6 +1008,8 @@ struct hclge_vf_info { u32 spoofchk; u32 max_tx_rate; u32 trusted; + u8 request_qb_en; + u8 qb_en; u8 request_uc_en; u8 request_mc_en; u8 request_bc_en; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 04ff9bf12185..47ca3ce63dfb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -831,6 +831,36 @@ static void hclge_handle_vf_tbl(struct hclge_vport *vport, } }
+static void hclge_handle_vf_qb(struct hclge_vport *vport, + struct hclge_mbx_vf_to_pf_cmd *mbx_req, + struct hclge_respond_to_vf_msg *resp_msg) +{ + struct hclge_dev *hdev = vport->back; + + if (mbx_req->msg.subcode == HCLGE_MBX_QB_CHECK_CAPS) { + struct hnae3_handle *handle = &hdev->vport[0].nic; + + resp_msg->data[0] = test_bit(HNAE3_PFLAG_FD_QB_ENABLE, + &handle->supported_pflags); + resp_msg->len = sizeof(u8); + } else if (mbx_req->msg.subcode == HCLGE_MBX_QB_ENABLE) { + vport->vf_info.request_qb_en = mbx_req->msg.data[0]; + set_bit(HCLGE_VPORT_STATE_QB_CHANGE, &vport->state); + } else if (mbx_req->msg.subcode == HCLGE_MBX_QB_GET_STATE) { + u16 msg_data = vport->vf_info.qb_en; + int ret; + + ret = hclge_send_mbx_msg(vport, (u8 *)&msg_data, + sizeof(msg_data), + HCLGE_MBX_PUSH_QB_STATE, + vport->vport_id); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to inform qb state to vport %u, ret = %d\n", + vport->vport_id, ret); + } +} + static int hclge_mbx_map_ring_to_vector_handler(struct hclge_mbx_ops_param *param) { @@ -1040,6 +1070,12 @@ static int hclge_mbx_handle_vf_tbl_handler(struct hclge_mbx_ops_param *param) return 0; }
+static int hclge_mbx_handle_vf_qb_handler(struct hclge_mbx_ops_param *param) +{ + hclge_handle_vf_qb(param->vport, param->req, param->resp_msg); + return 0; +} + static const hclge_mbx_ops_fn hclge_mbx_ops_list[HCLGE_MBX_OPCODE_MAX] = { [HCLGE_MBX_RESET] = hclge_mbx_reset_handler, [HCLGE_MBX_SET_UNICAST] = hclge_mbx_set_unicast_handler, @@ -1064,6 +1100,7 @@ static const hclge_mbx_ops_fn hclge_mbx_ops_list[HCLGE_MBX_OPCODE_MAX] = { [HCLGE_MBX_VF_UNINIT] = hclge_mbx_vf_uninit_handler, [HCLGE_MBX_HANDLE_VF_TBL] = hclge_mbx_handle_vf_tbl_handler, [HCLGE_MBX_GET_RING_VECTOR_MAP] = hclge_mbx_get_ring_vector_map_handler, + [HCLGE_MBX_SET_QB] = hclge_mbx_handle_vf_qb_handler, [HCLGE_MBX_GET_VF_FLR_STATUS] = hclge_mbx_get_vf_flr_status_handler, [HCLGE_MBX_PUSH_LINK_STATUS] = hclge_mbx_push_link_status_handler, [HCLGE_MBX_NCSI_ERROR] = hclge_mbx_ncsi_error_handler, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index e40535eea363..789be62a105c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -406,6 +406,74 @@ static int hclgevf_knic_setup(struct hclgevf_dev *hdev) return 0; }
+static void hclgevf_update_fd_qb_state(struct hclgevf_dev *hdev) +{ + struct hnae3_handle *handle = &hdev->nic; + struct hclge_vf_to_pf_msg send_msg; + int ret; + + if (!hdev->qb_cfg.pf_support_qb || + !test_bit(HNAE3_PFLAG_FD_QB_ENABLE, &handle->priv_flags)) + return; + + hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_QB, + HCLGE_MBX_QB_GET_STATE); + ret = hclgevf_send_mbx_msg(hdev, &send_msg, false, NULL, 0); + if (ret) + dev_err(&hdev->pdev->dev, "failed to get qb state, ret = %d", + ret); +} + +static void hclgevf_get_pf_qb_caps(struct hclgevf_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + struct hclge_vf_to_pf_msg send_msg; + u8 resp_msg; + int ret; + + if (!test_bit(HNAE3_DEV_SUPPORT_QB_B, ae_dev->caps)) + return; + + hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_QB, + HCLGE_MBX_QB_CHECK_CAPS); + ret = hclgevf_send_mbx_msg(hdev, &send_msg, true, &resp_msg, + sizeof(resp_msg)); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get qb caps from PF, ret = %d", ret); + return; + } + + hdev->qb_cfg.pf_support_qb = resp_msg > 0; +} + +static void hclgevf_set_fd_qb(struct hnae3_handle *handle) +{ +#define HCLGEVF_QB_MBX_STATE_OFFSET 0 + + struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); + struct hclge_vf_to_pf_msg send_msg; + u8 resp_msg; + int ret; + + hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_QB, + HCLGE_MBX_QB_ENABLE); + send_msg.data[HCLGEVF_QB_MBX_STATE_OFFSET] = + test_bit(HNAE3_PFLAG_FD_QB_ENABLE, &handle->priv_flags) ? 1 : 0; + ret = hclgevf_send_mbx_msg(hdev, &send_msg, true, &resp_msg, + sizeof(resp_msg)); + if (ret) + dev_err(&hdev->pdev->dev, "failed to set qb state, ret = %d", + ret); +} + +static bool hclgevf_query_fd_qb_state(struct hnae3_handle *handle) +{ + struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); + + return hdev->qb_cfg.hw_qb_en; +} + static void hclgevf_request_link_info(struct hclgevf_dev *hdev) { struct hclge_vf_to_pf_msg send_msg; @@ -1944,6 +2012,8 @@ static void hclgevf_periodic_service_task(struct hclgevf_dev *hdev)
hclgevf_sync_promisc_mode(hdev);
+ hclgevf_update_fd_qb_state(hdev); + hdev->last_serv_processed = jiffies;
out: @@ -2973,6 +3043,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) goto err_config; }
+ hclgevf_get_pf_qb_caps(hdev); + hclgevf_init_rxd_adv_layout(hdev);
set_bit(HCLGEVF_STATE_SERVICE_INITED, &hdev->state); @@ -3422,6 +3494,8 @@ static const struct hnae3_ae_ops hclgevf_ops = { .set_promisc_mode = hclgevf_set_promisc_mode, .request_update_promisc_mode = hclgevf_request_update_promisc_mode, .get_cmdq_stat = hclgevf_get_cmdq_stat, + .request_flush_qb_config = hclgevf_set_fd_qb, + .query_fd_qb_state = hclgevf_query_fd_qb_state, };
static struct hnae3_ae_algo ae_algovf = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 59ca6c794d6d..4568336d73f9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -206,6 +206,11 @@ struct hclgevf_mac_table_cfg { struct list_head mc_mac_list; };
+struct hclgevf_qb_cfg { + bool pf_support_qb; + bool hw_qb_en; +}; + struct hclgevf_dev { struct pci_dev *pdev; struct hnae3_ae_dev *ae_dev; @@ -273,6 +278,7 @@ struct hclgevf_dev { unsigned long serv_processed_cnt; unsigned long last_serv_processed;
+ struct hclgevf_qb_cfg qb_cfg; struct devlink *devlink; };
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c index bbf7b14079de..2c7c5e7d4fe7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c @@ -264,6 +264,7 @@ void hclgevf_mbx_handler(struct hclgevf_dev *hdev) case HCLGE_MBX_LINK_STAT_MODE: case HCLGE_MBX_PUSH_VLAN_INFO: case HCLGE_MBX_PUSH_PROMISC_INFO: + case HCLGE_MBX_PUSH_QB_STATE: hclgevf_handle_mbx_msg(hdev, req); break; default: @@ -289,6 +290,19 @@ static void hclgevf_parse_promisc_info(struct hclgevf_dev *hdev, "Promisc mode is closed by host for being untrusted.\n"); }
+static void hclgevf_parse_qb_info(struct hclgevf_dev *hdev, u16 qb_state) +{ +#define HCLGEVF_HW_QB_ON 1 +#define HCLGEVF_HW_QB_OFF 0 + + if (qb_state > HCLGEVF_HW_QB_ON) { + dev_warn(&hdev->pdev->dev, "Invalid state, ignored.\n"); + return; + } + + hdev->qb_cfg.hw_qb_en = qb_state > HCLGEVF_HW_QB_OFF; +} + void hclgevf_mbx_async_handler(struct hclgevf_dev *hdev) { struct hclge_mbx_port_base_vlan *vlan_info; @@ -367,6 +381,9 @@ void hclgevf_mbx_async_handler(struct hclgevf_dev *hdev) case HCLGE_MBX_PUSH_PROMISC_INFO: hclgevf_parse_promisc_info(hdev, le16_to_cpu(msg_q[1])); break; + case HCLGE_MBX_PUSH_QB_STATE: + hclgevf_parse_qb_info(hdev, msg_q[1]); + break; default: dev_err(&hdev->pdev->dev, "fetched unsupported(%u) message from arq\n",
From: Jian Shen shenjian15@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62HX2
----------------------------------------------------------------------
Since the queue bonding fd rules are created by hardware automatically, the driver needs to specify the fd counter for each function, so that it is possible to query how many times the queue bonding fd rules have been hit.
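The counter assignment amounts to mapping each function onto one of the available stage-1 counters; a sketch (invented names) of the modulo assignment used when there are fewer counters than functions:

static unsigned int qb_counter_for_func(unsigned int func_id,
					unsigned int counter_num)
{
	if (!counter_num)
		counter_num = 1;	/* always assume at least one counter */
	return func_id % counter_num;
}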
Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Jian Shen shenjian15@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_cmd.h | 12 ++++++ .../hisilicon/hns3/hns3pf/hclge_main.c | 39 +++++++++++++++++++ 2 files changed, 51 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 84cf0c9dd551..eee9a1082894 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -755,6 +755,18 @@ struct hclge_fd_qb_cfg_cmd { u8 rsv[22]; };
+#define HCLGE_FD_QB_AD_RULE_ID_VLD_B 0 +#define HCLGE_FD_QB_AD_COUNTER_VLD_B 1 +struct hclge_fd_qb_ad_cmd { + u8 vf_id; + u8 rsv1; + u8 ad_sel; + u8 rsv2; + __le16 hit_rule_id; + u8 counter_id; + u8 rsv3[17]; +}; + #define HCLGE_FD_USER_DEF_OFT_S 0 #define HCLGE_FD_USER_DEF_OFT_M GENMASK(14, 0) #define HCLGE_FD_USER_DEF_EN_B 15 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index fa15471737fe..afac6036a748 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4561,6 +4561,40 @@ static void hclge_update_vport_alive(struct hclge_dev *hdev) } }
+static int hclge_set_fd_qb_counter(struct hclge_dev *hdev, u8 vf_id) +{ + struct hclge_fd_qb_ad_cmd *req; + struct hclge_desc desc; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_FD_QB_AD_OP, false); + req = (struct hclge_fd_qb_ad_cmd *)desc.data; + req->vf_id = vf_id; + hnae3_set_bit(req->ad_sel, HCLGE_FD_QB_AD_COUNTER_VLD_B, 1); + req->counter_id = vf_id % hdev->fd_cfg.cnt_num[HCLGE_FD_STAGE_1]; + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) + dev_warn(&hdev->pdev->dev, + "failed to set qb counter for vport %u, ret = %d.\n", + vf_id, ret); + return ret; +} + +static void hclge_init_fd_qb_counter(struct hclge_dev *hdev) +{ + int ret; + u16 i; + + if (!test_bit(HNAE3_DEV_SUPPORT_QB_B, hdev->ae_dev->caps)) + return; + + for (i = 0; i < hdev->num_alloc_vport; i++) { + ret = hclge_set_fd_qb_counter(hdev, i); + if (ret) + return; + } +} + static int hclge_set_fd_qb(struct hclge_dev *hdev, u8 vf_id, bool enable) { struct hclge_fd_qb_cfg_cmd *req; @@ -5682,6 +5716,11 @@ static int hclge_init_fd_config(struct hclge_dev *hdev) if (ret) return ret;
+ if (!hdev->fd_cfg.cnt_num[HCLGE_FD_STAGE_1]) + hdev->fd_cfg.cnt_num[HCLGE_FD_STAGE_1] = 1; + + hclge_init_fd_qb_counter(hdev); + return hclge_set_fd_key_config(hdev, HCLGE_FD_STAGE_1); }
From: Hao Chen chenhao418@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62HX2
----------------------------------------------------------------------
When the serdes lanes support both 25Gb/s and 50Gb/s speeds and the user wants to set the port speed to 50Gb/s, it can be configured as one 50Gb/s serdes lane or two 25Gb/s serdes lanes.
So this patch adds support for querying and setting the lane number via sysfs to satisfy this scenario.
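The input validation in the new sysfs store handler boils down to: parse a decimal value, accept only 1, 2, 4 or 8, and skip the hardware write when autoneg is active or the value is unchanged. Below is a standalone sketch of just the parsing step (not the driver function itself).

#include <errno.h>
#include <stdlib.h>

/* Returns 0 and fills *lane on success, -EINVAL on bad input. */
static int parse_lane_num(const char *buf, unsigned char *lane)
{
	char *end;
	long val = strtol(buf, &end, 10);

	if (end == buf || (*end != '\0' && *end != '\n'))
		return -EINVAL;
	if (val <= 0 || val > 8 || (val & (val - 1)))
		return -EINVAL;	/* must be a power of two, max 8 */

	*lane = (unsigned char)val;
	return 0;
}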
Signed-off-by: Hao Chen chenhao418@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Jian Shen shenjian15@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/Makefile | 2 +- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 4 + .../hns3/hns3_common/hclge_comm_cmd.c | 1 + .../hns3/hns3_common/hclge_comm_cmd.h | 1 + .../hisilicon/hns3/hns3pf/hclge_cmd.h | 7 +- .../hisilicon/hns3/hns3pf/hclge_main.c | 24 +++-- .../hisilicon/hns3/hns3pf/hclge_main.h | 5 + .../hisilicon/hns3/hns3pf/hclge_sysfs.c | 91 +++++++++++++++++++ 8 files changed, 126 insertions(+), 9 deletions(-) create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_sysfs.c
diff --git a/drivers/net/ethernet/hisilicon/hns3/Makefile b/drivers/net/ethernet/hisilicon/hns3/Makefile index 6efea4662858..0bb3a28f6f8f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/Makefile +++ b/drivers/net/ethernet/hisilicon/hns3/Makefile @@ -21,7 +21,7 @@ hclgevf-objs = hns3vf/hclgevf_main.o hns3vf/hclgevf_mbx.o hns3vf/hclgevf_devlin hns3_common/hclge_comm_cmd.o hns3_common/hclge_comm_rss.o hns3_common/hclge_comm_tqp_stats.o
obj-$(CONFIG_HNS3_HCLGE) += hclge.o -hclge-objs = hns3pf/hclge_main.o hns3pf/hclge_mdio.o hns3pf/hclge_tm.o \ +hclge-objs = hns3pf/hclge_main.o hns3pf/hclge_mdio.o hns3pf/hclge_tm.o hns3pf/hclge_sysfs.o \ hns3pf/hclge_mbx.o hns3pf/hclge_err.o hns3pf/hclge_debugfs.o hns3pf/hclge_ptp.o hns3pf/hclge_devlink.o \ hns3_common/hclge_comm_cmd.o hns3_common/hclge_comm_rss.o hns3_common/hclge_comm_tqp_stats.o
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 0b1d4b72ac14..779b32bd646d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -100,6 +100,7 @@ enum HNAE3_DEV_CAP_BITS { HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, HNAE3_DEV_SUPPORT_MC_MAC_MNG_B, HNAE3_DEV_SUPPORT_CQ_B, + HNAE3_DEV_SUPPORT_LANE_NUM_B, };
#define hnae3_ae_dev_fd_supported(ae_dev) \ @@ -162,6 +163,9 @@ enum HNAE3_DEV_CAP_BITS { #define hnae3_ae_dev_cq_supported(ae_dev) \ test_bit(HNAE3_DEV_SUPPORT_CQ_B, (ae_dev)->caps)
+#define hnae3_ae_dev_lane_num_supported(ae_dev) \ + test_bit(HNAE3_DEV_SUPPORT_LANE_NUM_B, (ae_dev)->caps) + enum HNAE3_PF_CAP_BITS { HNAE3_PF_SUPPORT_VLAN_FLTR_MDF_B = 0, }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c index f9bd3fc969c5..f1251890ef14 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c @@ -153,6 +153,7 @@ static const struct hclge_comm_caps_bit_map hclge_pf_cmd_caps[] = { {HCLGE_COMM_CAP_CQ_B, HNAE3_DEV_SUPPORT_CQ_B}, {HCLGE_COMM_CAP_GRO_B, HNAE3_DEV_SUPPORT_GRO_B}, {HCLGE_COMM_CAP_FD_B, HNAE3_DEV_SUPPORT_FD_B}, + {HCLGE_COMM_CAP_LANE_NUM_B, HNAE3_DEV_SUPPORT_LANE_NUM_B}, };
static const struct hclge_comm_caps_bit_map hclge_vf_cmd_caps[] = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h index 8aaa5fdfa2f6..ec1cb010d0ac 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h @@ -342,6 +342,7 @@ enum HCLGE_COMM_CAP_BITS { HCLGE_COMM_CAP_CQ_B = 18, HCLGE_COMM_CAP_GRO_B = 20, HCLGE_COMM_CAP_FD_B = 21, + HCLGE_COMM_CAP_LANE_NUM_B = 27, };
enum HCLGE_COMM_API_CAP_BITS { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index eee9a1082894..ce0e72862257 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -322,7 +322,9 @@ struct hclge_config_mac_speed_dup_cmd {
#define HCLGE_CFG_MAC_SPEED_CHANGE_EN_B 0 u8 mac_change_fec_en; - u8 rsv[22]; + u8 rsv[4]; + u8 lane_num; + u8 rsv1[17]; };
#define HCLGE_TQP_ENABLE_B 0 @@ -349,7 +351,8 @@ struct hclge_sfp_info_cmd { __le32 speed_ability; /* speed ability for current media */ __le32 module_type; u8 fec_ability; - u8 rsv[7]; + u8 lane_num; + u8 rsv[6]; };
#define HCLGE_MAC_CFG_FEC_AUTO_EN_B 0 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index afac6036a748..a10a3a746742 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2663,8 +2663,8 @@ static int hclge_convert_to_fw_speed(u32 speed_drv, u32 *speed_fw) return -EINVAL; }
-static int hclge_cfg_mac_speed_dup_hw(struct hclge_dev *hdev, int speed, - u8 duplex) +int hclge_cfg_mac_speed_dup_hw(struct hclge_dev *hdev, int speed, + u8 duplex, u8 lane_num) { struct hclge_config_mac_speed_dup_cmd *req; struct hclge_desc desc; @@ -2688,6 +2688,7 @@ static int hclge_cfg_mac_speed_dup_hw(struct hclge_dev *hdev, int speed, speed_fw); hnae3_set_bit(req->mac_change_fec_en, HCLGE_CFG_MAC_SPEED_CHANGE_EN_B, 1); + req->lane_num = lane_num;
ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { @@ -2709,7 +2710,7 @@ int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex) mac->duplex == duplex) return 0;
- ret = hclge_cfg_mac_speed_dup_hw(hdev, speed, duplex); + ret = hclge_cfg_mac_speed_dup_hw(hdev, speed, duplex, 0); if (ret) return ret;
@@ -2875,7 +2876,8 @@ static int hclge_mac_init(struct hclge_dev *hdev) hdev->support_sfp_query = true; hdev->hw.mac.duplex = HCLGE_MAC_FULL; ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.speed, - hdev->hw.mac.duplex); + hdev->hw.mac.duplex, + hdev->hw.mac.lane_num); if (ret) return ret;
@@ -3200,6 +3202,7 @@ static int hclge_get_sfp_info(struct hclge_dev *hdev, struct hclge_mac *mac) mac->autoneg = resp->autoneg; mac->support_autoneg = resp->autoneg_ability; mac->speed_type = QUERY_ACTIVE_SPEED; + mac->lane_num = resp->lane_num; if (!resp->active_fec) mac->fec_mode = 0; else @@ -12173,13 +12176,19 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) goto err_mdiobus_unreg; }
+ ret = hclge_register_sysfs(hdev); + if (ret) { + dev_err(&pdev->dev, "failed to register sysfs, ret = %d\n", ret); + goto err_mdiobus_unreg; + } + ret = hclge_ptp_init(hdev); if (ret) - goto err_mdiobus_unreg; + goto err_sysfs_unregister;
ret = hclge_update_port_info(hdev); if (ret) - goto err_mdiobus_unreg; + goto err_sysfs_unregister;
INIT_KFIFO(hdev->mac_tnl_log);
@@ -12224,6 +12233,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
return 0;
+err_sysfs_unregister: + hclge_unregister_sysfs(hdev); err_mdiobus_unreg: if (hdev->hw.mac.phydev) mdiobus_unregister(hdev->hw.mac.mdio_bus); @@ -12605,6 +12616,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) struct hclge_dev *hdev = ae_dev->priv; struct hclge_mac *mac = &hdev->hw.mac;
+ hclge_unregister_sysfs(hdev); hclge_reset_vf_rate(hdev); hclge_clear_vf_vlan(hdev); hclge_state_uninit(hdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 1399f519b5f5..e5d786adafc3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -260,6 +260,7 @@ struct hclge_mac { u8 duplex; u8 support_autoneg; u8 speed_type; /* 0: sfp speed, 1: active speed */ + u8 lane_num; u32 speed; u32 max_speed; u32 speed_ability; /* speed ability supported by current media */ @@ -1143,4 +1144,8 @@ int hclge_check_mac_addr_valid(struct hclge_dev *hdev, u8 vf, int hclge_push_vf_link_status(struct hclge_vport *vport); int hclge_enable_vport_vlan_filter(struct hclge_vport *vport, bool request_en); int hclge_mac_update_stats(struct hclge_dev *hdev); +int hclge_register_sysfs(struct hclge_dev *hdev); +void hclge_unregister_sysfs(struct hclge_dev *hdev); +int hclge_cfg_mac_speed_dup_hw(struct hclge_dev *hdev, int speed, u8 duplex, + u8 lane_num); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_sysfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_sysfs.c new file mode 100644 index 000000000000..b7cc89c3f6d8 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_sysfs.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + +#include "hnae3.h" +#include "hclge_main.h" + +static ssize_t lane_num_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); + struct hclge_dev *hdev = ae_dev->priv; + + return scnprintf(buf, PAGE_SIZE, "%u\n", hdev->hw.mac.lane_num); +} + +static ssize_t lane_num_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ +#define HCLGE_CONVERSION_NUM 10 /* Convert string to decimal number */ + + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); + struct hclge_dev *hdev = ae_dev->priv; + u8 lane_num, duplex; + u32 speed; + int ret; + + ret = kstrtou8(buf, HCLGE_CONVERSION_NUM, &lane_num); + if (ret) { + dev_err(dev, "input params of lane number format unmatch.\n"); + return -EINVAL; + } + + if (!lane_num || lane_num > 8 || !is_power_of_2(lane_num)) { + dev_err(dev, "lane number only supports setting 1, 2, 4, 8.\n"); + return -EINVAL; + } + + rtnl_lock(); + + if (hdev->hw.mac.support_autoneg && hdev->hw.mac.autoneg) { + ret = count; + goto out; + } + + if (lane_num == hdev->hw.mac.lane_num) { + dev_info(dev, "setting lane number not changed.\n"); + ret = count; + goto out; + } + + speed = hdev->hw.mac.speed; + duplex = hdev->hw.mac.duplex; + + ret = hclge_cfg_mac_speed_dup_hw(hdev, speed, duplex, lane_num); + if (!ret) + ret = count; + +out: + rtnl_unlock(); + return ret; +} + +static DEVICE_ATTR_RW(lane_num); + +static const struct device_attribute *hclge_hw_attrs_list[] = { + &dev_attr_lane_num, +}; + +int hclge_register_sysfs(struct hclge_dev *hdev) +{ + int ret; + + if (!hnae3_ae_dev_lane_num_supported(hdev->ae_dev)) + return 0; + + ret = device_create_file(&hdev->pdev->dev, hclge_hw_attrs_list[0]); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to create node %s, ret = %d.\n", + hclge_hw_attrs_list[0]->attr.name, ret); + + return ret; +} + +void 
hclge_unregister_sysfs(struct hclge_dev *hdev) +{ + device_remove_file(&hdev->pdev->dev, hclge_hw_attrs_list[0]); +}
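As a usage sketch: the new attribute is created directly under the PF's PCI device directory in sysfs, so once this patch is applied the lane number can be read back and, when autoneg is off, set to one of 1, 2, 4 or 8. The PCI address 0000:bd:00.0 below is only a placeholder for the actual PF:

  cat /sys/bus/pci/devices/0000:bd:00.0/lane_num
  echo 2 > /sys/bus/pci/devices/0000:bd:00.0/lane_num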
From: liaoguojia liaoguojia@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62HX2
----------------------------------------------------------------------
On HNAE3_DEVICE_VERSION_V2, the FD tcam table entries are dumped by traversing the rule list recorded by the driver.

On HNAE3_DEVICE_VERSION_V3, a new FD usage mode called Queue bond (QB) mode is supported. In this mode, the hardware creates rules automatically and the driver does not record the flow table entries.

So, to dump the QB tcam table, we need to traverse all hardware entries and check the validity of each entry.
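As a usage sketch (the exact debugfs layout depends on the kernel version, and the PCI address below is only a placeholder), the dump is reached through the existing FD tcam debugfs entry, which after this patch covers both the driver-recorded rules and the hardware-created QB rules:

  cat /sys/kernel/debug/hns3/0000:bd:00.0/fd/fd_tcam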
Signed-off-by: liaoguojia liaoguojia@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Jian Shen shenjian15@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_debugfs.c | 113 +++++++++++++++--- .../hisilicon/hns3/hns3pf/hclge_debugfs.h | 5 - 2 files changed, 98 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 7051ea085569..0b7e8b4c7571 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -1506,8 +1506,7 @@ static int hclge_dbg_dump_mng_table(struct hclge_dev *hdev, char *buf, int len) #define HCLGE_DBG_TCAM_BUF_SIZE 256
static int hclge_dbg_fd_tcam_read(struct hclge_dev *hdev, bool sel_x, - char *tcam_buf, - struct hclge_dbg_tcam_msg tcam_msg) + char *tcam_buf, u8 stage, u32 loc) { struct hclge_fd_tcam_config_1_cmd *req1; struct hclge_fd_tcam_config_2_cmd *req2; @@ -1527,9 +1526,9 @@ static int hclge_dbg_fd_tcam_read(struct hclge_dev *hdev, bool sel_x, req2 = (struct hclge_fd_tcam_config_2_cmd *)desc[1].data; req3 = (struct hclge_fd_tcam_config_3_cmd *)desc[2].data;
- req1->stage = tcam_msg.stage; + req1->stage = stage; req1->xy_sel = sel_x ? 1 : 0; - req1->index = cpu_to_le32(tcam_msg.loc); + req1->index = cpu_to_le32(loc);
ret = hclge_cmd_send(&hdev->hw, desc, 3); if (ret) @@ -1537,7 +1536,7 @@ static int hclge_dbg_fd_tcam_read(struct hclge_dev *hdev, bool sel_x,
pos += scnprintf(tcam_buf + pos, HCLGE_DBG_TCAM_BUF_SIZE - pos, "read result tcam key %s(%u):\n", sel_x ? "x" : "y", - tcam_msg.loc); + loc);
/* tcam_data0 ~ tcam_data1 */ req = (u32 *)req1->tcam_data; @@ -1582,7 +1581,6 @@ static int hclge_dbg_get_rules_location(struct hclge_dev *hdev, u16 *rule_locs) static int hclge_dbg_dump_fd_tcam(struct hclge_dev *hdev, char *buf, int len) { u32 rule_num = hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1]; - struct hclge_dbg_tcam_msg tcam_msg; int i, ret, rule_cnt; u16 *rule_locs; char *tcam_buf; @@ -1617,10 +1615,7 @@ static int hclge_dbg_dump_fd_tcam(struct hclge_dev *hdev, char *buf, int len)
ret = 0; for (i = 0; i < rule_cnt; i++) { - tcam_msg.stage = HCLGE_FD_STAGE_1; - tcam_msg.loc = rule_locs[i]; - - ret = hclge_dbg_fd_tcam_read(hdev, true, tcam_buf, tcam_msg); + ret = hclge_dbg_fd_tcam_read(hdev, true, tcam_buf, HCLGE_FD_STAGE_1, rule_locs[i]); if (ret) { dev_err(&hdev->pdev->dev, "failed to get fd tcam key x, ret = %d\n", ret); @@ -1629,7 +1624,7 @@ static int hclge_dbg_dump_fd_tcam(struct hclge_dev *hdev, char *buf, int len)
pos += scnprintf(buf + pos, len - pos, "%s", tcam_buf);
- ret = hclge_dbg_fd_tcam_read(hdev, false, tcam_buf, tcam_msg); + ret = hclge_dbg_fd_tcam_read(hdev, false, tcam_buf, HCLGE_FD_STAGE_1, rule_locs[i]); if (ret) { dev_err(&hdev->pdev->dev, "failed to get fd tcam key y, ret = %d\n", ret); @@ -1645,6 +1640,86 @@ static int hclge_dbg_dump_fd_tcam(struct hclge_dev *hdev, char *buf, int len) return ret; }
+static int hclge_query_rules_valid(struct hclge_dev *hdev, u8 stage, u32 loc) +{ +#define HCLGE_TCAM_SELECTION_X 1 + struct hclge_fd_tcam_config_1_cmd *req1; + struct hclge_fd_tcam_config_2_cmd *req2; + struct hclge_fd_tcam_config_3_cmd *req3; + struct hclge_desc desc[3]; + int ret; + + hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_FD_TCAM_OP, true); + desc[0].flag |= cpu_to_le16(HCLGE_COMM_CMD_FLAG_NEXT); + hclge_cmd_setup_basic_desc(&desc[1], HCLGE_OPC_FD_TCAM_OP, true); + desc[1].flag |= cpu_to_le16(HCLGE_COMM_CMD_FLAG_NEXT); + hclge_cmd_setup_basic_desc(&desc[2], HCLGE_OPC_FD_TCAM_OP, true); + + req1 = (struct hclge_fd_tcam_config_1_cmd *)desc[0].data; + req2 = (struct hclge_fd_tcam_config_2_cmd *)desc[1].data; + req3 = (struct hclge_fd_tcam_config_3_cmd *)desc[2].data; + + req1->stage = stage; + req1->xy_sel = HCLGE_TCAM_SELECTION_X; + req1->index = cpu_to_le32(loc); + + ret = hclge_cmd_send(&hdev->hw, desc, 3); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to read tcam status, ret = %d\n", ret); + return ret; + } + + return req1->entry_vld; +} + +static int hclge_dbg_dump_qb_tcam(struct hclge_dev *hdev, char *buf, int len) +{ + char *tcam_buf; + int pos = 0; + int ret = 0; + int i; + + if (!hnae3_ae_dev_fd_supported(hdev->ae_dev)) { + dev_err(&hdev->pdev->dev, + "Only FD-supported dev supports dump fd tcam\n"); + return -EOPNOTSUPP; + } + + tcam_buf = kzalloc(HCLGE_DBG_TCAM_BUF_SIZE, GFP_KERNEL); + if (!tcam_buf) + return -ENOMEM; + + for (i = 0; i < hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1]; i++) { + if (hclge_query_rules_valid(hdev, HCLGE_FD_STAGE_1, i) <= 0) + continue; + + ret = hclge_dbg_fd_tcam_read(hdev, true, tcam_buf, + HCLGE_FD_STAGE_1, i); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get qb tcam key x, ret = %d\n", ret); + goto out; + } + + pos += scnprintf(buf + pos, len - pos, "%s", tcam_buf); + + ret = hclge_dbg_fd_tcam_read(hdev, false, tcam_buf, + HCLGE_FD_STAGE_1, i); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get qb tcam key y, ret = %d\n", ret); + goto out; + } + + pos += scnprintf(buf + pos, len - pos, "%s", tcam_buf); + } + +out: + kfree(tcam_buf); + return ret; +} + static int hclge_dbg_dump_fd_counter(struct hclge_dev *hdev, char *buf, int len) { u8 func_num = pci_num_vf(hdev->pdev) + 1; /* pf and enabled vf num */ @@ -2398,6 +2473,14 @@ static int hclge_dbg_dump_ptp_info(struct hclge_dev *hdev, char *buf, int len) return 0; }
+static int hclge_dbg_dump_tcam(struct hclge_dev *hdev, char *buf, int len) +{ + if (test_bit(HCLGE_STATE_HW_QB_ENABLE, &hdev->state)) + return hclge_dbg_dump_qb_tcam(hdev, buf, len); + else + return hclge_dbg_dump_fd_tcam(hdev, buf, len); +}; + static int hclge_dbg_dump_mac_uc(struct hclge_dev *hdev, char *buf, int len) { hclge_dbg_dump_mac_list(hdev, buf, len, true); @@ -2537,14 +2620,14 @@ static const struct hclge_dbg_func hclge_dbg_cmd_func[] = { .cmd = HNAE3_DBG_CMD_REG_DCB, .dbg_dump = hclge_dbg_dump_dcb, }, - { - .cmd = HNAE3_DBG_CMD_FD_TCAM, - .dbg_dump = hclge_dbg_dump_fd_tcam, - }, { .cmd = HNAE3_DBG_CMD_MAC_TNL_STATUS, .dbg_dump = hclge_dbg_dump_mac_tnl_status, }, + { + .cmd = HNAE3_DBG_CMD_FD_TCAM, + .dbg_dump = hclge_dbg_dump_tcam, + }, { .cmd = HNAE3_DBG_CMD_SERV_INFO, .dbg_dump = hclge_dbg_dump_serv_info, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h index 724052928b88..7af1f5a84fba 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h @@ -69,11 +69,6 @@ struct hclge_dbg_reg_common_msg { enum hclge_opcode_type cmd; };
-struct hclge_dbg_tcam_msg { - u8 stage; - u32 loc; -}; - #define HCLGE_DBG_MAX_DFX_MSG_LEN 60 struct hclge_dbg_dfx_message { int flag;
From: Yuanzheng Song songyuanzheng@huawei.com
stable inclusion from stable-v5.10.153 commit 935a8b6202101d7f58fe9cd11287f9cec0d8dd32 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5XS4G CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
The vma->anon_vma of the child process may be NULL because the entire vma does not contain anonymous pages. In this case, a BUG will occur when copy_present_page() passes a copy of a non-anonymous page of that vma to page_add_new_anon_rmap() to set up a new anonymous rmap.
------------[ cut here ]------------
kernel BUG at mm/rmap.c:1044!
Internal error: Oops - BUG: 0 [#1] SMP
Modules linked in:
CPU: 2 PID: 3617 Comm: test Not tainted 5.10.149 #1
Hardware name: linux,dummy-virt (DT)
pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
pc : __page_set_anon_rmap+0xbc/0xf8
lr : __page_set_anon_rmap+0xbc/0xf8
sp : ffff800014c1b870
x29: ffff800014c1b870 x28: 0000000000000001 x27: 0000000010100073
x26: ffff1d65c517baa8 x25: ffff1d65cab0f000 x24: ffff1d65c416d800
x23: ffff1d65cab5f248 x22: 0000000020000000 x21: 0000000000000001
x20: 0000000000000000 x19: fffffe75970023c0 x18: 0000000000000000
x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
x11: 0000000000000000 x10: 0000000000000000 x9 : ffffc3096d5fb858
x8 : 0000000000000000 x7 : 0000000000000011 x6 : ffff5a5c9089c000
x5 : 0000000000020000 x4 : ffff5a5c9089c000 x3 : ffffc3096d200000
x2 : ffffc3096e8d0000 x1 : ffff1d65ca3da740 x0 : 0000000000000000
Call trace:
 __page_set_anon_rmap+0xbc/0xf8
 page_add_new_anon_rmap+0x1e0/0x390
 copy_pte_range+0xd00/0x1248
 copy_page_range+0x39c/0x620
 dup_mmap+0x2e0/0x5a8
 dup_mm+0x78/0x140
 copy_process+0x918/0x1a20
 kernel_clone+0xac/0x638
 __do_sys_clone+0x78/0xb0
 __arm64_sys_clone+0x30/0x40
 el0_svc_common.constprop.0+0xb0/0x308
 do_el0_svc+0x48/0xb8
 el0_svc+0x24/0x38
 el0_sync_handler+0x160/0x168
 el0_sync+0x180/0x1c0
Code: 97f8ff85 f9400294 17ffffeb 97f8ff82 (d4210000)
---[ end trace a972347688dc9bd4 ]---
Kernel panic - not syncing: Oops - BUG: Fatal exception
SMP: stopping secondary CPUs
Kernel Offset: 0x43095d200000 from 0xffff800010000000
PHYS_OFFSET: 0xffffe29a80000000
CPU features: 0x08200022,61806082
Memory Limit: none
---[ end Kernel panic - not syncing: Oops - BUG: Fatal exception ]---
This problem has been fixed by the commit <fb3d824d1a46> ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()"), but still exists in the linux-5.10.y branch.
That patch is not applicable to this version because of the large version differences. Therefore, fix it by adding a non-anonymous page check in copy_present_page().
Cc: stable@vger.kernel.org Fixes: 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Acked-by: Peter Xu peterx@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuanzheng Song songyuanzheng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memory.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/mm/memory.c b/mm/memory.c index 0a160b704269..0505f9c009c4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -823,6 +823,17 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (likely(!page_maybe_dma_pinned(page))) return 1;
+ /* + * The vma->anon_vma of the child process may be NULL + * because the entire vma does not contain anonymous pages. + * A BUG will occur when the copy_present_page() passes + * a copy of a non-anonymous page of that vma to the + * page_add_new_anon_rmap() to set up new anonymous rmap. + * Return 1 if the page is not an anonymous page. + */ + if (!PageAnon(page)) + return 1; + new_page = *prealloc; if (!new_page) return -EAGAIN;
From: Liu Shixin liushixin2@huawei.com
stable inclusion from stable-v5.10.150 commit 45c33966759ea1b4040c08dacda99ef623c0ca29 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I62WRY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 958f32ce832ba781ac20e11bb2d12a9352ea28fc upstream.
The vma_lock and hugetlb_fault_mutex are dropped before handling userfault and reacquired after handle_userfault(), but reacquiring the vma_lock could lead to a UAF[1,2] due to the following race:
hugetlb_fault
  hugetlb_no_page
    /*unlock vma_lock */
    hugetlb_handle_userfault
      handle_userfault
        /* unlock mm->mmap_lock*/
                                   vm_mmap_pgoff
                                     do_mmap
                                       mmap_region
                                         munmap_vma_range
                                           /* clean old vma */
    /* lock vma_lock again  <--- UAF */
    /* unlock vma_lock */
Since the vma_lock will unlock immediately after hugetlb_handle_userfault(), let's drop the unneeded lock and unlock in hugetlb_handle_userfault() to fix the issue.
[1] https://lore.kernel.org/linux-mm/000000000000d5e00a05e834962e@google.com/ [2] https://lore.kernel.org/linux-mm/20220921014457.1668-1-liuzixian4@huawei.com... Link: https://lkml.kernel.org/r/20220923042113.137273-1-liushixin2@huawei.com Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook") Signed-off-by: Liu Shixin liushixin2@huawei.com Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Reported-by: syzbot+193f9cee8638750b23cf@syzkaller.appspotmail.com Reported-by: Liu Zixian liuzixian4@huawei.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: David Hildenbrand david@redhat.com Cc: John Hubbard jhubbard@nvidia.com Cc: Muchun Song songmuchun@bytedance.com Cc: Sidhartha Kumar sidhartha.kumar@oracle.com Cc: stable@vger.kernel.org [4.14+] Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: mm/hugetlb.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/hugetlb.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6b96eda50977..8f680994df4b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4770,6 +4770,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page = false; + u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/* * Currently, we are forced to kill the process in the event the @@ -4779,7 +4780,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); - return ret; + goto out; }
/* @@ -4798,7 +4799,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * Check for page in userfault range */ if (userfaultfd_missing(vma)) { - u32 hash; struct vm_fault vmf = { .vma = vma, .address = haddr, @@ -4813,17 +4813,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, };
/* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * vma_lock and hugetlb_fault_mutex must be dropped + * before handling userfault. Also mmap_lock will + * be dropped during handling userfault, any vma + * operation should be careful from here. */ - hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - goto out; + return handle_userfault(&vmf, VM_UFFD_MISSING); }
page = alloc_huge_page(vma, haddr, 0); @@ -4930,6 +4927,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
unlock_page(page); out: + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); return ret;
backout: @@ -5029,7 +5028,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (sp_check_vm_share_pool(vma->vm_flags)) ret = sharepool_no_page(mm, vma, mapping, idx, address, ptep, flags); else - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); + /* + * hugetlb_no_page will drop vma lock and hugetlb fault + * mutex internally, which make us return immediately. + */ + return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); goto out_mutex; }
From: Quan Zhou zhouquan65@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62Q2L CVE: NA
----------------------------------------------------
Add a new entry ("HIP09") in oem_str[] to support detection of the new HiSi CPU type.
Signed-off-by: Quan Zhou zhouquan65@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Nianyao Tang tangnianyao@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/hisilicon/hisi_virt.c | 4 +++- arch/arm64/kvm/hisilicon/hisi_virt.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 9587f9508a79..90c363ed642e 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -15,6 +15,7 @@ static const char * const hisi_cpu_type_str[] = { "Hisi1612", "Hisi1616", "Hisi1620", + "HIP09", "Unknown" };
@@ -22,7 +23,8 @@ static const char * const hisi_cpu_type_str[] = { static const char * const oem_str[] = { "HIP06", /* Hisi 1612 */ "HIP07", /* Hisi 1616 */ - "HIP08" /* Hisi 1620 */ + "HIP08", /* Hisi 1620 */ + "HIP09" /* HIP09 */ };
/* diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index ef8de6a2101e..ebc462bf2a9d 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -10,6 +10,7 @@ enum hisi_cpu_type { HI_1612, HI_1616, HI_1620, + HI_IP09, UNKNOWN_HI_TYPE };
From: Quan Zhou zhouquan65@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62Q2L CVE: NA
----------------------------------------------------
DVMBM is a virtualization extension introduced on HIP09, which allows TLBIs executed at NS EL1 to be broadcast to a configurable range of physical CPUs (even with HCR_EL2.FB set). It brings a TLBI broadcast optimization.

Introduce the logic to detect and enable this feature. Also add a kernel command-line parameter "kvm-arm.dvmbm_enabled" (defaults to 0) so that users can {en,dis}able DVMBM as needed. The parameter description is added under Documentation/.
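For example, to opt in on a supported host, append the following to the kernel command line (it only takes effect on HIP09 hardware with a VHE kernel, i.e. the kernel running at EL2):

  kvm-arm.dvmbm_enabled=1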
Signed-off-by: Quan Zhou zhouquan65@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Nianyao Tang tangnianyao@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 3 ++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 5 ++ arch/arm64/kvm/hisilicon/hisi_virt.c | 49 +++++++++++++++++++ arch/arm64/kvm/hisilicon/hisi_virt.h | 6 +++ 5 files changed, 64 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3860cb462f0b..8a1a25216da6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2408,6 +2408,9 @@ [KVM,ARM] Allow use of GICv4 for direct injection of LPIs.
+ kvm-arm.dvmbm_enabled= + [KVM,ARM] Allow use of HiSilicon DVMBM capability. + kvm_cma_resv_ratio=n [PPC] Reserves given percentage from system memory area for contiguous memory allocation for KVM hash pagetable diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 71a3ba24b287..df75b7e45c77 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -715,5 +715,6 @@ extern unsigned int twedel; #endif
extern bool kvm_ncsnp_support; +extern bool kvm_dvmbm_support;
#endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 469f324ce536..ca990f0269b2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -66,6 +66,9 @@ static bool vgic_present; /* Capability of non-cacheable snooping */ bool kvm_ncsnp_support;
+/* Capability of DVMBM */ +bool kvm_dvmbm_support; + static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
@@ -1865,8 +1868,10 @@ int kvm_arch_init(void *opaque) #ifdef CONFIG_KVM_HISI_VIRT probe_hisi_cpu_type(); kvm_ncsnp_support = hisi_ncsnp_supported(); + kvm_dvmbm_support = hisi_dvmbm_supported(); #endif kvm_info("KVM ncsnp %s\n", kvm_ncsnp_support ? "enabled" : "disabled"); + kvm_info("KVM dvmbm %s\n", kvm_dvmbm_support ? "enabled" : "disabled");
in_hyp_mode = is_kernel_in_hyp_mode();
diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 90c363ed642e..b81488cd663b 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -11,6 +11,8 @@
static enum hisi_cpu_type cpu_type = UNKNOWN_HI_TYPE;
+static bool dvmbm_enabled; + static const char * const hisi_cpu_type_str[] = { "Hisi1612", "Hisi1616", @@ -124,3 +126,50 @@ bool hisi_ncsnp_supported(void)
return supported; } + +static int __init early_dvmbm_enable(char *buf) +{ + return strtobool(buf, &dvmbm_enabled); +} +early_param("kvm-arm.dvmbm_enabled", early_dvmbm_enable); + +static void hardware_enable_dvmbm(void *data) +{ + u64 val; + + val = read_sysreg_s(SYS_LSUDVM_CTRL_EL2); + val |= LSUDVM_CTLR_EL2_MASK; + write_sysreg_s(val, SYS_LSUDVM_CTRL_EL2); +} + +static void hardware_disable_dvmbm(void *data) +{ + u64 val; + + val = read_sysreg_s(SYS_LSUDVM_CTRL_EL2); + val &= ~LSUDVM_CTLR_EL2_MASK; + write_sysreg_s(val, SYS_LSUDVM_CTRL_EL2); +} + +bool hisi_dvmbm_supported(void) +{ + if (cpu_type != HI_IP09) + return false; + + /* Determine whether DVMBM is supported by the hardware */ + if (!(read_sysreg(aidr_el1) & AIDR_EL1_DVMBM_MASK)) + return false; + + /* User provided kernel command-line parameter */ + if (!dvmbm_enabled || !is_kernel_in_hyp_mode()) { + on_each_cpu(hardware_disable_dvmbm, NULL, 1); + return false; + } + + /* + * Enable TLBI Broadcast optimization by setting + * LSUDVM_CTRL_EL2's bit[0]. + */ + on_each_cpu(hardware_enable_dvmbm, NULL, 1); + return true; +} diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index ebc462bf2a9d..95e5e889dcb1 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -14,7 +14,13 @@ enum hisi_cpu_type { UNKNOWN_HI_TYPE };
+/* HIP09 */ +#define AIDR_EL1_DVMBM_MASK GENMASK_ULL(13, 12) +#define SYS_LSUDVM_CTRL_EL2 sys_reg(3, 4, 15, 7, 4) +#define LSUDVM_CTLR_EL2_MASK BIT_ULL(0) + void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); +bool hisi_dvmbm_supported(void);
#endif /* __HISI_VIRT_H__ */
From: Quan Zhou zhouquan65@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62Q2L CVE: NA
----------------------------------------------------
We already have cpus_ptr in current thread struct now, through which we can know the pcpu range the thread is allowed to run on. So in kvm_arch_vcpu_{load,put}, we can also know the pcpu range the vcpu thread is allowed to be scheduled on, and that is the range we want to configure for TLBI broadcast.
Introduce two variables cpus_ptr and pre_cpus_ptr in struct kvm_vcpu_arch. @cpus_ptr always comes from current->cpus_ptr and @pre_cpus_ptr always comes from @cpus_ptr.
Signed-off-by: Quan Zhou zhouquan65@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Nianyao Tang tangnianyao@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/kvm_host.h | 6 +++++ arch/arm64/kvm/arm.c | 18 +++++++++++++ arch/arm64/kvm/hisilicon/hisi_virt.c | 38 ++++++++++++++++++++++++++++ arch/arm64/kvm/hisilicon/hisi_virt.h | 5 ++++ 4 files changed, 67 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index df75b7e45c77..7b6e2b3fd376 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -390,6 +390,12 @@ struct kvm_vcpu_arch { } pvsched;
struct id_registers idregs; + +#ifdef CONFIG_KVM_HISI_VIRT + /* Copy of current->cpus_ptr */ + cpumask_t *cpus_ptr; + cpumask_t *pre_cpus_ptr; +#endif };
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ca990f0269b2..0d8371eca686 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -338,6 +338,12 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (err) return err;
+#ifdef CONFIG_KVM_HISI_VIRT + err = kvm_hisi_dvmbm_vcpu_init(vcpu); + if (err) + return err; +#endif + return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); }
@@ -355,6 +361,10 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_vcpu_destroy(vcpu);
kvm_arm_vcpu_destroy(vcpu); + +#ifdef CONFIG_KVM_HISI_VIRT + kvm_hisi_dvmbm_vcpu_destroy(vcpu); +#endif }
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) @@ -445,6 +455,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_arm_is_pvsched_enabled(&vcpu->arch)) kvm_update_pvsched_preempted(vcpu, 0); + +#ifdef CONFIG_KVM_HISI_VIRT + kvm_hisi_dvmbm_load(vcpu); +#endif }
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -460,6 +474,10 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
if (kvm_arm_is_pvsched_enabled(&vcpu->arch)) kvm_update_pvsched_preempted(vcpu, 1); + +#ifdef CONFIG_KVM_HISI_VIRT + kvm_hisi_dvmbm_put(vcpu); +#endif }
static void vcpu_power_off(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index b81488cd663b..2c79e7f28ca5 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -173,3 +173,41 @@ bool hisi_dvmbm_supported(void) on_each_cpu(hardware_enable_dvmbm, NULL, 1); return true; } + +int kvm_hisi_dvmbm_vcpu_init(struct kvm_vcpu *vcpu) +{ + if (!kvm_dvmbm_support) + return 0; + + vcpu->arch.cpus_ptr = kzalloc(sizeof(cpumask_t), GFP_ATOMIC); + vcpu->arch.pre_cpus_ptr = kzalloc(sizeof(cpumask_t), GFP_ATOMIC); + if (!vcpu->arch.cpus_ptr || !vcpu->arch.pre_cpus_ptr) + return -ENOMEM; + + return 0; +} + +void kvm_hisi_dvmbm_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + if (!kvm_dvmbm_support) + return; + + kfree(vcpu->arch.cpus_ptr); + kfree(vcpu->arch.pre_cpus_ptr); +} + +void kvm_hisi_dvmbm_load(struct kvm_vcpu *vcpu) +{ + if (!kvm_dvmbm_support) + return; + + cpumask_copy(vcpu->arch.cpus_ptr, current->cpus_ptr); +} + +void kvm_hisi_dvmbm_put(struct kvm_vcpu *vcpu) +{ + if (!kvm_dvmbm_support) + return; + + cpumask_copy(vcpu->arch.pre_cpus_ptr, vcpu->arch.cpus_ptr); +} diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index 95e5e889dcb1..3aac75651733 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -23,4 +23,9 @@ void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); bool hisi_dvmbm_supported(void);
+int kvm_hisi_dvmbm_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_hisi_dvmbm_vcpu_destroy(struct kvm_vcpu *vcpu); +void kvm_hisi_dvmbm_load(struct kvm_vcpu *vcpu); +void kvm_hisi_dvmbm_put(struct kvm_vcpu *vcpu); + #endif /* __HISI_VIRT_H__ */
From: Quan Zhou zhouquan65@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62Q2L CVE: NA
----------------------------------------------------
Introduce dvm_cpumask and dvm_lock in struct kvm_arch. dvm_cpumask will store the union of all vcpus' cpus_ptr and will be used for the TLBI broadcast range. dvm_lock ensures exclusive manipulation of dvm_cpumask.
In vcpu_load, we should decide whether to perform the subsequent update operation by checking whether dvm_cpumask has changed.
Signed-off-by: Quan Zhou zhouquan65@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Nianyao Tang tangnianyao@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/kvm_host.h | 5 +++ arch/arm64/kvm/arm.c | 10 ++++++ arch/arm64/kvm/hisilicon/hisi_virt.c | 53 ++++++++++++++++++++++++++++ arch/arm64/kvm/hisilicon/hisi_virt.h | 2 ++ 4 files changed, 70 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7b6e2b3fd376..164a90b53195 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -121,6 +121,11 @@ struct kvm_arch { unsigned int pmuver;
u8 pfr0_csv2; + +#ifdef CONFIG_KVM_HISI_VIRT + spinlock_t dvm_lock; + cpumask_t *dvm_cpumask; /* Union of all vcpu's cpus_ptr */ +#endif };
struct kvm_vcpu_fault_info { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0d8371eca686..67d88b336da2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -143,6 +143,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret;
+#ifdef CONFIG_KVM_HISI_VIRT + ret = kvm_hisi_init_dvmbm(kvm); + if (ret) + return ret; +#endif + ret = kvm_arm_setup_stage2(kvm, type); if (ret) return ret; @@ -182,6 +188,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { int i;
+#ifdef CONFIG_KVM_HISI_VIRT + kvm_hisi_destroy_dvmbm(kvm); +#endif + bitmap_free(kvm->arch.pmu_filter);
kvm_vgic_destroy(kvm); diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 2c79e7f28ca5..18e2ddd8bf4b 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -198,10 +198,42 @@ void kvm_hisi_dvmbm_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_hisi_dvmbm_load(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + cpumask_t mask; + int i; + + /* Don't bother on old hardware */ if (!kvm_dvmbm_support) return;
cpumask_copy(vcpu->arch.cpus_ptr, current->cpus_ptr); + + if (likely(cpumask_equal(vcpu->arch.cpus_ptr, + vcpu->arch.pre_cpus_ptr))) + return; + + /* Re-calculate dvm_cpumask for this VM */ + spin_lock(&kvm->arch.dvm_lock); + + cpumask_clear(&mask); + kvm_for_each_vcpu(i, tmp, kvm) { + /* + * We may get the stale cpus_ptr if another thread + * is concurrently changing its affinity. It'll + * eventually go through vcpu_load() and we rely on + * the last dvm_lock holder to make things correct. + */ + cpumask_or(&mask, &mask, tmp->arch.cpus_ptr); + } + + if (cpumask_equal(kvm->arch.dvm_cpumask, &mask)) + goto out_unlock; + + cpumask_copy(kvm->arch.dvm_cpumask, &mask); + +out_unlock: + spin_unlock(&kvm->arch.dvm_lock); }
void kvm_hisi_dvmbm_put(struct kvm_vcpu *vcpu) @@ -211,3 +243,24 @@ void kvm_hisi_dvmbm_put(struct kvm_vcpu *vcpu)
cpumask_copy(vcpu->arch.pre_cpus_ptr, vcpu->arch.cpus_ptr); } + +int kvm_hisi_init_dvmbm(struct kvm *kvm) +{ + if (!kvm_dvmbm_support) + return 0; + + spin_lock_init(&kvm->arch.dvm_lock); + kvm->arch.dvm_cpumask = kzalloc(sizeof(cpumask_t), GFP_ATOMIC); + if (!kvm->arch.dvm_cpumask) + return -ENOMEM; + + return 0; +} + +void kvm_hisi_destroy_dvmbm(struct kvm *kvm) +{ + if (!kvm_dvmbm_support) + return; + + kfree(kvm->arch.dvm_cpumask); +} diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index 3aac75651733..1fd4b3295d78 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -27,5 +27,7 @@ int kvm_hisi_dvmbm_vcpu_init(struct kvm_vcpu *vcpu); void kvm_hisi_dvmbm_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_hisi_dvmbm_load(struct kvm_vcpu *vcpu); void kvm_hisi_dvmbm_put(struct kvm_vcpu *vcpu); +int kvm_hisi_init_dvmbm(struct kvm *kvm); +void kvm_hisi_destroy_dvmbm(struct kvm *kvm);
#endif /* __HISI_VIRT_H__ */
From: Quan Zhou zhouquan65@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I62Q2L CVE: NA
----------------------------------------------------
Implement the DVMBM capability. Each time a vcpu is loaded, we re-calculate the VM-wide dvm_cpumask. If it has changed, we kick all the other vcpus out to reload the latest LSUDVMBM value into the register; a new request, KVM_REQ_RELOAD_DVMBM, is added to implement this.

Otherwise, if this single vcpu did not change dvm_cpumask, we only reload the cached LSUDVMBM value into the register to keep its contents correct, and nothing else is done.
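As a rough worked example of the encoding (based on the layout added in hisi_virt.h below): for a VM whose vcpus are all confined to the die with Aff3 = 1, cluster Aff2 = 2 and cores Aff1 = 0-3, the one-die case applies. The range field is DVMBM_RANGE_ONE_DIE (which is 0), Aff3 goes into the DIE1 field, and one bit is set per cluster/core pair at position aff2 * 4 + aff1, so the cached value is lsudvmbm_el2 = (1ULL << 53) | 0xF00 (bits 8-11).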
Signed-off-by: Quan Zhou zhouquan65@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Nianyao Tang tangnianyao@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/kvm/arm.c | 5 ++ arch/arm64/kvm/hisilicon/hisi_virt.c | 125 ++++++++++++++++++++++++++- arch/arm64/kvm/hisilicon/hisi_virt.h | 28 ++++++ 4 files changed, 159 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 164a90b53195..cfd788771c64 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -47,6 +47,7 @@ #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) +#define KVM_REQ_RELOAD_DVMBM KVM_ARCH_REQ(6)
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -125,6 +126,7 @@ struct kvm_arch { #ifdef CONFIG_KVM_HISI_VIRT spinlock_t dvm_lock; cpumask_t *dvm_cpumask; /* Union of all vcpu's cpus_ptr */ + u64 lsudvmbm_el2; #endif };
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 67d88b336da2..7527ac19332f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -752,6 +752,11 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) kvm_pmu_handle_pmcr(vcpu, __vcpu_sys_reg(vcpu, PMCR_EL0)); + +#ifdef CONFIG_KVM_HISI_VIRT + if (kvm_check_request(KVM_REQ_RELOAD_DVMBM, vcpu)) + kvm_hisi_reload_lsudvmbm(vcpu->kvm); +#endif } }
diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 18e2ddd8bf4b..10233b801896 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -196,6 +196,97 @@ void kvm_hisi_dvmbm_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vcpu->arch.pre_cpus_ptr); }
+static void __kvm_write_lsudvmbm(struct kvm *kvm) +{ + write_sysreg_s(kvm->arch.lsudvmbm_el2, SYS_LSUDVMBM_EL2); +} + +static void kvm_write_lsudvmbm(struct kvm *kvm) +{ + /* Do we really need to hold the dvm_lock?? */ + spin_lock(&kvm->arch.dvm_lock); + __kvm_write_lsudvmbm(kvm); + spin_unlock(&kvm->arch.dvm_lock); +} + +static int kvm_dvmbm_get_dies_info(struct kvm *kvm, u64 *vm_aff3s, int size) +{ + int num = 0, cpu; + + for_each_cpu(cpu, kvm->arch.dvm_cpumask) { + bool found = false; + u64 aff3; + int i; + + if (num >= size) + break; + + aff3 = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 3); + for (i = 0; i < num; i++) { + if (vm_aff3s[i] == aff3) { + found = true; + break; + } + } + + if (!found) + vm_aff3s[num++] = aff3; + } + + return num; +} + +static void kvm_update_vm_lsudvmbm(struct kvm *kvm) +{ + u64 mpidr, aff3, aff2, aff1; + u64 vm_aff3s[DVMBM_MAX_DIES]; + u64 val; + int cpu, nr_dies; + + nr_dies = kvm_dvmbm_get_dies_info(kvm, vm_aff3s, DVMBM_MAX_DIES); + if (nr_dies > 2) { + val = DVMBM_RANGE_ALL_DIES << DVMBM_RANGE_SHIFT; + goto out_update; + } + + if (nr_dies == 1) { + val = DVMBM_RANGE_ONE_DIE << DVMBM_RANGE_SHIFT | + vm_aff3s[0] << DVMBM_DIE1_SHIFT; + + /* fulfill bits [52:0] */ + for_each_cpu(cpu, kvm->arch.dvm_cpumask) { + mpidr = cpu_logical_map(cpu); + aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2); + aff1 = MPIDR_AFFINITY_LEVEL(mpidr, 1); + + val |= 1ULL << (aff2 * 4 + aff1); + } + + goto out_update; + } + + /* nr_dies == 2 */ + val = DVMBM_RANGE_TWO_DIES << DVMBM_RANGE_SHIFT | + DVMBM_GRAN_CLUSTER << DVMBM_GRAN_SHIFT | + vm_aff3s[0] << DVMBM_DIE1_SHIFT | + vm_aff3s[1] << DVMBM_DIE2_SHIFT; + + /* and fulfill bits [43:0] */ + for_each_cpu(cpu, kvm->arch.dvm_cpumask) { + mpidr = cpu_logical_map(cpu); + aff3 = MPIDR_AFFINITY_LEVEL(mpidr, 3); + aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2); + + if (aff3 == vm_aff3s[0]) + val |= 1ULL << (aff2 + DVMBM_DIE1_CLUSTER_SHIFT); + else + val |= 1ULL << (aff2 + DVMBM_DIE2_CLUSTER_SHIFT); + } + +out_update: + kvm->arch.lsudvmbm_el2 = val; +} + void kvm_hisi_dvmbm_load(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; @@ -210,8 +301,10 @@ void kvm_hisi_dvmbm_load(struct kvm_vcpu *vcpu) cpumask_copy(vcpu->arch.cpus_ptr, current->cpus_ptr);
if (likely(cpumask_equal(vcpu->arch.cpus_ptr, - vcpu->arch.pre_cpus_ptr))) + vcpu->arch.pre_cpus_ptr))) { + kvm_write_lsudvmbm(kvm); return; + }
/* Re-calculate dvm_cpumask for this VM */ spin_lock(&kvm->arch.dvm_lock); @@ -232,7 +325,21 @@ void kvm_hisi_dvmbm_load(struct kvm_vcpu *vcpu)
cpumask_copy(kvm->arch.dvm_cpumask, &mask);
+ /* + * Perform a heavy invalidation for this VMID. Good place + * to optimize, right? + */ + kvm_flush_remote_tlbs(kvm); + + /* + * Re-calculate LSUDVMBM_EL2 for this VM and kick all vcpus + * out to reload the LSUDVMBM configuration. + */ + kvm_update_vm_lsudvmbm(kvm); + kvm_make_all_cpus_request(kvm, KVM_REQ_RELOAD_DVMBM); + out_unlock: + __kvm_write_lsudvmbm(kvm); spin_unlock(&kvm->arch.dvm_lock); }
@@ -242,6 +349,12 @@ void kvm_hisi_dvmbm_put(struct kvm_vcpu *vcpu) return;
cpumask_copy(vcpu->arch.pre_cpus_ptr, vcpu->arch.cpus_ptr); + + /* + * We're pretty sure that host kernel runs at EL2 (as + * DVMBM is disabled in case of nVHE) and can't be affected + * by the configured SYS_LSUDVMBM_EL2. + */ }
int kvm_hisi_init_dvmbm(struct kvm *kvm) @@ -264,3 +377,13 @@ void kvm_hisi_destroy_dvmbm(struct kvm *kvm)
kfree(kvm->arch.dvm_cpumask); } + +void kvm_hisi_reload_lsudvmbm(struct kvm *kvm) +{ + if (WARN_ON_ONCE(!kvm_dvmbm_support)) + return; + + preempt_disable(); + kvm_write_lsudvmbm(kvm); + preempt_enable(); +} diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index 1fd4b3295d78..aefed2777a9e 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -19,6 +19,33 @@ enum hisi_cpu_type { #define SYS_LSUDVM_CTRL_EL2 sys_reg(3, 4, 15, 7, 4) #define LSUDVM_CTLR_EL2_MASK BIT_ULL(0)
+/* + * MPIDR_EL1 layout on HIP09 + * + * Aff3[7:3] - socket ID [0-15] + * Aff3[2:0] - die ID [1,3] + * Aff2 - cluster ID [0-9] + * Aff1 - core ID [0-3] + * Aff0 - thread ID [0,1] + */ + +#define SYS_LSUDVMBM_EL2 sys_reg(3, 4, 15, 7, 5) +#define DVMBM_RANGE_SHIFT 62 +#define DVMBM_RANGE_ONE_DIE 0ULL +#define DVMBM_RANGE_TWO_DIES 1ULL +#define DVMBM_RANGE_ALL_DIES 3ULL + +#define DVMBM_GRAN_SHIFT 61 +#define DVMBM_GRAN_CLUSTER 0ULL +#define DVMBM_GRAN_DIE 1ULL + +#define DVMBM_DIE1_SHIFT 53 +#define DVMBM_DIE2_SHIFT 45 +#define DVMBM_DIE1_CLUSTER_SHIFT 22 +#define DVMBM_DIE2_CLUSTER_SHIFT 0 + +#define DVMBM_MAX_DIES 32 + void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); bool hisi_dvmbm_supported(void); @@ -29,5 +56,6 @@ void kvm_hisi_dvmbm_load(struct kvm_vcpu *vcpu); void kvm_hisi_dvmbm_put(struct kvm_vcpu *vcpu); int kvm_hisi_init_dvmbm(struct kvm *kvm); void kvm_hisi_destroy_dvmbm(struct kvm *kvm); +void kvm_hisi_reload_lsudvmbm(struct kvm *kvm);
#endif /* __HISI_VIRT_H__ */
From: Ard Biesheuvel ardb@kernel.org
mainline inclusion from mainline-v5.13-rc1 commit f9e7a99fb6b86aa6a00e53b34ee6973840e005aa category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I634EK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The cache invalidation code in v7_invalidate_l1 can be tweaked to re-read the associativity from CCSIDR, and keep the way identifier component in a single register that is assigned in the outer loop. This way, we need 2 registers less.
Given that the number of sets is typically much larger than the associativity, rearrange the code so that the outer loop has the fewer number of iterations, ensuring that the re-read of CCSIDR only occurs a handful of times in practice.
Fix the whitespace while at it, and update the comment to indicate that this code is no longer a clone of anything else.
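For readers who find the set/way arithmetic easier to follow in C, below is a rough host-side model of the rearranged iteration; it is an illustration only, not kernel code. invalidate_by_set_way() is a hypothetical stand-in for the DCISW operation ("mcr p15, 0, rX, c7, c6, 2"), __builtin_clz() is the GCC/Clang builtin standing in for the CLZ instruction, and the model keeps NumSets in a local variable, whereas the assembly has to re-read CCSIDR in the outer loop because it is short on scratch registers.

#include <stdio.h>

/* Hypothetical stand-in for the DCISW cache maintenance operation */
static void invalidate_by_set_way(unsigned int val)
{
	printf("DCISW 0x%08x\n", val);
}

static void model_v7_invalidate_l1(unsigned int ccsidr)
{
	unsigned int assoc = (ccsidr >> 3) & 0x3ff;    /* NumWays - 1, CCSIDR[12:3]  */
	unsigned int nsets = (ccsidr >> 13) & 0x7fff;  /* NumSets - 1, CCSIDR[27:13] */
	unsigned int set_shift = (ccsidr & 0x7) + 4;   /* log2(line length in bytes) */
	/* ways live in the top bits; with a single way the way field is always 0 */
	unsigned int way_shift = assoc ? __builtin_clz(assoc) : 0;
	int way, set;

	/* outer loop over the (few) ways, inner loop over the (many) sets */
	for (way = assoc; way >= 0; way--)
		for (set = nsets; set >= 0; set--)
			invalidate_by_set_way(((unsigned int)way << way_shift) |
					      ((unsigned int)set << set_shift));
}

int main(void)
{
	/* example CCSIDR encoding: 4-way, 256-set, 64-byte lines (made up) */
	model_v7_invalidate_l1((255u << 13) | (3u << 3) | 0x2);
	return 0;
}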
Acked-by: Nicolas Pitre nico@fluxnic.net Signed-off-by: Ard Biesheuvel ardb@kernel.org Signed-off-by: Russell King rmk+kernel@armlinux.org.uk Signed-off-by: Zhang Jianhua chris.zjh@huawei.com Reviewed-by: Liao Chang liaochang1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm/mm/cache-v7.S | 51 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 26 deletions(-)
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 307f381eee71..76201ee9ee59 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -33,9 +33,8 @@ icache_size: * processor. We fix this by performing an invalidate, rather than a * clean + invalidate, before jumping into the kernel. * - * This function is cloned from arch/arm/mach-tegra/headsmp.S, and needs - * to be called for both secondary cores startup and primary core resume - * procedures. + * This function needs to be called for both secondary cores startup and + * primary core resume procedures. */ ENTRY(v7_invalidate_l1) mov r0, #0 @@ -43,32 +42,32 @@ ENTRY(v7_invalidate_l1) isb mrc p15, 1, r0, c0, c0, 0 @ read cache geometry from CCSIDR
- movw r1, #0x7fff - and r2, r1, r0, lsr #13 + movw r3, #0x3ff + and r3, r3, r0, lsr #3 @ 'Associativity' in CCSIDR[12:3] + clz r1, r3 @ WayShift + mov r2, #1 + mov r3, r3, lsl r1 @ NumWays-1 shifted into bits [31:...] + movs r1, r2, lsl r1 @ #1 shifted left by same amount + moveq r1, #1 @ r1 needs value > 0 even if only 1 way
- movw r1, #0x3ff + and r2, r0, #0x7 + add r2, r2, #4 @ SetShift
- and r3, r1, r0, lsr #3 @ NumWays - 1 - add r2, r2, #1 @ NumSets +1: movw r4, #0x7fff + and r0, r4, r0, lsr #13 @ 'NumSets' in CCSIDR[27:13]
- and r0, r0, #0x7 - add r0, r0, #4 @ SetShift - - clz r1, r3 @ WayShift - add r4, r3, #1 @ NumWays -1: sub r2, r2, #1 @ NumSets-- - mov r3, r4 @ Temp = NumWays -2: subs r3, r3, #1 @ Temp-- - mov r5, r3, lsl r1 - mov r6, r2, lsl r0 - orr r5, r5, r6 @ Reg = (Temp<<WayShift)|(NumSets<<SetShift) - mcr p15, 0, r5, c7, c6, 2 - bgt 2b - cmp r2, #0 - bgt 1b - dsb st - isb - ret lr +2: mov r4, r0, lsl r2 @ NumSet << SetShift + orr r4, r4, r3 @ Reg = (Temp<<WayShift)|(NumSets<<SetShift) + mcr p15, 0, r4, c7, c6, 2 + subs r0, r0, #1 @ Set-- + bpl 2b + subs r3, r3, r1 @ Way-- + bcc 3f + mrc p15, 1, r0, c0, c0, 0 @ re-read cache geometry from CCSIDR + b 1b +3: dsb st + isb + ret lr ENDPROC(v7_invalidate_l1)
/*
From: Ard Biesheuvel ardb@kernel.org
mainline inclusion from mainline-v5.13-rc1 commit 95731b8ee63ec9419822a51cd9878fa32582fdd2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I634EK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Now that we have reduced the number of registers that we need to preserve when calling v7_invalidate_l1 from the boot code, we can use scratch registers to preserve the remaining ones, and get rid of the mini stack entirely. This works around any issues regarding cache behavior in relation to the uncached accesses to this memory, which is hard to get right in the general case (i.e., both bare metal and under virtualization)
While at it, switch v7_invalidate_l1 to using ip as a scratch register instead of r4. This makes the function AAPCS compliant, and removes the need to stash r4 in ip across the call.
conflict: arch/arm/include/asm/memory.h
Acked-by: Nicolas Pitre nico@fluxnic.net Signed-off-by: Ard Biesheuvel ardb@kernel.org Signed-off-by: Russell King rmk+kernel@armlinux.org.uk Signed-off-by: Zhang Jianhua chris.zjh@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm/include/asm/memory.h | 15 -------------- arch/arm/mm/cache-v7.S | 10 ++++----- arch/arm/mm/proc-v7.S | 39 ++++++++++++++++------------------- 3 files changed, 23 insertions(+), 41 deletions(-)
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index a7a22bf5ca7e..05d692d50fe3 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -150,21 +150,6 @@ extern unsigned long vectors_base; */ #define PLAT_PHYS_OFFSET UL(CONFIG_PHYS_OFFSET)
-#ifdef CONFIG_XIP_KERNEL -/* - * When referencing data in RAM from the XIP region in a relative manner - * with the MMU off, we need the relative offset between the two physical - * addresses. The macro below achieves this, which is: - * __pa(v_data) - __xip_pa(v_text) - */ -#define PHYS_RELATIVE(v_data, v_text) \ - (((v_data) - PAGE_OFFSET + PLAT_PHYS_OFFSET) - \ - ((v_text) - XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR) + \ - CONFIG_XIP_PHYS_ADDR)) -#else -#define PHYS_RELATIVE(v_data, v_text) ((v_data) - (v_text)) -#endif - #ifndef __ASSEMBLY__
#ifdef CONFIG_RANDOMIZE_BASE diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 76201ee9ee59..830bbfb26ca5 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -53,12 +53,12 @@ ENTRY(v7_invalidate_l1) and r2, r0, #0x7 add r2, r2, #4 @ SetShift
-1: movw r4, #0x7fff - and r0, r4, r0, lsr #13 @ 'NumSets' in CCSIDR[27:13] +1: movw ip, #0x7fff + and r0, ip, r0, lsr #13 @ 'NumSets' in CCSIDR[27:13]
-2: mov r4, r0, lsl r2 @ NumSet << SetShift - orr r4, r4, r3 @ Reg = (Temp<<WayShift)|(NumSets<<SetShift) - mcr p15, 0, r4, c7, c6, 2 +2: mov ip, r0, lsl r2 @ NumSet << SetShift + orr ip, ip, r3 @ Reg = (Temp<<WayShift)|(NumSets<<SetShift) + mcr p15, 0, ip, c7, c6, 2 subs r0, r0, #1 @ Set-- bpl 2b subs r3, r3, r1 @ Way-- diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 2fcffcc60cc6..7bee6f68c74c 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -265,6 +265,20 @@ ENDPROC(cpu_pj4b_do_resume)
#endif
+ @ + @ Invoke the v7_invalidate_l1() function, which adheres to the AAPCS + @ rules, and so it may corrupt registers that we need to preserve. + @ + .macro do_invalidate_l1 + mov r6, r1 + mov r7, r2 + mov r10, lr + bl v7_invalidate_l1 @ corrupts {r0-r3, ip, lr} + mov r1, r6 + mov r2, r7 + mov lr, r10 + .endm + /* * __v7_setup * @@ -286,6 +300,7 @@ __v7_ca5mp_setup: __v7_ca9mp_setup: __v7_cr7mp_setup: __v7_cr8mp_setup: + do_invalidate_l1 mov r10, #(1 << 0) @ Cache/TLB ops broadcasting b 1f __v7_ca7mp_setup: @@ -293,13 +308,9 @@ __v7_ca12mp_setup: __v7_ca15mp_setup: __v7_b15mp_setup: __v7_ca17mp_setup: + do_invalidate_l1 mov r10, #0 -1: adr r0, __v7_setup_stack_ptr - ldr r12, [r0] - add r12, r12, r0 @ the local stack - stmia r12, {r1-r6, lr} @ v7_invalidate_l1 touches r0-r6 - bl v7_invalidate_l1 - ldmia r12, {r1-r6, lr} +1: #ifdef CONFIG_SMP orr r10, r10, #(1 << 6) @ Enable SMP/nAMP mode ALT_SMP(mrc p15, 0, r0, c1, c0, 1) @@ -480,12 +491,7 @@ __v7_pj4b_setup: #endif /* CONFIG_CPU_PJ4B */
__v7_setup: - adr r0, __v7_setup_stack_ptr - ldr r12, [r0] - add r12, r12, r0 @ the local stack - stmia r12, {r1-r6, lr} @ v7_invalidate_l1 touches r0-r6 - bl v7_invalidate_l1 - ldmia r12, {r1-r6, lr} + do_invalidate_l1
__v7_setup_cont: and r0, r9, #0xff000000 @ ARM? @@ -557,17 +563,8 @@ __errata_finish: orr r0, r0, r6 @ set them THUMB( orr r0, r0, #1 << 30 ) @ Thumb exceptions ret lr @ return to head.S:__ret - - .align 2 -__v7_setup_stack_ptr: - .word PHYS_RELATIVE(__v7_setup_stack, .) ENDPROC(__v7_setup)
- .bss - .align 2 -__v7_setup_stack: - .space 4 * 7 @ 7 registers - __INITDATA
.weak cpu_v7_bugs_init
From: Vladimir Murzin vladimir.murzin@arm.com
mainline inclusion from mainline-v5.16-rc7 commit 7202216a6f34d571a22274e729f841256bf8b1ef category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I634EK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
__secondary_data used to reside in r7 around call to PROCINFO_INITFUNC. After commit 95731b8ee63e ("ARM: 9059/1: cache-v7: get rid of mini-stack") r7 is used as a scratch register, so we have to reload __secondary_data before we setup the stack pointer.
conflict: arch/arm/kernel/head-nommu.S
Fixes: 95731b8ee63e ("ARM: 9059/1: cache-v7: get rid of mini-stack") Signed-off-by: Vladimir Murzin vladimir.murzin@arm.com Signed-off-by: Russell King (Oracle) rmk+kernel@armlinux.org.uk Signed-off-by: Zhang Jianhua chris.zjh@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm/kernel/head-nommu.S | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index 0fc814bbc34b..8796a69c78e0 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -114,6 +114,7 @@ ENTRY(secondary_startup) add r12, r12, r10 ret r12 1: bl __after_proc_init + ldr r7, __secondary_data @ reload r7 ldr sp, [r7, #12] @ set up the stack pointer mov fp, #0 b secondary_start_kernel
From: Luo Meng luomeng12@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I62762 CVE: NA
--------------------------------
A crash occurs as follows:

BUG: KASAN: null-ptr-deref in dev_create.cold+0x12/0x77
Read of size 8 at addr 0000000000000020 by task dmsetup/683
CPU: 4 PID: 683 Comm: dmsetup Not tainted 5.10.0-01524-g884de6e91114-dirty #11
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
Call Trace:
 ? dump_stack+0xdd/0x126
 ? kasan_report.cold+0xd1/0xdb
 ? dev_create.cold+0x12/0x77
 ? __asan_load8+0xae/0x110
 ? dev_create.cold+0x12/0x77
 ? dev_rename+0x720/0x720
 ? cap_capable+0xcf/0x130
 ? ctl_ioctl+0x2f5/0x750
 ? dev_rename+0x720/0x720
 ? free_params+0x50/0x50
 ? unmerge_queues+0x176/0x1b0
 ? __blkcg_punt_bio_submit+0x110/0x110
 ? mem_cgroup_handle_over_high+0x33/0x5e0
 ? dm_ctl_ioctl+0x12/0x20
 ? __se_sys_ioctl+0xc5/0x120
 ? __x64_sys_ioctl+0x46/0x60
 ? do_syscall_64+0x45/0x70
 ? entry_SYSCALL_64_after_hwframe+0x61/0xc6
This can be easily reproduced using:

  dmsetup create xxx --table "0 1000 linear /dev/sda 0"
  dmsetup remove xxx
Fix this by taking the hash lock (dm_hash_cells_mutex) in dev_create().
Fixes: a5100d0798e6 ("dm ioctl: add DMINFO() to track dm device create/remove")
Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm-ioctl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 2186a3a4e48b..1a3e40be81c6 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -800,9 +800,13 @@ static int dev_create(struct file *filp, struct dm_ioctl *param, size_t param_si
__dev_status(md, param);
+ mutex_lock(&dm_hash_cells_mutex); hc = dm_get_mdptr(md); - DMINFO("%s[%i]: %s (%s) is created successfully", - current->comm, current->pid, md->disk->disk_name, hc->name); + if (hc) + DMINFO("%s[%i]: %s (%s) is created successfully", + current->comm, current->pid, md->disk->disk_name, hc->name); + + mutex_unlock(&dm_hash_cells_mutex); dm_put(md);
return 0;
From: Luo Meng luomeng12@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5P05D CVE: NA
--------------------------------
When the thin pool is suspended and fail_io is set, resume reports an error as below: device-mapper: resume ioctl on vg-thinpool failed: Invalid argument
The thin pool also can't be removed if a bio is still in the deferred list.
This can be easily reproduced using:
echo "offline" > /sys/block/sda/device/state dd if=/dev/zero of=/dev/mapper/thin bs=4K count=1 dmsetup suspend /dev/mapper/pool mkfs.ext4 /dev/mapper/thin dmsetup resume /dev/mapper/pool
The root cause is that maybe_resize_data_dev() checks fail_io and returns an error before dm_resume() is called.
Fix this by adding a check for the FAIL mode at the end of pool_preresume().
Fixes: da105ed5fd7e (dm thin metadata: introduce dm_pool_abort_metadata) Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm-thin.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a196d7cb51bd..e837839e4def 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3566,20 +3566,29 @@ static int pool_preresume(struct dm_target *ti) */ r = bind_control_target(pool, ti); if (r) - return r; + goto out;
r = maybe_resize_data_dev(ti, &need_commit1); if (r) - return r; + goto out;
r = maybe_resize_metadata_dev(ti, &need_commit2); if (r) - return r; + goto out;
if (need_commit1 || need_commit2) (void) commit(pool);
- return 0; +out: + /* + * When thinpool is PM_FAIL, it cannot be rebuilt if + * bio is in deferred list. Therefor need to return 0 and + * call pool_resume() to flush IO. + */ + if (r && get_pool_mode(pool) == PM_FAIL) + r = 0; + + return r; }
static void pool_suspend_active_thins(struct pool *pool)
From: Zheng Yejian zhengyejian1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60L10 CVE: NA
--------------------------------
It was reported that if 'static_call' is used in an old function, the livepatch module created by kpatch for that old function cannot be inserted normally.
Root cause is that relocation of the static_call symbols in the livepatch module has not been done yet when the static calls are initialized:

load_module
  prepare_coming_module
    blocking_notifier_call_chain_robust
      notifier_call_chain_robust
        static_call_module_notify  <-- 1. static_call symbols are initialized here,
                                          but relocation is only done at mark "2." below
  do_init_module
    do_one_initcall
      klp_register_patch
        klp_init_patch
          klp_init_object
            klp_init_object_loaded <-- 2. .klp.xxx relocations are applied here
To solve it, we move the static_call initialization after relocation.
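For illustration, a hedged sketch of the kind of new function a kpatch-generated module may carry (all names here are invented): the static call key and trampoline live in the object being patched, not in the livepatch module itself, so the entry in the module's .static_call_sites section is only meaningful after the klp relocations have run.

/*
 * Illustrative only: 'example_call' is defined elsewhere, so the address
 * recorded in this module's .static_call_sites section needs a klp
 * relocation before static_call_module_notify() can patch the call site
 * correctly.
 */
#include <linux/static_call.h>

extern int example_default_handler(int x);
DECLARE_STATIC_CALL(example_call, example_default_handler);

int livepatched_new_func(int x)
{
	return static_call(example_call)(x);
}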
Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/livepatch/core.c | 19 +++++++++++++++++++ kernel/static_call.c | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index f6981faa18a8..338c2624de0e 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1192,6 +1192,12 @@ static void klp_init_patch_early(struct klp_patch *patch) } }
+#if defined(CONFIG_HAVE_STATIC_CALL_INLINE) +extern int klp_static_call_register(struct module *mod); +#else +static inline int klp_static_call_register(struct module *mod) { return 0; } +#endif + static int klp_init_patch(struct klp_patch *patch) { struct klp_object *obj; @@ -1223,6 +1229,19 @@ static int klp_init_patch(struct klp_patch *patch) pr_err("register jump label failed, ret=%d\n", ret); return ret; } + ret = klp_static_call_register(patch->mod); + if (ret) { + /* + * We no need to distinctly clean pre-registered jump_label + * here because it will be clean at path: + * load_module + * do_init_module + * fail_free_freeinit: <-- notify GOING here + */ + module_enable_ro(patch->mod, true); + pr_err("register static call failed, ret=%d\n", ret); + return ret; + } module_enable_ro(patch->mod, true);
#ifdef CONFIG_LIVEPATCH_STOP_MACHINE_CONSISTENCY diff --git a/kernel/static_call.c b/kernel/static_call.c index 43ba0b1e0edb..d38f6a92e3e4 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -356,6 +356,9 @@ static int static_call_add_module(struct module *mod) struct static_call_site *stop = start + mod->num_static_call_sites; struct static_call_site *site;
+ if (unlikely(!mod_klp_rel_completed(mod))) + return 0; + for (site = start; site != stop; site++) { unsigned long s_key = __static_call_key(site); unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; @@ -398,6 +401,9 @@ static void static_call_del_module(struct module *mod) struct static_call_mod *site_mod, **prev; struct static_call_site *site;
+ if (unlikely(!mod_klp_rel_completed(mod))) + return; + for (site = start; site < stop; site++) { key = static_call_key(site); if (key == prev_key) @@ -450,8 +456,21 @@ static struct notifier_block static_call_module_nb = { .notifier_call = static_call_module_notify, };
+int klp_static_call_register(struct module *mod) +{ + int ret; + + ret = static_call_module_notify(&static_call_module_nb, MODULE_STATE_COMING, mod); + return notifier_to_errno(ret); +} + #else
+int klp_static_call_register(struct module *mod) +{ + return 0; +} + static inline int __static_call_mod_text_reserved(void *start, void *end) { return 0;
From: Zheng Yejian zhengyejian1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60L10 CVE: NA
--------------------------------
In arm/arm64/ppc32/ppc64 this field is named old_insns, so rename the x86 field to match.
Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/include/asm/livepatch.h | 2 +- arch/x86/kernel/livepatch.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index b510f935ec11..dbcf69b9c4cb 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -36,7 +36,7 @@ int klp_check_calltrace(struct klp_patch *patch, int enable);
#define JMP_E9_INSN_SIZE 5 struct arch_klp_data { - unsigned char old_code[JMP_E9_INSN_SIZE]; + unsigned char old_insns[JMP_E9_INSN_SIZE]; #ifdef CONFIG_LIVEPATCH_STOP_MACHINE_CONSISTENCY /* * Saved opcode at the entry of the old func (which maybe replaced diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c index d134169488b6..5488bf014637 100644 --- a/arch/x86/kernel/livepatch.c +++ b/arch/x86/kernel/livepatch.c @@ -483,7 +483,7 @@ long arch_klp_save_old_code(struct arch_klp_data *arch_data, void *old_func)
/* Prevent text modification */ mutex_lock(&text_mutex); - ret = copy_from_kernel_nofault(arch_data->old_code, + ret = copy_from_kernel_nofault(arch_data->old_insns, old_func, JMP_E9_INSN_SIZE); mutex_unlock(&text_mutex);
@@ -525,7 +525,7 @@ void arch_klp_unpatch_func(struct klp_func *func) ip = (unsigned long)func_node->old_func; list_del_rcu(&func->stack_node); if (list_empty(&func_node->func_stack)) { - new = func_node->arch_data.old_code; + new = func_node->arch_data.old_insns; } else { next_func = list_first_or_null_rcu(&func_node->func_stack, struct klp_func, stack_node);
From: Zheng Yejian zhengyejian1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60L10 CVE: NA
--------------------------------
If a function is patched, the instructions at its beginning are modified into 'jump codes' which jump to the new function. This requires the function to be big enough, otherwise the modification may run past the end of the function.
Currently each architecture needs to implement arch_klp_func_can_patch() to check the function size. However, there are the following problems:
1. x86 does not implement arch_klp_func_can_patch();
2. in the arm64 & ppc32 implementations, the function size is checked only when a long jump is needed, so a very short function may be patched successfully at first, but as kernel modules grow a long jump may eventually be required and the function then becomes unpatchable;
3. the implementations are largely duplicated.
In this patch, introduce the macro KLP_MAX_REPLACE_SIZE to denote the maximum number of bytes replaced on patching, and move the check ahead into klp_init_object_loaded().
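As a reference point, a minimal user-space sketch of what the new check boils down to on x86; the struct is trimmed to the relevant field and the program is illustrative only, not kernel code:

/*
 * Illustrative only: mirrors the x86 definitions so that
 * KLP_MAX_REPLACE_SIZE resolves to 5 bytes (one E9 jmp rel32); a function
 * whose kallsyms size is smaller than that is now rejected in the common
 * klp_init_object_loaded() instead of in per-arch hooks.
 */
#include <stdio.h>
#include <stddef.h>

#define JMP_E9_INSN_SIZE 5

struct arch_klp_data {				/* x86 layout, trimmed */
	unsigned char old_insns[JMP_E9_INSN_SIZE];
};

#define KLP_MAX_REPLACE_SIZE sizeof(((struct arch_klp_data *)0)->old_insns)

static int can_patch(size_t old_size)
{
	/* same comparison the common code now performs */
	return old_size >= KLP_MAX_REPLACE_SIZE;
}

int main(void)
{
	printf("KLP_MAX_REPLACE_SIZE = %zu\n", KLP_MAX_REPLACE_SIZE);
	printf("3-byte function patchable?  %d\n", can_patch(3));  /* 0 */
	printf("16-byte function patchable? %d\n", can_patch(16)); /* 1 */
	return 0;
}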
Fixes: c33e42836a74 ("livepatch/core: Allow implementation without ftrace") Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm/include/asm/livepatch.h | 2 ++ arch/arm/kernel/livepatch.c | 25 ------------------------- arch/arm64/include/asm/livepatch.h | 2 ++ arch/arm64/kernel/livepatch.c | 25 ------------------------- arch/powerpc/include/asm/livepatch.h | 2 ++ arch/powerpc/kernel/livepatch_32.c | 18 ------------------ arch/powerpc/kernel/livepatch_64.c | 15 --------------- arch/x86/include/asm/livepatch.h | 2 ++ kernel/livepatch/core.c | 15 ++++++--------- 9 files changed, 14 insertions(+), 92 deletions(-)
diff --git a/arch/arm/include/asm/livepatch.h b/arch/arm/include/asm/livepatch.h index 47d8b01618c7..445a78d83d21 100644 --- a/arch/arm/include/asm/livepatch.h +++ b/arch/arm/include/asm/livepatch.h @@ -57,6 +57,8 @@ struct arch_klp_data { u32 saved_opcode; };
+#define KLP_MAX_REPLACE_SIZE sizeof_field(struct arch_klp_data, old_insns) + int arch_klp_add_breakpoint(struct arch_klp_data *arch_data, void *old_func); void arch_klp_remove_breakpoint(struct arch_klp_data *arch_data, void *old_func); long arch_klp_save_old_code(struct arch_klp_data *arch_data, void *old_func); diff --git a/arch/arm/kernel/livepatch.c b/arch/arm/kernel/livepatch.c index 713ce67fa6e3..bc09f338e713 100644 --- a/arch/arm/kernel/livepatch.c +++ b/arch/arm/kernel/livepatch.c @@ -496,28 +496,3 @@ void arch_klp_unpatch_func(struct klp_func *func) do_patch(pc, (unsigned long)next_func->new_func); } } - -#ifdef CONFIG_ARM_MODULE_PLTS -/* return 0 if the func can be patched */ -int arch_klp_func_can_patch(struct klp_func *func) -{ - unsigned long pc = (unsigned long)func->old_func; - unsigned long new_addr = (unsigned long)func->new_func; - unsigned long old_size = func->old_size; - - if (!old_size) - return -EINVAL; - - if (!offset_in_range(pc, new_addr, SZ_32M) && - (old_size < LJMP_INSN_SIZE * ARM_INSN_SIZE)) { - pr_err("func %s size less than limit\n", func->old_name); - return -EPERM; - } - return 0; -} -#else -int arch_klp_func_can_patch(struct klp_func *func) -{ - return 0; -} -#endif /* #ifdef CONFIG_ARM_MODULE_PLTS */ diff --git a/arch/arm64/include/asm/livepatch.h b/arch/arm64/include/asm/livepatch.h index bcb6c4081978..c41a22adc944 100644 --- a/arch/arm64/include/asm/livepatch.h +++ b/arch/arm64/include/asm/livepatch.h @@ -66,6 +66,8 @@ struct arch_klp_data { u32 saved_opcode; };
+#define KLP_MAX_REPLACE_SIZE sizeof_field(struct arch_klp_data, old_insns) + int arch_klp_add_breakpoint(struct arch_klp_data *arch_data, void *old_func); void arch_klp_remove_breakpoint(struct arch_klp_data *arch_data, void *old_func); long arch_klp_save_old_code(struct arch_klp_data *arch_data, void *old_func); diff --git a/arch/arm64/kernel/livepatch.c b/arch/arm64/kernel/livepatch.c index cda56066d859..8ec09c22dc26 100644 --- a/arch/arm64/kernel/livepatch.c +++ b/arch/arm64/kernel/livepatch.c @@ -483,28 +483,3 @@ void arch_klp_unpatch_func(struct klp_func *func) do_patch(pc, (unsigned long)next_func->new_func); } } - -#ifdef CONFIG_ARM64_MODULE_PLTS -/* return 0 if the func can be patched */ -int arch_klp_func_can_patch(struct klp_func *func) -{ - unsigned long pc = (unsigned long)func->old_func; - unsigned long new_addr = (unsigned long)func->new_func; - unsigned long old_size = func->old_size; - - if ((long)old_size <= 0) - return -EINVAL; - - if (!offset_in_range(pc, new_addr, SZ_128M) && - (old_size < LJMP_INSN_SIZE * sizeof(u32))) { - pr_err("func %s size less than limit\n", func->old_name); - return -EPERM; - } - return 0; -} -#else -int arch_klp_func_can_patch(struct klp_func *func) -{ - return 0; -} -#endif diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 39dcfc3c28ce..ae674ea59ab3 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -118,6 +118,8 @@ struct arch_klp_data {
#endif /* CONFIG_PPC64 */
+#define KLP_MAX_REPLACE_SIZE sizeof_field(struct arch_klp_data, old_insns) + struct stackframe { unsigned long sp; unsigned long pc; diff --git a/arch/powerpc/kernel/livepatch_32.c b/arch/powerpc/kernel/livepatch_32.c index 8f53386e7cf8..4eefae2f92dc 100644 --- a/arch/powerpc/kernel/livepatch_32.c +++ b/arch/powerpc/kernel/livepatch_32.c @@ -488,22 +488,4 @@ void arch_klp_unpatch_func(struct klp_func *func) do_patch(pc, (unsigned long)next_func->new_func); } } - -/* return 0 if the func can be patched */ -int arch_klp_func_can_patch(struct klp_func *func) -{ - unsigned long pc = (unsigned long)func->old_func; - unsigned long new_addr = (unsigned long)func->new_func; - unsigned long old_size = func->old_size; - - if (!old_size) - return -EINVAL; - - if (!offset_in_range(pc, new_addr, SZ_32M) && - (old_size < LJMP_INSN_SIZE * sizeof(u32))) { - pr_err("func %s size less than limit\n", func->old_name); - return -EPERM; - } - return 0; -} #endif diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c index cbb5e02cccff..aca7361ac12b 100644 --- a/arch/powerpc/kernel/livepatch_64.c +++ b/arch/powerpc/kernel/livepatch_64.c @@ -491,21 +491,6 @@ void arch_klp_unpatch_func(struct klp_func *func) } }
-/* return 0 if the func can be patched */ -int arch_klp_func_can_patch(struct klp_func *func) -{ - unsigned long old_size = func->old_size; - - if (!old_size) - return -EINVAL; - - if (old_size < LJMP_INSN_SIZE * sizeof(u32)) { - pr_err("func %s size less than limit\n", func->old_name); - return -EPERM; - } - return 0; -} - int arch_klp_init_func(struct klp_object *obj, struct klp_func *func) { #ifdef PPC64_ELF_ABI_v1 diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index dbcf69b9c4cb..e2cef5b2d8aa 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -46,6 +46,8 @@ struct arch_klp_data { #endif };
+#define KLP_MAX_REPLACE_SIZE sizeof_field(struct arch_klp_data, old_insns) + long arch_klp_save_old_code(struct arch_klp_data *arch_data, void *old_func); #ifdef CONFIG_LIVEPATCH_STOP_MACHINE_CONSISTENCY int arch_klp_check_breakpoint(struct arch_klp_data *arch_data, void *old_func); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 338c2624de0e..f613e94f0e38 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -938,11 +938,6 @@ void klp_free_replaced_patches_async(struct klp_patch *new_patch) }
#ifdef CONFIG_LIVEPATCH_WO_FTRACE -int __weak arch_klp_func_can_patch(struct klp_func *func) -{ - return 0; -} - int __weak arch_klp_init_func(struct klp_object *obj, struct klp_func *func) { return 0; @@ -965,9 +960,6 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) else func->old_mod = NULL; #endif - ret = arch_klp_func_can_patch(func); - if (ret) - return ret;
ret = arch_klp_init_func(obj, func); if (ret) @@ -1043,11 +1035,16 @@ static int klp_init_object_loaded(struct klp_patch *patch,
ret = kallsyms_lookup_size_offset((unsigned long)func->old_func, &func->old_size, NULL); - if (!ret) { + if (!ret || ((long)func->old_size < 0)) { pr_err("kallsyms size lookup failed for '%s'\n", func->old_name); return -ENOENT; } + if (func->old_size < KLP_MAX_REPLACE_SIZE) { + pr_err("%s size less than limit (%lu < %zu)\n", func->old_name, + func->old_size, KLP_MAX_REPLACE_SIZE); + return -EINVAL; + }
#ifdef PPC64_ELF_ABI_v1 /*
From: Zheng Yejian zhengyejian1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60L10 CVE: NA
--------------------------------
Static call and static key allow instructions at a call site to be modified at runtime; the related configs are CONFIG_HAVE_STATIC_CALL_INLINE for static call and CONFIG_JUMP_LABEL for static key.
When such a site sits within the first several instructions of an old function, and livepatch would also modify those bytes, a conflict occurs.
To avoid the conflict, do not allow a livepatch module to be inserted in this case.
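A hedged illustration of the layout that is now rejected (all symbol names invented): a function whose first bytes belong to a jump-label site, so livepatch's jump would overwrite bytes that jump_label also manages.

/*
 * Illustrative only: the static-key branch can be emitted within the first
 * KLP_MAX_REPLACE_SIZE bytes of old_func(), which is exactly what
 * check_address_conflict() now detects via jump_label_text_reserved() /
 * static_call_text_reserved().
 */
#include <linux/module.h>
#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(example_key);

static noinline int old_func(void)
{
	if (static_branch_unlikely(&example_key))
		return 1;
	return 0;
}

static int __init conflict_demo_init(void)
{
	pr_info("old_func() returns %d\n", old_func());
	return 0;
}
module_init(conflict_demo_init);

MODULE_LICENSE("GPL");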
Fixes: c33e42836a74 ("livepatch/core: Allow implementation without ftrace") Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/livepatch/core.c | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index f613e94f0e38..c8ef647c9cc4 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -34,6 +34,7 @@ #include <linux/delay.h> #include <linux/stop_machine.h> #endif +#include <linux/static_call.h>
/* * klp_mutex is a coarse lock which serializes access to klp data. All @@ -1195,6 +1196,43 @@ extern int klp_static_call_register(struct module *mod); static inline int klp_static_call_register(struct module *mod) { return 0; } #endif
+static int check_address_conflict(struct klp_patch *patch) +{ + struct klp_object *obj; + struct klp_func *func; + int ret; + void *start; + void *end; + + /* + * Locks seem required as comment of jump_label_text_reserved() said: + * Caller must hold jump_label_mutex. + * But looking into implementation of jump_label_text_reserved() and + * static_call_text_reserved(), call sites of every jump_label or static_call + * are checked, and they won't be changed after corresponding module inserted, + * so no need to take jump_label_lock and static_call_lock here. + */ + klp_for_each_object(patch, obj) { + klp_for_each_func(obj, func) { + start = func->old_func; + end = start + KLP_MAX_REPLACE_SIZE - 1; + ret = jump_label_text_reserved(start, end); + if (ret) { + pr_err("'%s' has static key in first %zu bytes, ret=%d\n", + func->old_name, KLP_MAX_REPLACE_SIZE, ret); + return -EINVAL; + } + ret = static_call_text_reserved(start, end); + if (ret) { + pr_err("'%s' has static call in first %zu bytes, ret=%d\n", + func->old_name, KLP_MAX_REPLACE_SIZE, ret); + return -EINVAL; + } + } + } + return 0; +} + static int klp_init_patch(struct klp_patch *patch) { struct klp_object *obj; @@ -1241,6 +1279,10 @@ static int klp_init_patch(struct klp_patch *patch) } module_enable_ro(patch->mod, true);
+ ret = check_address_conflict(patch); + if (ret) + return ret; + #ifdef CONFIG_LIVEPATCH_STOP_MACHINE_CONSISTENCY klp_for_each_object(patch, obj) klp_load_hook(obj);
From: Zheng Yejian zhengyejian1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60MKD CVE: NA
--------------------------------
Fix several code style issues:
- Do not use the magic number 10 (the length of the "migration/" prefix).
- Do not use parentheses when printing numbers.
- Braces {} are not necessary for single-statement blocks.
- Do not add blank lines at the start of a code block defined by braces.
Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm/kernel/livepatch.c | 11 +---------- arch/arm64/kernel/livepatch.c | 15 +++------------ arch/powerpc/kernel/livepatch_32.c | 15 +++------------ arch/powerpc/kernel/livepatch_64.c | 13 ++----------- arch/x86/kernel/livepatch.c | 2 +- include/linux/livepatch.h | 16 ++++++++++++++++ kernel/livepatch/core.c | 4 +--- 7 files changed, 27 insertions(+), 49 deletions(-)
diff --git a/arch/arm/kernel/livepatch.c b/arch/arm/kernel/livepatch.c index bc09f338e713..b4d26474ba33 100644 --- a/arch/arm/kernel/livepatch.c +++ b/arch/arm/kernel/livepatch.c @@ -277,16 +277,7 @@ static int do_check_calltrace(struct walk_stackframe_args *args, frame.sp = current_stack_pointer; frame.lr = (unsigned long)__builtin_return_address(0); frame.pc = (unsigned long)do_check_calltrace; - } else if (strncmp(t->comm, "migration/", 10) == 0) { - /* - * current on other CPU - * we call this in stop_machine, so the current - * of each CPUs is mirgation, just compare the - * task_comm here, because we can't get the - * cpu_curr(task_cpu(t))). This assumes that no - * other thread will pretend to be a stopper via - * task_comm. - */ + } else if (klp_is_migration_thread(t->comm)) { continue; } else { frame.fp = thread_saved_fp(t); diff --git a/arch/arm64/kernel/livepatch.c b/arch/arm64/kernel/livepatch.c index 8ec09c22dc26..6b5bcb491125 100644 --- a/arch/arm64/kernel/livepatch.c +++ b/arch/arm64/kernel/livepatch.c @@ -274,16 +274,7 @@ static int do_check_calltrace(struct walk_stackframe_args *args, /* current on this CPU */ frame.fp = (unsigned long)__builtin_frame_address(0); frame.pc = (unsigned long)do_check_calltrace; - } else if (strncmp(t->comm, "migration/", 10) == 0) { - /* - * current on other CPU - * we call this in stop_machine, so the current - * of each CPUs is mirgation, just compare the - * task_comm here, because we can't get the - * cpu_curr(task_cpu(t))). This assumes that no - * other thread will pretend to be a stopper via - * task_comm. - */ + } else if (klp_is_migration_thread(t->comm)) { continue; } else { frame.fp = thread_saved_fp(t); @@ -425,7 +416,7 @@ static int do_patch(unsigned long pc, unsigned long new_addr) for (i = 0; i < LJMP_INSN_SIZE; i++) { ret = aarch64_insn_patch_text_nosync(((u32 *)pc) + i, insns[i]); if (ret) { - pr_err("patch instruction(%d) large range failed, ret=%d\n", + pr_err("patch instruction %d large range failed, ret=%d\n", i, ret); return -EPERM; } @@ -471,7 +462,7 @@ void arch_klp_unpatch_func(struct klp_func *func) ret = aarch64_insn_patch_text_nosync(((u32 *)pc) + i, func_node->arch_data.old_insns[i]); if (ret) { - pr_err("restore instruction(%d) failed, ret=%d\n", i, ret); + pr_err("restore instruction %d failed, ret=%d\n", i, ret); return; } } diff --git a/arch/powerpc/kernel/livepatch_32.c b/arch/powerpc/kernel/livepatch_32.c index 4eefae2f92dc..7b4ed23bf2ca 100644 --- a/arch/powerpc/kernel/livepatch_32.c +++ b/arch/powerpc/kernel/livepatch_32.c @@ -293,16 +293,7 @@ static int do_check_calltrace(struct walk_stackframe_args *args, * backtrace is so similar */ stack = (unsigned long *)current_stack_pointer; - } else if (strncmp(t->comm, "migration/", 10) == 0) { - /* - * current on other CPU - * we call this in stop_machine, so the current - * of each CPUs is mirgation, just compare the - * task_comm here, because we can't get the - * cpu_curr(task_cpu(t))). This assumes that no - * other thread will pretend to be a stopper via - * task_comm. 
- */ + } else if (klp_is_migration_thread(t->comm)) { continue; } else { /* @@ -440,7 +431,7 @@ static int do_patch(unsigned long pc, unsigned long new_addr) ret = patch_instruction((struct ppc_inst *)(((u32 *)pc) + i), ppc_inst(insns[i])); if (ret) { - pr_err("patch instruction(%d) large range failed, ret=%d\n", + pr_err("patch instruction %d large range failed, ret=%d\n", i, ret); return -EPERM; } @@ -478,7 +469,7 @@ void arch_klp_unpatch_func(struct klp_func *func) ret = patch_instruction((struct ppc_inst *)(((u32 *)pc) + i), ppc_inst(func_node->arch_data.old_insns[i])); if (ret) { - pr_err("restore instruction(%d) failed, ret=%d\n", i, ret); + pr_err("restore instruction %d failed, ret=%d\n", i, ret); return; } } diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c index aca7361ac12b..416f9f03d747 100644 --- a/arch/powerpc/kernel/livepatch_64.c +++ b/arch/powerpc/kernel/livepatch_64.c @@ -314,16 +314,7 @@ static int do_check_calltrace(struct walk_stackframe_args *args, * so similar */ stack = (unsigned long *)current_stack_pointer; - } else if (strncmp(t->comm, "migration/", 10) == 0) { - /* - * current on other CPU - * we call this in stop_machine, so the current - * of each CPUs is mirgation, just compare the - * task_comm here, because we can't get the - * cpu_curr(task_cpu(t))). This assumes that no - * other thread will pretend to be a stopper via - * task_comm. - */ + } else if (klp_is_migration_thread(t->comm)) { continue; } else { /* @@ -476,7 +467,7 @@ void arch_klp_unpatch_func(struct klp_func *func) ret = patch_instruction((struct ppc_inst *)((u32 *)pc + i), ppc_inst(func_node->arch_data.old_insns[i])); if (ret) { - pr_err("restore instruction(%d) failed, ret=%d\n", i, ret); + pr_err("restore instruction %d failed, ret=%d\n", i, ret); break; } } diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c index 5488bf014637..0241e560bd2e 100644 --- a/arch/x86/kernel/livepatch.c +++ b/arch/x86/kernel/livepatch.c @@ -315,7 +315,7 @@ static int do_check_calltrace(bool (*fn)(void *, int *, unsigned long), void *da #endif
for_each_process_thread(g, t) { - if (!strncmp(t->comm, "migration/", 10)) + if (klp_is_migration_thread(t->comm)) continue;
#ifdef CONFIG_ARCH_STACKWALK diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 9301f8e9bb90..56ad1c1dd83e 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -316,6 +316,22 @@ static inline bool klp_have_reliable_stack(void) { return true; } #define klp_smp_isb() #endif
+#define KLP_MIGRATION_NAME_PREFIX "migration/" +static inline bool klp_is_migration_thread(const char *task_name) +{ + /* + * current on other CPU + * we call this in stop_machine, so the current + * of each CPUs is migration, just compare the + * task_comm here, because we can't get the + * cpu_curr(task_cpu(t))). This assumes that no + * other thread will pretend to be a stopper via + * task_comm. + */ + return !strncmp(task_name, KLP_MIGRATION_NAME_PREFIX, + sizeof(KLP_MIGRATION_NAME_PREFIX) - 1); +} + #endif /* CONFIG_LIVEPATCH_PER_TASK_CONSISTENCY */
#else /* !CONFIG_LIVEPATCH */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c8ef647c9cc4..9e65f6ae4061 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1054,10 +1054,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, * feature 'function descriptor'), otherwise size found by * 'kallsyms_lookup_size_offset' may be abnormal. */ - if (func->old_name[0] != '.') { + if (func->old_name[0] != '.') pr_warn("old_name '%s' may miss the prefix '.', old_size=%lu\n", func->old_name, func->old_size); - } #endif
if (func->nop) @@ -1565,7 +1564,6 @@ static int klp_mem_prepare(struct klp_patch *patch)
static void remove_breakpoint(struct klp_func *func, bool restore) { - struct klp_func_node *func_node = klp_find_func_node(func->old_func); struct arch_klp_data *arch_data = &func_node->arch_data;
From: David Vernet void@manifault.com
mainline inclusion from mainline-v5.17-rc1 commit f5bdb34bf0c9314548f2d8e2360b703ff3610303 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60MYE CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When initializing a 'struct klp_object' in klp_init_object_loaded(), and performing relocations in klp_resolve_symbols(), klp_find_object_symbol() is invoked to look up the address of a symbol in an already-loaded module (or vmlinux). This, in turn, calls kallsyms_on_each_symbol() or module_kallsyms_on_each_symbol() to find the address of the symbol that is being patched.
It turns out that symbol lookups often take up the most CPU time when enabling and disabling a patch, and may hog the CPU and cause other tasks on that CPU's runqueue to starve -- even in paths where interrupts are enabled. For example, under certain workloads, enabling a KLP patch with many objects or functions may cause ksoftirqd to be starved, and thus for interrupts to be backlogged and delayed. This may end up causing TCP retransmits on the host where the KLP patch is being applied, and in general, may cause any interrupts serviced by softirqd to be delayed while the patch is being applied.
So as to ensure that kallsyms_on_each_symbol() does not end up hogging the CPU, this patch adds a call to cond_resched() in kallsyms_on_each_symbol() and module_kallsyms_on_each_symbol(), which are invoked when doing a symbol lookup in vmlinux and a module respectively. Without this patch, if a live-patch is applied on a 36-core Intel host with heavy TCP traffic, a ~10x spike is observed in TCP retransmits while the patch is being applied. Additionally, collecting sched events with perf indicates that ksoftirqd is awakened ~1.3 seconds before it's eventually scheduled. With the patch, no increase in TCP retransmit events is observed, and ksoftirqd is scheduled shortly after it's awakened.
Signed-off-by: David Vernet void@manifault.com Acked-by: Miroslav Benes mbenes@suse.cz Acked-by: Song Liu song@kernel.org Signed-off-by: Petr Mladek pmladek@suse.com Link: https://lore.kernel.org/r/20211229215646.830451-1-void@manifault.com Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/kallsyms.c | 1 + kernel/module.c | 2 ++ 2 files changed, 3 insertions(+)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fe9de067771c..c6738525fe11 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -191,6 +191,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; + cond_resched(); } return module_kallsyms_on_each_symbol(fn, data); } diff --git a/kernel/module.c b/kernel/module.c index cfa3d8c370a8..00aabcd30e4e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4484,6 +4484,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, mod, kallsyms_symbol_value(sym)); if (ret != 0) return ret; + + cond_resched(); } } return 0;
From: Zheng Yejian zhengyejian1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I60N44 CVE: NA
--------------------------------
The misspelling of 'CONFIG_PREEMPTION' can cause the old function not to be checked, which may result in a function that is still running being livepatched.
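The reason the typo went unnoticed is that IS_ENABLED() on an unknown name is not a build error; a hedged kernel-style sketch (the function name is invented):

/*
 * Illustrative only: IS_ENABLED() on a symbol that is not a real Kconfig
 * option still compiles and simply evaluates to 0, so the misspelled check
 * silently skipped the whole-stack verification.
 */
#include <linux/kconfig.h>
#include <linux/types.h>

static bool must_check_whole_stack(void)
{
	if (IS_ENABLED(CONFIG_PREEMTION))	/* typo: never defined, always false */
		return true;

	return IS_ENABLED(CONFIG_PREEMPTION);	/* correct spelling */
}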
Fixes: 20106abf1e74 ("livepatch: Check whole stack when CONFIG_PREEMPT is set") Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/powerpc/kernel/livepatch_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/powerpc/kernel/livepatch_64.c b/arch/powerpc/kernel/livepatch_64.c index 416f9f03d747..a2ec7c8c1bad 100644 --- a/arch/powerpc/kernel/livepatch_64.c +++ b/arch/powerpc/kernel/livepatch_64.c @@ -174,7 +174,7 @@ static int klp_check_activeness_func(struct klp_patch *patch, int enable, * excution of instructions to be repalced is * complete. */ - if (IS_ENABLED(CONFIG_PREEMTION) || + if (IS_ENABLED(CONFIG_PREEMPTION) || (func->force == KLP_NORMAL_FORCE) || check_jump_insn(func_addr)) { ret = add_func_to_list(check_funcs, &pcheck,
From: Luo Meng luomeng12@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5WBID CVE: NA
--------------------------------
When dm_resume() and dm_destroy() run concurrently, a use-after-free (UAF) can occur.
One of the concurrency UAF can be shown as below:
use                                  | free
do_resume                            |
  __find_device_hash_cell            |
  dm_get                             |
    atomic_inc(&md->holders)         |
                                     | dm_destroy
                                     |   __dm_destroy
                                     |     if (!dm_suspended_md(md))
                                     |       atomic_read(&md->holders)
                                     |       msleep(1)
  dm_resume                          |
    __dm_resume                      |
      dm_table_resume_targets        |
        pool_resume                  |
          do_waker  # add delay work |
                                     |   dm_table_destroy
                                     |     pool_dtr
                                     |       __pool_dec
                                     |         __pool_destroy
                                     |           destroy_workqueue
                                     |           kfree(pool)  # free pool
time out                             |
__do_softirq                         |
  run_timer_softirq  # pool has already been freed
This can be easily reproduced using:
1. create thin-pool
2. dmsetup suspend pool
3. dmsetup resume pool
4. dmsetup remove_all  # concurrent with step 3
The root cause of the UAF is that dm_resume() adds the timer after dm_destroy() has skipped canceling it because of the suspended status. When the timer expires, run_timer_softirq() runs, but the pool has already been freed, so the concurrent UAF happens.
Therefore, move the timer cancellation to after md->holders has dropped to zero.
Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1fc745097405..335eef5c3c05 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2252,6 +2252,19 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
blk_set_queue_dying(md->queue);
+ /* + * Rare, but there may be I/O requests still going to complete, + * for example. Wait for all references to disappear. + * No one should increment the reference count of the mapped_device, + * after the mapped_device state becomes DMF_FREEING. + */ + if (wait) + while (atomic_read(&md->holders)) + msleep(1); + else if (atomic_read(&md->holders)) + DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", + dm_device_name(md), atomic_read(&md->holders)); + /* * Take suspend_lock so that presuspend and postsuspend methods * do not race with internal suspend. @@ -2268,19 +2281,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) dm_put_live_table(md, srcu_idx); mutex_unlock(&md->suspend_lock);
- /* - * Rare, but there may be I/O requests still going to complete, - * for example. Wait for all references to disappear. - * No one should increment the reference count of the mapped_device, - * after the mapped_device state becomes DMF_FREEING. - */ - if (wait) - while (atomic_read(&md->holders)) - msleep(1); - else if (atomic_read(&md->holders)) - DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", - dm_device_name(md), atomic_read(&md->holders)); - dm_sysfs_exit(md); dm_table_destroy(__unbind(md)); free_dev(md);
From: Hao Lan lanhao@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63AH1 CVE: NA
----------------------------------------------------------------------
Implement configuring and querying wake-on-LAN (WoL) via ethtool, and add the needed device commands and structures to hns3. Note that the suspend/resume interface is not supported.
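Once the ops are wired up, WoL can be exercised from user space through the standard ethtool interface. A minimal sketch using the ETHTOOL_GWOL ioctl follows; the interface name eth0 is an assumption, pass the real name as the first argument:

/* Illustrative only: query the WoL modes of a network interface. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(int argc, char **argv)
{
	const char *ifname = argc > 1 ? argv[1] : "eth0"; /* assumed name */
	struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
	struct ifreq ifr;
	int fd;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (char *)&wol;

	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		perror("SIOCETHTOOL/ETHTOOL_GWOL");
		close(fd);
		return 1;
	}

	printf("supported modes: 0x%x, enabled modes: 0x%x\n",
	       wol.supported, wol.wolopts);
	close(fd);
	return 0;
}

The same information is available from the ethtool CLI, e.g. 'ethtool <dev>' to show the supported and enabled wake modes and 'ethtool -s <dev> wol g' to enable magic-packet wake.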
Signed-off-by: Hao Lan lanhao@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 12 ++ .../hns3/hns3_common/hclge_comm_cmd.c | 1 + .../hns3/hns3_common/hclge_comm_cmd.h | 3 + .../ethernet/hisilicon/hns3/hns3_ethtool.c | 27 +++ .../hisilicon/hns3/hns3pf/hclge_cmd.h | 24 +++ .../hisilicon/hns3/hns3pf/hclge_main.c | 202 ++++++++++++++++++ .../hisilicon/hns3/hns3pf/hclge_main.h | 10 + 7 files changed, 279 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 779b32bd646d..91c2a826f2b3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -101,6 +101,7 @@ enum HNAE3_DEV_CAP_BITS { HNAE3_DEV_SUPPORT_MC_MAC_MNG_B, HNAE3_DEV_SUPPORT_CQ_B, HNAE3_DEV_SUPPORT_LANE_NUM_B, + HNAE3_DEV_SUPPORT_WOL_B, };
#define hnae3_ae_dev_fd_supported(ae_dev) \ @@ -166,6 +167,9 @@ enum HNAE3_DEV_CAP_BITS { #define hnae3_ae_dev_lane_num_supported(ae_dev) \ test_bit(HNAE3_DEV_SUPPORT_LANE_NUM_B, (ae_dev)->caps)
+#define hnae3_ae_dev_wol_supported(ae_dev) \ + test_bit(HNAE3_DEV_SUPPORT_WOL_B, (ae_dev)->caps) + enum HNAE3_PF_CAP_BITS { HNAE3_PF_SUPPORT_VLAN_FLTR_MDF_B = 0, }; @@ -569,6 +573,10 @@ struct hnae3_ae_dev { * Get phc info * clean_vf_config * Clean residual vf info after disable sriov + * get_wol + * Get wake on lan info + * set_wol + * Config wake on lan */ struct hnae3_ae_ops { int (*init_ae_dev)(struct hnae3_ae_dev *ae_dev); @@ -767,6 +775,10 @@ struct hnae3_ae_ops { void (*clean_vf_config)(struct hnae3_ae_dev *ae_dev, int num_vfs); int (*get_dscp_prio)(struct hnae3_handle *handle, u8 dscp, u8 *tc_map_mode, u8 *priority); + void (*get_wol)(struct hnae3_handle *handle, + struct ethtool_wolinfo *wol); + int (*set_wol)(struct hnae3_handle *handle, + struct ethtool_wolinfo *wol); };
struct hnae3_dcb_ops { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c index f1251890ef14..e27824340615 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c @@ -154,6 +154,7 @@ static const struct hclge_comm_caps_bit_map hclge_pf_cmd_caps[] = { {HCLGE_COMM_CAP_GRO_B, HNAE3_DEV_SUPPORT_GRO_B}, {HCLGE_COMM_CAP_FD_B, HNAE3_DEV_SUPPORT_FD_B}, {HCLGE_COMM_CAP_LANE_NUM_B, HNAE3_DEV_SUPPORT_LANE_NUM_B}, + {HCLGE_COMM_CAP_WOL_B, HNAE3_DEV_SUPPORT_WOL_B}, };
static const struct hclge_comm_caps_bit_map hclge_vf_cmd_caps[] = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h index ec1cb010d0ac..f74dc9e674c0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h @@ -293,6 +293,8 @@ enum hclge_opcode_type { HCLGE_PPP_CMD0_INT_CMD = 0x2100, HCLGE_PPP_CMD1_INT_CMD = 0x2101, HCLGE_MAC_ETHERTYPE_IDX_RD = 0x2105, + HCLGE_OPC_WOL_CFG = 0x2200, + HCLGE_OPC_WOL_GET_SUPPORTED_MODE = 0x2201, HCLGE_NCSI_INT_EN = 0x2401,
/* ROH MAC commands */ @@ -343,6 +345,7 @@ enum HCLGE_COMM_CAP_BITS { HCLGE_COMM_CAP_GRO_B = 20, HCLGE_COMM_CAP_FD_B = 21, HCLGE_COMM_CAP_LANE_NUM_B = 27, + HCLGE_COMM_CAP_WOL_B = 28, };
enum HCLGE_COMM_API_CAP_BITS { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 46b20650d09a..6db505abc2d7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -2043,6 +2043,31 @@ static int hns3_get_link_ext_state(struct net_device *netdev, return -ENODATA; }
+static void hns3_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + struct hnae3_handle *handle = hns3_get_handle(netdev); + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (!hnae3_ae_dev_wol_supported(ae_dev) || !ops->get_wol) + return; + + ops->get_wol(handle, wol); +} + +static int hns3_set_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct hnae3_handle *handle = hns3_get_handle(netdev); + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (!hnae3_ae_dev_wol_supported(ae_dev) || !ops->set_wol) + return -EOPNOTSUPP; + + return ops->set_wol(handle, wol); +} + static const struct ethtool_ops hns3vf_ethtool_ops = { .supported_coalesce_params = HNS3_ETHTOOL_COALESCE, .supported_ring_params = HNS3_ETHTOOL_RING, @@ -2117,6 +2142,8 @@ static const struct ethtool_ops hns3_ethtool_ops = { .set_tunable = hns3_set_tunable, .reset = hns3_set_reset, .get_link_ext_state = hns3_get_link_ext_state, + .get_wol = hns3_get_wol, + .set_wol = hns3_set_wol, };
void hns3_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index ce0e72862257..672ab4d3a9a0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -884,6 +884,30 @@ struct hclge_phy_reg_cmd { u8 rsv1[18]; };
+enum HCLGE_WOL_MODE { + HCLGE_WOL_PHY = BIT(0), + HCLGE_WOL_UNICAST = BIT(1), + HCLGE_WOL_MULTICAST = BIT(2), + HCLGE_WOL_BROADCAST = BIT(3), + HCLGE_WOL_ARP = BIT(4), + HCLGE_WOL_MAGIC = BIT(5), + HCLGE_WOL_MAGICSECURED = BIT(6), + HCLGE_WOL_FILTER = BIT(7), + HCLGE_WOL_DISABLE = 0, +}; + +struct hclge_wol_cfg_cmd { + __le32 wake_on_lan_mode; + u8 sopass[SOPASS_MAX]; + u8 sopass_size; + u8 rsv[13]; +}; + +struct hclge_query_wol_supported_cmd { + __le32 supported_wake_mode; + u8 rsv[20]; +}; + struct hclge_hw; int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num); enum hclge_comm_cmd_status hclge_cmd_mdio_write(struct hclge_hw *hw, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index a10a3a746742..98538d8f96f4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -12024,6 +12024,199 @@ static void hclge_uninit_rxd_adv_layout(struct hclge_dev *hdev) hclge_write_dev(&hdev->hw, HCLGE_RXD_ADV_LAYOUT_EN_REG, 0); }
+static __u32 hclge_wol_mode_to_ethtool(u32 mode) +{ + __u32 ret = 0; + + if (mode & HCLGE_WOL_PHY) + ret |= WAKE_PHY; + + if (mode & HCLGE_WOL_UNICAST) + ret |= WAKE_UCAST; + + if (mode & HCLGE_WOL_MULTICAST) + ret |= WAKE_MCAST; + + if (mode & HCLGE_WOL_BROADCAST) + ret |= WAKE_BCAST; + + if (mode & HCLGE_WOL_ARP) + ret |= WAKE_ARP; + + if (mode & HCLGE_WOL_MAGIC) + ret |= WAKE_MAGIC; + + if (mode & HCLGE_WOL_MAGICSECURED) + ret |= WAKE_MAGICSECURE; + + if (mode & HCLGE_WOL_FILTER) + ret |= WAKE_FILTER; + + return ret; +} + +static u32 hclge_wol_mode_from_ethtool(__u32 mode) +{ + u32 ret = HCLGE_WOL_DISABLE; + + if (mode & WAKE_PHY) + ret |= HCLGE_WOL_PHY; + + if (mode & WAKE_UCAST) + ret |= HCLGE_WOL_UNICAST; + + if (mode & WAKE_MCAST) + ret |= HCLGE_WOL_MULTICAST; + + if (mode & WAKE_BCAST) + ret |= HCLGE_WOL_BROADCAST; + + if (mode & WAKE_ARP) + ret |= HCLGE_WOL_ARP; + + if (mode & WAKE_MAGIC) + ret |= HCLGE_WOL_MAGIC; + + if (mode & WAKE_MAGICSECURE) + ret |= HCLGE_WOL_MAGICSECURED; + + if (mode & WAKE_FILTER) + ret |= HCLGE_WOL_FILTER; + + return ret; +} + +int hclge_get_wol_supported_mode(struct hclge_dev *hdev, u32 *wol_supported) +{ + struct hclge_query_wol_supported_cmd *wol_supported_cmd; + struct hclge_desc desc; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_WOL_GET_SUPPORTED_MODE, + true); + wol_supported_cmd = (struct hclge_query_wol_supported_cmd *)&desc.data; + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to query wol supported, ret = %d\n", ret); + return ret; + } + + *wol_supported = le32_to_cpu(wol_supported_cmd->supported_wake_mode); + + return 0; +} + +int hclge_get_wol_cfg(struct hclge_dev *hdev, u32 *mode) +{ + struct hclge_wol_cfg_cmd *wol_cfg_cmd; + struct hclge_desc desc; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_WOL_CFG, true); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get wol config, ret = %d\n", ret); + return ret; + } + + wol_cfg_cmd = (struct hclge_wol_cfg_cmd *)&desc.data; + *mode = le32_to_cpu(wol_cfg_cmd->wake_on_lan_mode); + + return 0; +} + +static int hclge_set_wol_cfg(struct hclge_dev *hdev, + struct hclge_wol_info *wol_info) +{ + struct hclge_wol_cfg_cmd *wol_cfg_cmd; + struct hclge_desc desc; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_WOL_CFG, false); + wol_cfg_cmd = (struct hclge_wol_cfg_cmd *)&desc.data; + wol_cfg_cmd->wake_on_lan_mode = cpu_to_le32(wol_info->wol_current_mode); + wol_cfg_cmd->sopass_size = wol_info->wol_sopass_size; + memcpy(&wol_cfg_cmd->sopass, wol_info->wol_sopass, SOPASS_MAX); + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to set wol config, ret = %d\n", ret); + + return ret; +} + +static int hclge_update_wol(struct hclge_dev *hdev) +{ + struct hclge_wol_info *wol_info = &hdev->hw.mac.wol; + + if (!hnae3_ae_dev_wol_supported(hdev->ae_dev)) + return 0; + + return hclge_set_wol_cfg(hdev, wol_info); +} + +static int hclge_init_wol(struct hclge_dev *hdev) +{ + struct hclge_wol_info *wol_info = &hdev->hw.mac.wol; + int ret; + + if (!hnae3_ae_dev_wol_supported(hdev->ae_dev)) + return 0; + + memset(wol_info, 0, sizeof(struct hclge_wol_info)); + ret = hclge_get_wol_supported_mode(hdev, + &wol_info->wol_support_mode); + if (ret) { + wol_info->wol_support_mode = HCLGE_WOL_DISABLE; + return ret; + } + + return hclge_update_wol(hdev); +} + +static void hclge_get_wol(struct hnae3_handle *handle, + struct ethtool_wolinfo *wol) +{ + struct 
hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + struct hclge_wol_info *wol_info = &hdev->hw.mac.wol; + + wol->supported = hclge_wol_mode_to_ethtool(wol_info->wol_support_mode); + wol->wolopts = + hclge_wol_mode_to_ethtool(wol_info->wol_current_mode); + if (wol_info->wol_current_mode & HCLGE_WOL_MAGICSECURED) + memcpy(&wol->sopass, wol_info->wol_sopass, SOPASS_MAX); +} + +static int hclge_set_wol(struct hnae3_handle *handle, + struct ethtool_wolinfo *wol) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + struct hclge_wol_info *wol_info = &hdev->hw.mac.wol; + u32 wol_supported; + u32 wol_mode; + + wol_supported = hclge_wol_mode_from_ethtool(wol->supported); + wol_mode = hclge_wol_mode_from_ethtool(wol->wolopts); + if (wol_mode & ~wol_supported) + return -EINVAL; + + wol_info->wol_current_mode = wol_mode; + if (wol_mode & HCLGE_WOL_MAGICSECURED) { + memcpy(wol_info->wol_sopass, &wol->sopass, SOPASS_MAX); + wol_info->wol_sopass_size = SOPASS_MAX; + } else { + wol_info->wol_sopass_size = 0; + } + + return hclge_set_wol_cfg(hdev, wol_info); +} + static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) { struct pci_dev *pdev = ae_dev->pdev; @@ -12223,6 +12416,11 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) /* Enable MISC vector(vector0) */ hclge_enable_vector(&hdev->misc_vector, true);
+ ret = hclge_init_wol(hdev); + if (ret) + dev_warn(&pdev->dev, + "failed to wake on lan init, ret = %d\n", ret); + hclge_state_init(hdev); hdev->last_reset_time = jiffies;
@@ -12605,6 +12803,8 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
hclge_init_rxd_adv_layout(hdev);
+ (void)hclge_update_wol(hdev); + dev_info(&pdev->dev, "Reset done, %s driver initialization finished.\n", HCLGE_DRIVER_NAME);
@@ -13640,6 +13840,8 @@ static const struct hnae3_ae_ops hclge_ops = { .get_link_diagnosis_info = hclge_get_link_diagnosis_info, .clean_vf_config = hclge_clean_vport_config, .get_dscp_prio = hclge_get_dscp_prio, + .get_wol = hclge_get_wol, + .set_wol = hclge_set_wol, };
static struct hnae3_ae_algo ae_algo = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index e5d786adafc3..9fdffc3d18b2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -250,6 +250,13 @@ enum HCLGE_MAC_DUPLEX { #define QUERY_SFP_SPEED 0 #define QUERY_ACTIVE_SPEED 1
+struct hclge_wol_info { + u32 wol_support_mode; /* store the wake on lan info */ + u32 wol_current_mode; + u8 wol_sopass[SOPASS_MAX]; + u8 wol_sopass_size; +}; + struct hclge_mac { u8 mac_id; u8 phy_addr; @@ -269,6 +276,7 @@ struct hclge_mac { u32 user_fec_mode; u32 fec_ability; int link; /* store the link status of mac & phy (if phy exists) */ + struct hclge_wol_info wol; struct phy_device *phydev; struct mii_bus *mdio_bus; phy_interface_t phy_if; @@ -1148,4 +1156,6 @@ int hclge_register_sysfs(struct hclge_dev *hdev); void hclge_unregister_sysfs(struct hclge_dev *hdev); int hclge_cfg_mac_speed_dup_hw(struct hclge_dev *hdev, int speed, u8 duplex, u8 lane_num); +int hclge_get_wol_supported_mode(struct hclge_dev *hdev, u32 *wol_supported); +int hclge_get_wol_cfg(struct hclge_dev *hdev, u32 *mode); #endif
From: Hao Lan lanhao@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63AH1 CVE: NA
----------------------------------------------------------------------
Implement a debugfs interface for wake on lan in hns3. The debugfs file allows verifying the firmware's wake on lan configuration.
Signed-off-by: Hao Lan lanhao@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 + .../ethernet/hisilicon/hns3/hns3_debugfs.c | 10 +++ .../hisilicon/hns3/hns3pf/hclge_debugfs.c | 62 +++++++++++++++++++ 3 files changed, 73 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 91c2a826f2b3..8bda1649de32 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -326,6 +326,7 @@ enum hnae3_dbg_cmd { HNAE3_DBG_CMD_UMV_INFO, HNAE3_DBG_CMD_PAGE_POOL_INFO, HNAE3_DBG_CMD_COAL_INFO, + HNAE3_DBG_CMD_WOL_INFO, HNAE3_DBG_CMD_UNKNOWN, };
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index a31b10748ac1..62250669b9e8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -357,6 +357,13 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = { .buf_len = HNS3_DBG_READ_LEN_1MB, .init = hns3_dbg_common_file_init, }, + { + .name = "wol_info", + .cmd = HNAE3_DBG_CMD_WOL_INFO, + .dentry = HNS3_DBG_DENTRY_COMMON, + .buf_len = HNS3_DBG_READ_LEN, + .init = hns3_dbg_common_file_init, + }, };
static struct hns3_dbg_cap_info hns3_dbg_cap[] = { @@ -402,6 +409,9 @@ static struct hns3_dbg_cap_info hns3_dbg_cap[] = { }, { .name = "support modify vlan filter state", .cap_bit = HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, + }, { + .name = "support wake on lan", + .cap_bit = HNAE3_DEV_SUPPORT_WOL_B, } };
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 0b7e8b4c7571..1a69638f1ba6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -2495,6 +2495,64 @@ static int hclge_dbg_dump_mac_mc(struct hclge_dev *hdev, char *buf, int len) return 0; }
+static void hclge_dump_wol_mode(u32 mode, char *buf, int len, int *pos) +{ + if (mode & HCLGE_WOL_PHY) + *pos += scnprintf(buf + *pos, len - *pos, " [p]phy\n"); + + if (mode & HCLGE_WOL_UNICAST) + *pos += scnprintf(buf + *pos, len - *pos, " [u]unicast\n"); + + if (mode & HCLGE_WOL_MULTICAST) + *pos += scnprintf(buf + *pos, len - *pos, " [m]multicast\n"); + + if (mode & HCLGE_WOL_BROADCAST) + *pos += scnprintf(buf + *pos, len - *pos, " [b]broadcast\n"); + + if (mode & HCLGE_WOL_ARP) + *pos += scnprintf(buf + *pos, len - *pos, " [a]arp\n"); + + if (mode & HCLGE_WOL_MAGIC) + *pos += scnprintf(buf + *pos, len - *pos, " [g]magic\n"); + + if (mode & HCLGE_WOL_MAGICSECURED) + *pos += scnprintf(buf + *pos, len - *pos, + " [s]magic secured\n"); + + if (mode & HCLGE_WOL_FILTER) + *pos += scnprintf(buf + *pos, len - *pos, " [f]filter\n"); +} + +static int hclge_dbg_dump_wol_info(struct hclge_dev *hdev, char *buf, int len) +{ + u32 wol_supported; + int pos = 0; + u32 mode; + + if (!hnae3_ae_dev_wol_supported(hdev->ae_dev)) { + pos += scnprintf(buf + pos, len - pos, + "wake-on-lan is unsupported\n"); + return 0; + } + + pos += scnprintf(buf + pos, len - pos, "wake-on-lan mode:\n"); + pos += scnprintf(buf + pos, len - pos, " supported:\n"); + if (hclge_get_wol_supported_mode(hdev, &wol_supported)) + return -EINVAL; + + hclge_dump_wol_mode(wol_supported, buf, len, &pos); + + pos += scnprintf(buf + pos, len - pos, " current:\n"); + if (hclge_get_wol_cfg(hdev, &mode)) + return -EINVAL; + if (mode) + hclge_dump_wol_mode(mode, buf, len, &pos); + else + pos += scnprintf(buf + pos, len - pos, " [d]disabled\n"); + + return 0; +} + static const struct hclge_dbg_func hclge_dbg_cmd_func[] = { { .cmd = HNAE3_DBG_CMD_TM_NODES, @@ -2644,6 +2702,10 @@ static const struct hclge_dbg_func hclge_dbg_cmd_func[] = { .cmd = HNAE3_DBG_CMD_UMV_INFO, .dbg_dump = hclge_dbg_dump_umv_info, }, + { + .cmd = HNAE3_DBG_CMD_WOL_INFO, + .dbg_dump = hclge_dbg_dump_wol_info, + }, };
int hclge_dbg_read_cmd(struct hnae3_handle *handle, enum hnae3_dbg_cmd cmd,
From: Jie Wang wangjie125@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63AH1 CVE: NA
----------------------------------------------------------------------
Currently the hns3 driver is designed to support the VF fault detect feature on new hardware. For code compatibility, add the VF fault detect capability bit to the driver.
Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 4 ++++ .../net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c | 1 + .../net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h | 1 + drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 3 +++ 4 files changed, 9 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 8bda1649de32..e0222667f3f2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -102,6 +102,7 @@ enum HNAE3_DEV_CAP_BITS { HNAE3_DEV_SUPPORT_CQ_B, HNAE3_DEV_SUPPORT_LANE_NUM_B, HNAE3_DEV_SUPPORT_WOL_B, + HNAE3_DEV_SUPPORT_VF_FAULT_B, };
#define hnae3_ae_dev_fd_supported(ae_dev) \ @@ -170,6 +171,9 @@ enum HNAE3_DEV_CAP_BITS { #define hnae3_ae_dev_wol_supported(ae_dev) \ test_bit(HNAE3_DEV_SUPPORT_WOL_B, (ae_dev)->caps)
+#define hnae3_ae_dev_vf_fault_supported(ae_dev) \ + test_bit(HNAE3_DEV_SUPPORT_VF_FAULT_B, (ae_dev)->caps) + enum HNAE3_PF_CAP_BITS { HNAE3_PF_SUPPORT_VLAN_FLTR_MDF_B = 0, }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c index e27824340615..9dacb74ff598 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.c @@ -155,6 +155,7 @@ static const struct hclge_comm_caps_bit_map hclge_pf_cmd_caps[] = { {HCLGE_COMM_CAP_FD_B, HNAE3_DEV_SUPPORT_FD_B}, {HCLGE_COMM_CAP_LANE_NUM_B, HNAE3_DEV_SUPPORT_LANE_NUM_B}, {HCLGE_COMM_CAP_WOL_B, HNAE3_DEV_SUPPORT_WOL_B}, + {HCLGE_COMM_CAP_VF_FAULT_B, HNAE3_DEV_SUPPORT_VF_FAULT_B}, };
static const struct hclge_comm_caps_bit_map hclge_vf_cmd_caps[] = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h index f74dc9e674c0..09813a1f6661 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h @@ -344,6 +344,7 @@ enum HCLGE_COMM_CAP_BITS { HCLGE_COMM_CAP_CQ_B = 18, HCLGE_COMM_CAP_GRO_B = 20, HCLGE_COMM_CAP_FD_B = 21, + HCLGE_COMM_CAP_VF_FAULT_B = 26, HCLGE_COMM_CAP_LANE_NUM_B = 27, HCLGE_COMM_CAP_WOL_B = 28, }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 62250669b9e8..f504e64917df 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -412,6 +412,9 @@ static struct hns3_dbg_cap_info hns3_dbg_cap[] = { }, { .name = "support wake on lan", .cap_bit = HNAE3_DEV_SUPPORT_WOL_B, + }, { + .name = "support vf fault detect", + .cap_bit = HNAE3_DEV_SUPPORT_VF_FAULT_B, } };
From: Jie Wang wangjie125@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63AH1 CVE: NA
----------------------------------------------------------------------
Currently the hns3 driver supports the VF fault detect feature. Several RAS errors caused by VF resources do not require a PF function reset for recovery; the driver only needs to reset the specified VF.
So this patch adds handling in the RAS module. The new handling obtains detailed information about the RAS error and takes the appropriate recovery action based on that information.
Signed-off-by: Jie Wang wangjie125@huawei.com Signed-off-by: Jiantao Xiao xiaojiantao1@h-partners.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 + .../hns3/hns3_common/hclge_comm_cmd.h | 1 + .../hisilicon/hns3/hns3pf/hclge_err.c | 113 +++++++++++++++++- .../hisilicon/hns3/hns3pf/hclge_err.h | 2 + .../hisilicon/hns3/hns3pf/hclge_main.c | 3 +- .../hisilicon/hns3/hns3pf/hclge_main.h | 1 + 6 files changed, 115 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index e0222667f3f2..388532bd6a35 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -274,6 +274,7 @@ enum hnae3_reset_type { HNAE3_GLOBAL_RESET, HNAE3_IMP_RESET, HNAE3_NONE_RESET, + HNAE3_VF_EXP_RESET, HNAE3_MAX_RESET, };
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h index 09813a1f6661..2fde935dcbbd 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h @@ -92,6 +92,7 @@ enum hclge_opcode_type { HCLGE_OPC_DFX_SSU_REG_2 = 0x004F,
HCLGE_OPC_QUERY_DEV_SPECS = 0x0050, + HCLGE_OPC_GET_QUEUE_ERR_VF = 0x0067,
/* MAC command */ HCLGE_OPC_CONFIG_MAC_MODE = 0x0301, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index 869b529f68e3..8b058a7de5bf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -1308,10 +1308,12 @@ static const struct hclge_hw_type_id hclge_hw_type_id_st[] = { .msg = "tqp_int_ecc_error" }, { .type_id = PF_ABNORMAL_INT_ERROR, - .msg = "pf_abnormal_int_error" + .msg = "pf_abnormal_int_error", + .cause_by_vf = true }, { .type_id = MPF_ABNORMAL_INT_ERROR, - .msg = "mpf_abnormal_int_error" + .msg = "mpf_abnormal_int_error", + .cause_by_vf = true }, { .type_id = COMMON_ERROR, .msg = "common_error" @@ -2769,7 +2771,7 @@ void hclge_handle_occurred_error(struct hclge_dev *hdev) hclge_handle_error_info_log(ae_dev); }
-static void +static bool hclge_handle_error_type_reg_log(struct device *dev, struct hclge_mod_err_info *mod_info, struct hclge_type_reg_err_info *type_reg_info) @@ -2780,6 +2782,7 @@ hclge_handle_error_type_reg_log(struct device *dev, u8 mod_id, total_module, type_id, total_type, i, is_ras; u8 index_module = MODULE_NONE; u8 index_type = NONE_ERROR; + bool cause_by_vf = false;
mod_id = mod_info->mod_id; type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK; @@ -2798,6 +2801,7 @@ hclge_handle_error_type_reg_log(struct device *dev, for (i = 0; i < total_type; i++) { if (type_id == hclge_hw_type_id_st[i].type_id) { index_type = i; + cause_by_vf = hclge_hw_type_id_st[i].cause_by_vf; break; } } @@ -2815,6 +2819,8 @@ hclge_handle_error_type_reg_log(struct device *dev, dev_err(dev, "reg_value:\n"); for (i = 0; i < type_reg_info->reg_num; i++) dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[i]); + + return cause_by_vf; }
static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev, @@ -2825,6 +2831,7 @@ static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev, struct device *dev = &hdev->pdev->dev; struct hclge_mod_err_info *mod_info; struct hclge_sum_err_info *sum_info; + bool cause_by_vf = false; u8 mod_num, err_num, i; u32 offset = 0;
@@ -2853,12 +2860,16 @@ static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
type_reg_info = (struct hclge_type_reg_err_info *) &buf[offset++]; - hclge_handle_error_type_reg_log(dev, mod_info, - type_reg_info); + if (hclge_handle_error_type_reg_log(dev, mod_info, + type_reg_info)) + cause_by_vf = true;
offset += type_reg_info->reg_num; } } + + if (hnae3_ae_dev_vf_fault_supported(hdev->ae_dev) && cause_by_vf) + set_bit(HNAE3_VF_EXP_RESET, &ae_dev->hw_err_reset_req); }
static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num) @@ -2950,3 +2961,95 @@ int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev) out: return ret; } + +static bool hclge_reset_vf_in_bitmap(struct hclge_dev *hdev, + unsigned long *bitmap) +{ + struct hclge_vport *vport; + bool exist_set = false; + int func_id; + int ret; + + func_id = find_first_bit(bitmap, HCLGE_VPORT_NUM); + if (func_id == PF_VPORT_ID) + return false; + + while (func_id != HCLGE_VPORT_NUM) { + vport = hclge_get_vf_vport(hdev, + func_id - HCLGE_VF_VPORT_START_NUM); + if (!vport) { + dev_err(&hdev->pdev->dev, "invalid func id(%d)\n", + func_id); + return false; + } + + dev_info(&hdev->pdev->dev, "do function %d recovery.", func_id); + + ret = hclge_reset_tqp(&vport->nic); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to reset tqp, ret = %d.", ret); + return false; + } + + ret = hclge_func_reset_cmd(hdev, func_id); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to reset func %d, ret = %d.", + func_id, ret); + return false; + } + + exist_set = true; + clear_bit(func_id, bitmap); + func_id = find_first_bit(bitmap, HCLGE_VPORT_NUM); + } + + return exist_set; +} + +static void hclge_get_vf_fault_bitmap(struct hclge_desc *desc, + unsigned long *bitmap) +{ +#define HCLGE_FIR_FAULT_BYTES 24 +#define HCLGE_SEC_FAULT_BYTES 8 + + u8 *buff; + + memcpy(bitmap, desc[0].data, HCLGE_FIR_FAULT_BYTES); + buff = (u8 *)bitmap + HCLGE_FIR_FAULT_BYTES; + memcpy(buff, desc[1].data, HCLGE_SEC_FAULT_BYTES); +} + +int hclge_handle_vf_queue_err_ras(struct hclge_dev *hdev) +{ + unsigned long vf_fault_bitmap[BITS_TO_LONGS(HCLGE_VPORT_NUM)]; + struct hclge_desc desc[2]; + bool cause_by_vf = false; + int ret; + + if (!hnae3_ae_dev_vf_fault_supported(hdev->ae_dev) || + !test_and_clear_bit(HNAE3_VF_EXP_RESET, + &hdev->ae_dev->hw_err_reset_req)) + return 0; + + hclge_comm_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_GET_QUEUE_ERR_VF, + true); + desc[0].flag |= cpu_to_le16(HCLGE_COMM_CMD_FLAG_NEXT); + hclge_comm_cmd_setup_basic_desc(&desc[1], HCLGE_OPC_GET_QUEUE_ERR_VF, + true); + + ret = hclge_comm_cmd_send(&hdev->hw.hw, desc, 2); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get vf bitmap, ret = %d.\n", ret); + return ret; + } + hclge_get_vf_fault_bitmap(desc, vf_fault_bitmap); + + cause_by_vf = hclge_reset_vf_in_bitmap(hdev, vf_fault_bitmap); + if (cause_by_vf) + hdev->ae_dev->hw_err_reset_req = 0; + + return 0; +} diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index bbc67be31cf6..6d66483e17c2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -204,6 +204,7 @@ struct hclge_hw_module_id { struct hclge_hw_type_id { enum hclge_err_type_list type_id; const char *msg; + bool cause_by_vf; /* indicate the error may from vf exception */ };
struct hclge_sum_err_info { @@ -236,4 +237,5 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev, unsigned long *reset_requests); int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev); int hclge_handle_mac_tnl(struct hclge_dev *hdev); +int hclge_handle_vf_queue_err_ras(struct hclge_dev *hdev); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 98538d8f96f4..0eb66ddf4c4b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3407,7 +3407,7 @@ static int hclge_get_status(struct hnae3_handle *handle) return hdev->hw.mac.link; }
-static struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf) +struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf) { if (!pci_num_vf(hdev->pdev)) { dev_err(&hdev->pdev->dev, @@ -4488,6 +4488,7 @@ static void hclge_handle_err_recovery(struct hclge_dev *hdev) if (hclge_find_error_source(hdev)) { hclge_handle_error_info_log(ae_dev); hclge_handle_mac_tnl(hdev); + hclge_handle_vf_queue_err_ras(hdev); }
hclge_handle_err_reset_request(hdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 9fdffc3d18b2..6c807e89dd4b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -1158,4 +1158,5 @@ int hclge_cfg_mac_speed_dup_hw(struct hclge_dev *hdev, int speed, u8 duplex, u8 lane_num); int hclge_get_wol_supported_mode(struct hclge_dev *hdev, u32 *wol_supported); int hclge_get_wol_cfg(struct hclge_dev *hdev, u32 *mode); +struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf); #endif
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
The hip09 introduces the DCA (Dynamic Context Attachment) feature, which allows many RC QPs to share WQE buffers in a memory pool. This reduces memory consumption when a large number of QPs are inactive.
If a QP enables the DCA feature, its WQE buffer is not allocated at creation time. When the user starts to post WRs, the hns driver allocates a buffer from the memory pool and fills it with WQEs tagged with this QP's number.
The hns ROCEE stops accessing the WQE buffer once the user has polled all of the CQEs for a DCA QP; the driver then recycles the WQE buffer back to the memory pool.
This patch adds a group of methods that let user space register buffers to a memory pool owned by the user context. The hns kernel driver updates the page states in this pool when the user calls the post/poll methods, and the user driver can obtain a QP's WQE buffer address from the key and offset queried from the kernel.
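A minimal sketch of the registration path added here: the outer function is hypothetical, alloc_dca_mem()/register_dca_mem()/to_hr_dca_ctx() are the helpers from the diff below, and error handling is trimmed.

static int dca_mem_reg_outline(struct hns_roce_dev *hr_dev,
			       struct hns_roce_ucontext *uctx,
			       struct dca_mem_attr *attr)
{
	struct dca_mem *mem;

	/* Reuse a free slot in the context's pool or allocate a new one. */
	mem = alloc_dca_mem(to_hr_dca_ctx(uctx));
	if (!mem)
		return -ENOMEM;

	/* Pin the user pages, build the per-page state array and account
	 * the new memory into the pool's free/total size counters.
	 */
	return register_dca_mem(hr_dev, uctx, mem, attr);
}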
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/Makefile | 2 +- drivers/infiniband/hw/hns/hns_roce_dca.c | 354 ++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_dca.h | 26 ++ drivers/infiniband/hw/hns/hns_roce_device.h | 10 + drivers/infiniband/hw/hns/hns_roce_main.c | 40 ++- include/uapi/rdma/hns-abi.h | 25 ++ 6 files changed, 449 insertions(+), 8 deletions(-) create mode 100644 drivers/infiniband/hw/hns/hns_roce_dca.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_dca.h
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index 8ffbf009b948..a55bcceeef98 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -8,7 +8,7 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_bond.o + hns_roce_bond.o hns_roce_dca.o
ifdef CONFIG_INFINIBAND_HNS_HIP08 hns-roce-hw-v2-objs := hns_roce_hw_v2.o $(hns-roce-objs) diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c new file mode 100644 index 000000000000..4da7d762333f --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2022 Hisilicon Limited. All rights reserved. + */ + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/uverbs_std_types.h> +#include <rdma/ib_umem.h> +#include "hns_roce_device.h" +#include "hns_roce_dca.h" + +#define UVERBS_MODULE_NAME hns_ib +#include <rdma/uverbs_named_ioctl.h> + +/* DCA memory */ +struct dca_mem { +#define DCA_MEM_FLAGS_ALLOCED BIT(0) +#define DCA_MEM_FLAGS_REGISTERED BIT(1) + u32 flags; + struct list_head list; /* link to mem list in dca context */ + spinlock_t lock; /* protect the @flags and @list */ + int page_count; /* page count in this mem obj */ + u64 key; /* register by caller */ + u32 size; /* bytes in this mem object */ + struct hns_dca_page_state *states; /* record each page's state */ + void *pages; /* memory handle for getting dma address */ +}; + +struct dca_mem_attr { + u64 key; + u64 addr; + u32 size; +}; + +static void *alloc_dca_pages(struct hns_roce_dev *hr_dev, struct dca_mem *mem, + struct dca_mem_attr *attr) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + struct ib_umem *umem; + + umem = ib_umem_get(ibdev, attr->addr, attr->size, 0); + if (IS_ERR(umem)) { + ibdev_err(ibdev, "failed to get uDCA pages, ret = %ld.\n", + PTR_ERR(umem)); + return NULL; + } + + mem->page_count = ib_umem_num_dma_blocks(umem, HNS_HW_PAGE_SIZE); + + return umem; +} + +static void init_dca_umem_states(struct hns_dca_page_state *states, int count, + struct ib_umem *umem) +{ + struct ib_block_iter biter; + dma_addr_t cur_addr; + dma_addr_t pre_addr; + int i = 0; + + pre_addr = 0; + rdma_for_each_block(umem->sg_head.sgl, &biter, + umem->sg_head.nents, HNS_HW_PAGE_SIZE) { + cur_addr = rdma_block_iter_dma_address(&biter); + if (i < count) { + if (cur_addr - pre_addr != HNS_HW_PAGE_SIZE) + states[i].head = 1; + } + + pre_addr = cur_addr; + i++; + } +} + +static struct hns_dca_page_state *alloc_dca_states(void *pages, int count) +{ + struct hns_dca_page_state *states; + + states = kcalloc(count, sizeof(*states), GFP_KERNEL); + if (!states) + return NULL; + + init_dca_umem_states(states, count, pages); + + return states; +} + +/* user DCA is managed by ucontext */ +static inline struct hns_roce_dca_ctx * +to_hr_dca_ctx(struct hns_roce_ucontext *uctx) +{ + return &uctx->dca_ctx; +} + +static void unregister_dca_mem(struct hns_roce_ucontext *uctx, + struct dca_mem *mem) +{ + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx); + unsigned long flags; + void *states, *pages; + + spin_lock_irqsave(&ctx->pool_lock, flags); + + spin_lock(&mem->lock); + mem->flags &= ~DCA_MEM_FLAGS_REGISTERED; + mem->page_count = 0; + pages = mem->pages; + mem->pages = NULL; + states = mem->states; + mem->states = NULL; + spin_unlock(&mem->lock); + + ctx->free_mems--; + ctx->free_size -= mem->size; + + ctx->total_size -= mem->size; + spin_unlock_irqrestore(&ctx->pool_lock, flags); + + kfree(states); + ib_umem_release(pages); +} + +static int register_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx, + struct dca_mem *mem, struct dca_mem_attr *attr) +{ + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx); + 
void *states, *pages; + unsigned long flags; + + pages = alloc_dca_pages(hr_dev, mem, attr); + if (!pages) + return -ENOMEM; + + states = alloc_dca_states(pages, mem->page_count); + if (!states) { + ib_umem_release(pages); + return -ENOMEM; + } + + spin_lock_irqsave(&ctx->pool_lock, flags); + + spin_lock(&mem->lock); + mem->pages = pages; + mem->states = states; + mem->key = attr->key; + mem->size = attr->size; + mem->flags |= DCA_MEM_FLAGS_REGISTERED; + spin_unlock(&mem->lock); + + ctx->free_mems++; + ctx->free_size += attr->size; + ctx->total_size += attr->size; + spin_unlock_irqrestore(&ctx->pool_lock, flags); + + return 0; +} + +static void init_dca_context(struct hns_roce_dca_ctx *ctx) +{ + INIT_LIST_HEAD(&ctx->pool); + spin_lock_init(&ctx->pool_lock); + ctx->total_size = 0; +} + +static void cleanup_dca_context(struct hns_roce_dev *hr_dev, + struct hns_roce_dca_ctx *ctx) +{ + struct dca_mem *mem, *tmp; + unsigned long flags; + + spin_lock_irqsave(&ctx->pool_lock, flags); + list_for_each_entry_safe(mem, tmp, &ctx->pool, list) { + list_del(&mem->list); + mem->flags = 0; + spin_unlock_irqrestore(&ctx->pool_lock, flags); + + kfree(mem->states); + ib_umem_release(mem->pages); + kfree(mem); + + spin_lock_irqsave(&ctx->pool_lock, flags); + } + ctx->total_size = 0; + spin_unlock_irqrestore(&ctx->pool_lock, flags); +} + +void hns_roce_register_udca(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx) +{ + if (!(uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) + return; + + init_dca_context(&uctx->dca_ctx); +} + +void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx) +{ + if (!(uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) + return; + + cleanup_dca_context(hr_dev, &uctx->dca_ctx); +} + +static struct dca_mem *alloc_dca_mem(struct hns_roce_dca_ctx *ctx) +{ + struct dca_mem *mem, *tmp, *found = NULL; + unsigned long flags; + + spin_lock_irqsave(&ctx->pool_lock, flags); + list_for_each_entry_safe(mem, tmp, &ctx->pool, list) { + spin_lock(&mem->lock); + if (!mem->flags) { + found = mem; + mem->flags |= DCA_MEM_FLAGS_ALLOCED; + spin_unlock(&mem->lock); + break; + } + spin_unlock(&mem->lock); + } + spin_unlock_irqrestore(&ctx->pool_lock, flags); + + if (found) + return found; + + mem = kzalloc(sizeof(*mem), GFP_NOWAIT); + if (!mem) + return NULL; + + spin_lock_init(&mem->lock); + INIT_LIST_HEAD(&mem->list); + + mem->flags |= DCA_MEM_FLAGS_ALLOCED; + + spin_lock_irqsave(&ctx->pool_lock, flags); + list_add(&mem->list, &ctx->pool); + spin_unlock_irqrestore(&ctx->pool_lock, flags); + + return mem; +} + +static void free_dca_mem(struct dca_mem *mem) +{ + /* We cannot hold the whole pool's lock during the DCA is working + * until cleanup the context in cleanup_dca_context(), so we just + * set the DCA mem state as free when destroying DCA mem object. 
+ */ + spin_lock(&mem->lock); + mem->flags = 0; + spin_unlock(&mem->lock); +} + +static inline struct hns_roce_ucontext * +uverbs_attr_to_hr_uctx(struct uverbs_attr_bundle *attrs) +{ + return rdma_udata_to_drv_context(&attrs->driver_udata, + struct hns_roce_ucontext, ibucontext); +} + +static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_REG)( + struct uverbs_attr_bundle *attrs) +{ + struct hns_roce_ucontext *uctx = uverbs_attr_to_hr_uctx(attrs); + struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device); + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, HNS_IB_ATTR_DCA_MEM_REG_HANDLE); + struct dca_mem_attr init_attr = {}; + struct dca_mem *mem; + int ret; + + ret = uverbs_copy_from(&init_attr.addr, attrs, + HNS_IB_ATTR_DCA_MEM_REG_ADDR); + if (!ret) + ret = uverbs_copy_from(&init_attr.size, attrs, + HNS_IB_ATTR_DCA_MEM_REG_LEN); + if (!ret) + ret = uverbs_copy_from(&init_attr.key, attrs, + HNS_IB_ATTR_DCA_MEM_REG_KEY); + if (ret) + return ret; + + mem = alloc_dca_mem(to_hr_dca_ctx(uctx)); + if (!mem) + return -ENOMEM; + + ret = register_dca_mem(hr_dev, uctx, mem, &init_attr); + if (ret) { + free_dca_mem(mem); + return ret; + } + + uobj->object = mem; + + return 0; +} + +static int dca_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct hns_roce_ucontext *uctx = uverbs_attr_to_hr_uctx(attrs); + struct dca_mem *mem; + + /* One DCA MEM maybe shared by many QPs, so the DCA mem uobject must + * be destroyed before all QP uobjects, and we will destroy the DCA + * uobjects when cleanup DCA context by calling hns_roce_cleanup_dca(). + */ + if (why == RDMA_REMOVE_CLOSE || why == RDMA_REMOVE_DRIVER_REMOVE) + return 0; + + mem = uobject->object; + unregister_dca_mem(uctx, mem); + free_dca_mem(mem); + + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD( + HNS_IB_METHOD_DCA_MEM_REG, + UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_REG_HANDLE, HNS_IB_OBJECT_DCA_MEM, + UVERBS_ACCESS_NEW, UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_REG_LEN, UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_REG_ADDR, UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_REG_KEY, UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + HNS_IB_METHOD_DCA_MEM_DEREG, + UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_DEREG_HANDLE, HNS_IB_OBJECT_DCA_MEM, + UVERBS_ACCESS_DESTROY, UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(HNS_IB_OBJECT_DCA_MEM, + UVERBS_TYPE_ALLOC_IDR(dca_cleanup), + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_REG), + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DEREG)); + +static bool dca_is_supported(struct ib_device *device) +{ + struct hns_roce_dev *dev = to_hr_dev(device); + + return dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE; +} + +const struct uapi_definition hns_roce_dca_uapi_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + HNS_IB_OBJECT_DCA_MEM, + UAPI_DEF_IS_OBJ_SUPPORTED(dca_is_supported)), + {} +}; diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h new file mode 100644 index 000000000000..e303c3cae25f --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_dca.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2022 Hisilicon Limited. All rights reserved. + */ + +#ifndef __HNS_ROCE_DCA_H +#define __HNS_ROCE_DCA_H + +#include <rdma/uverbs_ioctl.h> + +/* DCA page state (32 bit) */ +struct hns_dca_page_state { + u32 buf_id : 29; /* If zero, means page can be used by any buffer. 
*/ + u32 lock : 1; /* @buf_id locked this page to prepare access. */ + u32 active : 1; /* @buf_id is accessing this page. */ + u32 head : 1; /* This page is the head in a continuous address range. */ +}; + +extern const struct uapi_definition hns_roce_dca_uapi_defs[]; + +void hns_roce_register_udca(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx); +void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx); + +#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index d722f372a7bb..d5fc8ae85a7f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -145,6 +145,7 @@ enum { HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10), HNS_ROCE_CAP_FLAG_DIRECT_WQE = BIT(12), HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), + HNS_ROCE_CAP_FLAG_DCA_MODE = BIT(15), HNS_ROCE_CAP_FLAG_STASH = BIT(17), HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(20), @@ -203,6 +204,14 @@ struct hns_user_mmap_entry { u64 address; };
+struct hns_roce_dca_ctx { + struct list_head pool; /* all DCA mems link to @pool */ + spinlock_t pool_lock; /* protect @pool */ + unsigned int free_mems; /* free mem num in pool */ + size_t free_size; /* free mem size in pool */ + size_t total_size; /* total size in pool */ +}; + struct hns_roce_ucontext { struct ib_ucontext ibucontext; struct hns_roce_uar uar; @@ -210,6 +219,7 @@ struct hns_roce_ucontext { struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; u32 config; + struct hns_roce_dca_ctx dca_ctx; };
struct hns_roce_pd { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index e3b188b2bb4c..d9d787fbc70b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -43,6 +43,7 @@ #include "hns_roce_device.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#include "hns_roce_dca.h"
static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -377,6 +378,17 @@ static int hns_roce_alloc_uar_entry(struct ib_ucontext *uctx) return 0; }
+static void ucontext_set_resp(struct ib_ucontext *uctx, + struct hns_roce_ib_alloc_ucontext_resp *resp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); + + resp->qp_tab_size = hr_dev->caps.num_qps; + resp->srq_tab_size = hr_dev->caps.num_srqs; + resp->cqe_size = hr_dev->caps.cqe_sz; + resp->mac_type = hr_dev->mac_type; +} + static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { @@ -389,9 +401,6 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, if (!hr_dev->active) return -EAGAIN;
- resp.qp_tab_size = hr_dev->caps.num_qps; - resp.srq_tab_size = hr_dev->caps.num_srqs; - ret = ib_copy_from_udata(&ucmd, udata, min(udata->inlen, sizeof(ucmd))); if (ret) @@ -415,6 +424,11 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS; }
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE) { + context->config |= ucmd.config & HNS_ROCE_UCTX_CONFIG_DCA; + resp.config |= HNS_ROCE_UCTX_RSP_DCA_FLAGS; + } + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; @@ -429,17 +443,18 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, mutex_init(&context->page_mutex); }
- resp.cqe_size = hr_dev->caps.cqe_sz; - resp.mac_type = hr_dev->mac_type; + hns_roce_register_udca(hr_dev, context);
- ret = ib_copy_to_udata(udata, &resp, - min(udata->outlen, sizeof(resp))); + ucontext_set_resp(uctx, &resp); + ret = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (ret) goto error_fail_copy_to_udata;
return 0;
error_fail_copy_to_udata: + hns_roce_unregister_udca(hr_dev, context); + hns_roce_dealloc_uar_entry(context);
error_fail_uar_entry: @@ -454,6 +469,8 @@ static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibcontext->device);
+ hns_roce_unregister_udca(hr_dev, context); + hns_roce_dealloc_uar_entry(context);
ida_free(&hr_dev->uar_ida.ida, (int)context->uar.logic_idx); @@ -553,6 +570,11 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) ib_unregister_device(&hr_dev->ib_dev); }
+const struct uapi_definition hns_roce_uapi_defs[] = { + UAPI_DEF_CHAIN(hns_roce_dca_uapi_defs), + {} +}; + static const struct ib_device_ops hns_roce_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_HNS, @@ -716,6 +738,10 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops); + + if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) + ib_dev->driver_def = hns_roce_uapi_defs; + for (i = 0; i < hr_dev->caps.num_ports; i++) { if (!hr_dev->iboe.netdevs[i]) continue; diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index e79cd4a15cbe..bb48c0b016f8 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -101,12 +101,14 @@ enum { HNS_ROCE_EXSGE_FLAGS = 1 << 0, HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1, HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2, + HNS_ROCE_UCTX_CONFIG_DCA = 1 << 3, };
enum { HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0, HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1, HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2, + HNS_ROCE_UCTX_RSP_DCA_FLAGS = HNS_ROCE_UCTX_CONFIG_DCA, };
struct hns_roce_ib_alloc_ucontext_resp { @@ -129,4 +131,27 @@ struct hns_roce_ib_alloc_pd_resp { __u32 pdn; };
+#define UVERBS_ID_NS_MASK 0xF000 +#define UVERBS_ID_NS_SHIFT 12 + +enum hns_ib_objects { + HNS_IB_OBJECT_DCA_MEM = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum hns_ib_dca_mem_methods { + HNS_IB_METHOD_DCA_MEM_REG = (1U << UVERBS_ID_NS_SHIFT), + HNS_IB_METHOD_DCA_MEM_DEREG, +}; + +enum hns_ib_dca_mem_reg_attrs { + HNS_IB_ATTR_DCA_MEM_REG_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + HNS_IB_ATTR_DCA_MEM_REG_LEN, + HNS_IB_ATTR_DCA_MEM_REG_ADDR, + HNS_IB_ATTR_DCA_MEM_REG_KEY, +}; + +enum hns_ib_dca_mem_dereg_attrs { + HNS_IB_ATTR_DCA_MEM_DEREG_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + #endif /* HNS_ABI_USER_H */
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
If no QP is using a DCA mem object, the userspace driver can destroy it. So add a new method 'HNS_IB_METHOD_DCA_MEM_SHRINK' that allows the userspace driver to remove an object from the DCA memory pool.
Once a DCA mem object has been shrunk, the userspace driver can destroy it with the 'HNS_IB_METHOD_DCA_MEM_DEREG' method and free the buffer that was allocated in userspace.
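The two methods are intended to be used together roughly as follows. This is a hypothetical userspace loop: hns_dca_shrink(), hns_dca_dereg() and free_user_buffer() stand in for whatever wrappers the provider library builds around the HNS_IB_METHOD_DCA_MEM_SHRINK and HNS_IB_METHOD_DCA_MEM_DEREG ioctls.

	for (;;) {
		struct hns_dca_shrink_resp resp = {};

		/* Ask the kernel whether any mem object is completely free,
		 * keeping at least 'reserved_size' bytes in the pool.
		 */
		hns_dca_shrink(ctx, reserved_size, &resp);
		if (!resp.free_mems)
			break;

		/* Destroy the kernel object named by the returned key, then
		 * release the corresponding userspace buffer.
		 */
		hns_dca_dereg(ctx, resp.free_key);
		free_user_buffer(resp.free_key);
	}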
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_dca.c | 146 ++++++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_dca.h | 7 ++ include/uapi/rdma/hns-abi.h | 8 ++ 3 files changed, 160 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c index 4da7d762333f..eb388f4fd499 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.c +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -35,6 +35,16 @@ struct dca_mem_attr { u32 size; };
+static inline bool dca_page_is_free(struct hns_dca_page_state *state) +{ + return state->buf_id == HNS_DCA_INVALID_BUF_ID; +} + +static inline bool dca_mem_is_available(struct dca_mem *mem) +{ + return mem->flags == (DCA_MEM_FLAGS_ALLOCED | DCA_MEM_FLAGS_REGISTERED); +} + static void *alloc_dca_pages(struct hns_roce_dev *hr_dev, struct dca_mem *mem, struct dca_mem_attr *attr) { @@ -88,6 +98,41 @@ static struct hns_dca_page_state *alloc_dca_states(void *pages, int count) return states; }
+#define DCA_MEM_STOP_ITERATE -1 +#define DCA_MEM_NEXT_ITERATE -2 +static void travel_dca_pages(struct hns_roce_dca_ctx *ctx, void *param, + int (*cb)(struct dca_mem *, int, void *)) +{ + struct dca_mem *mem, *tmp; + unsigned long flags; + bool avail; + int ret; + int i; + + spin_lock_irqsave(&ctx->pool_lock, flags); + list_for_each_entry_safe(mem, tmp, &ctx->pool, list) { + spin_unlock_irqrestore(&ctx->pool_lock, flags); + + spin_lock(&mem->lock); + avail = dca_mem_is_available(mem); + ret = 0; + for (i = 0; avail && i < mem->page_count; i++) { + ret = cb(mem, i, param); + if (ret == DCA_MEM_STOP_ITERATE || + ret == DCA_MEM_NEXT_ITERATE) + break; + } + spin_unlock(&mem->lock); + spin_lock_irqsave(&ctx->pool_lock, flags); + + if (ret == DCA_MEM_STOP_ITERATE) + goto done; + } + +done: + spin_unlock_irqrestore(&ctx->pool_lock, flags); +} + /* user DCA is managed by ucontext */ static inline struct hns_roce_dca_ctx * to_hr_dca_ctx(struct hns_roce_ucontext *uctx) @@ -159,6 +204,63 @@ static int register_dca_mem(struct hns_roce_dev *hr_dev, return 0; }
+struct dca_mem_shrink_attr { + u64 shrink_key; + u32 shrink_mems; +}; + +static int shrink_dca_page_proc(struct dca_mem *mem, int index, void *param) +{ + struct dca_mem_shrink_attr *attr = param; + struct hns_dca_page_state *state; + int i, free_pages; + + free_pages = 0; + for (i = 0; i < mem->page_count; i++) { + state = &mem->states[i]; + if (dca_page_is_free(state)) + free_pages++; + } + + /* No pages are in use */ + if (free_pages == mem->page_count) { + /* unregister first empty DCA mem */ + if (!attr->shrink_mems) { + mem->flags &= ~DCA_MEM_FLAGS_REGISTERED; + attr->shrink_key = mem->key; + } + + attr->shrink_mems++; + } + + if (attr->shrink_mems > 1) + return DCA_MEM_STOP_ITERATE; + else + return DCA_MEM_NEXT_ITERATE; +} + +static int shrink_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx, u64 reserved_size, + struct hns_dca_shrink_resp *resp) +{ + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx); + struct dca_mem_shrink_attr attr = {}; + unsigned long flags; + bool need_shink; + + spin_lock_irqsave(&ctx->pool_lock, flags); + need_shink = ctx->free_mems > 0 && ctx->free_size > reserved_size; + spin_unlock_irqrestore(&ctx->pool_lock, flags); + if (!need_shink) + return 0; + + travel_dca_pages(ctx, &attr, shrink_dca_page_proc); + resp->free_mems = attr.shrink_mems; + resp->free_key = attr.shrink_key; + + return 0; +} + static void init_dca_context(struct hns_roce_dca_ctx *ctx) { INIT_LIST_HEAD(&ctx->pool); @@ -334,10 +436,52 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_DEREG_HANDLE, HNS_IB_OBJECT_DCA_MEM, UVERBS_ACCESS_DESTROY, UA_MANDATORY));
+static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_SHRINK)( + struct uverbs_attr_bundle *attrs) +{ + struct hns_roce_ucontext *uctx = uverbs_attr_to_hr_uctx(attrs); + struct hns_dca_shrink_resp resp = {}; + u64 reserved_size = 0; + int ret; + + ret = uverbs_copy_from(&reserved_size, attrs, + HNS_IB_ATTR_DCA_MEM_SHRINK_RESERVED_SIZE); + if (ret) + return ret; + + ret = shrink_dca_mem(to_hr_dev(uctx->ibucontext.device), uctx, + reserved_size, &resp); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_KEY, + &resp.free_key, sizeof(resp.free_key)); + if (!ret) + ret = uverbs_copy_to(attrs, + HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_MEMS, + &resp.free_mems, sizeof(resp.free_mems)); + if (ret) + return ret; + + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD( + HNS_IB_METHOD_DCA_MEM_SHRINK, + UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_SHRINK_HANDLE, + HNS_IB_OBJECT_DCA_MEM, UVERBS_ACCESS_WRITE, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_SHRINK_RESERVED_SIZE, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_KEY, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_MEMS, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); DECLARE_UVERBS_NAMED_OBJECT(HNS_IB_OBJECT_DCA_MEM, UVERBS_TYPE_ALLOC_IDR(dca_cleanup), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_REG), - &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DEREG)); + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DEREG), + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_SHRINK));
static bool dca_is_supported(struct ib_device *device) { diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h index e303c3cae25f..14153a96abea 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.h +++ b/drivers/infiniband/hw/hns/hns_roce_dca.h @@ -18,6 +18,13 @@ struct hns_dca_page_state {
extern const struct uapi_definition hns_roce_dca_uapi_defs[];
+struct hns_dca_shrink_resp { + u64 free_key; /* free buffer's key which registered by the user */ + u32 free_mems; /* free buffer count which no any QP be using */ +}; + +#define HNS_DCA_INVALID_BUF_ID 0UL + void hns_roce_register_udca(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx); void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index bb48c0b016f8..925d68df7ee9 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -141,6 +141,7 @@ enum hns_ib_objects { enum hns_ib_dca_mem_methods { HNS_IB_METHOD_DCA_MEM_REG = (1U << UVERBS_ID_NS_SHIFT), HNS_IB_METHOD_DCA_MEM_DEREG, + HNS_IB_METHOD_DCA_MEM_SHRINK, };
enum hns_ib_dca_mem_reg_attrs { @@ -154,4 +155,11 @@ enum hns_ib_dca_mem_dereg_attrs { HNS_IB_ATTR_DCA_MEM_DEREG_HANDLE = (1U << UVERBS_ID_NS_SHIFT), };
+enum hns_ib_dca_mem_shrink_attrs { + HNS_IB_ATTR_DCA_MEM_SHRINK_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + HNS_IB_ATTR_DCA_MEM_SHRINK_RESERVED_SIZE, + HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_KEY, + HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_MEMS, +}; + #endif /* HNS_ABI_USER_H */
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
If the userspace driver assigns NULL to the 'buf_addr' field of 'struct hns_roce_ib_create_qp' when creating a QP, the kernel driver needs to set the QP up in DCA mode. So add a QP capability bit to the response to indicate to the userspace driver that DCA mode has been enabled.
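The decision point is small enough to show here; this sketch mirrors check_dca_is_enable() from the diff below.

static bool qp_uses_dca(struct hns_roce_dev *hr_dev, bool is_user,
			unsigned long buf_addr)
{
	if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE))
		return false;

	/* A user QP with a zero buffer address asks for DCA mode; kernel
	 * QPs always get their WQE buffer allocated up front.
	 */
	return is_user && !buf_addr;
}

When this returns true, the QP skips the normal WQE buffer allocation (only the MTT is created) and HNS_ROCE_QP_CAP_DCA is reported back to userspace in the QP capability flags.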
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_dca.c | 27 +++-- drivers/infiniband/hw/hns/hns_roce_dca.h | 4 + drivers/infiniband/hw/hns/hns_roce_device.h | 5 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +- drivers/infiniband/hw/hns/hns_roce_qp.c | 116 +++++++++++++++----- include/uapi/rdma/hns-abi.h | 1 + 6 files changed, 128 insertions(+), 37 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c index eb388f4fd499..c2c7fcf3be04 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.c +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -239,7 +239,7 @@ static int shrink_dca_page_proc(struct dca_mem *mem, int index, void *param) return DCA_MEM_NEXT_ITERATE; }
-static int shrink_dca_mem(struct hns_roce_dev *hr_dev, +static void shrink_dca_mem(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx, u64 reserved_size, struct hns_dca_shrink_resp *resp) { @@ -252,13 +252,11 @@ static int shrink_dca_mem(struct hns_roce_dev *hr_dev, need_shink = ctx->free_mems > 0 && ctx->free_size > reserved_size; spin_unlock_irqrestore(&ctx->pool_lock, flags); if (!need_shink) - return 0; + return;
travel_dca_pages(ctx, &attr, shrink_dca_page_proc); resp->free_mems = attr.shrink_mems; resp->free_key = attr.shrink_key; - - return 0; }
static void init_dca_context(struct hns_roce_dca_ctx *ctx) @@ -356,6 +354,21 @@ static void free_dca_mem(struct dca_mem *mem) spin_unlock(&mem->lock); }
+void hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + + cfg->buf_id = HNS_DCA_INVALID_BUF_ID; +} + +void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + + cfg->buf_id = HNS_DCA_INVALID_BUF_ID; +} + static inline struct hns_roce_ucontext * uverbs_attr_to_hr_uctx(struct uverbs_attr_bundle *attrs) { @@ -449,10 +462,8 @@ static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_SHRINK)( if (ret) return ret;
- ret = shrink_dca_mem(to_hr_dev(uctx->ibucontext.device), uctx, - reserved_size, &resp); - if (ret) - return ret; + shrink_dca_mem(to_hr_dev(uctx->ibucontext.device), uctx, + reserved_size, &resp);
ret = uverbs_copy_to(attrs, HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_KEY, &resp.free_key, sizeof(resp.free_key)); diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h index 14153a96abea..c930b46160b2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.h +++ b/drivers/infiniband/hw/hns/hns_roce_dca.h @@ -30,4 +30,8 @@ void hns_roce_register_udca(struct hns_roce_dev *hr_dev, void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx);
+void hns_roce_enable_dca(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp); +void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp); #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index d5fc8ae85a7f..90fb613d8f9c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -314,6 +314,10 @@ struct hns_roce_mtr { struct hns_roce_hem_cfg hem_cfg; /* config for hardware addressing */ };
+struct hns_roce_dca_cfg { + u32 buf_id; +}; + struct hns_roce_mw { struct ib_mw ibmw; u32 pdn; @@ -610,6 +614,7 @@ struct hns_roce_qp { struct hns_roce_wq sq;
struct hns_roce_mtr mtr; + struct hns_roce_dca_cfg dca_cfg;
u32 buff_size; struct mutex mutex; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 12f124a1de04..b8b57950432e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4707,6 +4707,16 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_write(context, QPC_TRRL_BA_H, trrl_ba >> (32 + 16 + 4)); hr_reg_clear(qpc_mask, QPC_TRRL_BA_H);
+ if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) { + hr_reg_enable(context, QPC_DCA_MODE); + hr_reg_clear(qpc_mask, QPC_DCA_MODE); + } + } else { + /* reset IRRL_HEAD */ + hr_reg_clear(qpc_mask, QPC_V2_IRRL_HEAD); + } + context->irrl_ba = cpu_to_le32(irrl_ba >> 6); qpc_mask->irrl_ba = 0; hr_reg_write(context, QPC_IRRL_BA_H, irrl_ba >> (32 + 6)); @@ -4843,8 +4853,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
hr_reg_clear(qpc_mask, QPC_CHECK_FLG);
- hr_reg_clear(qpc_mask, QPC_V2_IRRL_HEAD); - return 0; }
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index df5bebc5e1c1..e42461d5ea4c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -38,6 +38,7 @@ #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" +#include "hns_roce_dca.h"
static void flush_work_handle(struct work_struct *work) { @@ -638,8 +639,21 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, return 0; }
+static bool check_dca_is_enable(struct hns_roce_dev *hr_dev, bool is_user, + unsigned long addr) +{ + if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE)) + return false; + + /* If the user QP's buffer addr is 0, the DCA mode should be enabled */ + if (is_user) + return !addr; + + return false; +} + static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, + struct hns_roce_qp *hr_qp, bool dca_en, struct hns_roce_buf_attr *buf_attr) { int buf_size; @@ -683,9 +697,21 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev, if (hr_qp->buff_size < 1) return -EINVAL;
- buf_attr->page_shift = HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; buf_attr->region_count = idx;
+ if (dca_en) { + /* + * When enable DCA, there's no need to alloc buffer now, and + * the page shift should be fixed to 4K. + */ + buf_attr->mtt_only = true; + buf_attr->page_shift = HNS_HW_PAGE_SHIFT; + } else { + buf_attr->mtt_only = false; + buf_attr->page_shift = HNS_HW_PAGE_SHIFT + + hr_dev->caps.mtt_buf_pg_sz; + } + return 0; }
@@ -738,39 +764,75 @@ static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr) return 1; }
-static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, unsigned long addr) +static int alloc_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + bool dca_en, struct hns_roce_buf_attr *buf_attr, + struct ib_udata *udata, unsigned long addr) { struct ib_device *ibdev = &hr_dev->ib_dev; - struct hns_roce_buf_attr buf_attr = {}; int ret;
- ret = set_wqe_buf_attr(hr_dev, hr_qp, &buf_attr); - if (ret) { - ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret); - goto err_inline; + if (dca_en) { + /* DCA must be enabled after the buffer size is configured. */ + hns_roce_enable_dca(hr_dev, hr_qp); + + hr_qp->en_flags |= HNS_ROCE_QP_CAP_DCA; + } else { + /* + * Because DCA and DWQE share the same fileds in RCWQE buffer, + * so DWQE only supported when DCA is disable. + */ + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DIRECT_WQE) + hr_qp->en_flags |= HNS_ROCE_QP_CAP_DIRECT_WQE; } - ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr, + + ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, buf_attr, PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz, udata, addr); if (ret) { ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret); - goto err_inline; + if (dca_en) + hns_roce_disable_dca(hr_dev, hr_qp); }
- if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DIRECT_WQE) - hr_qp->en_flags |= HNS_ROCE_QP_CAP_DIRECT_WQE; + return ret; +}
- return 0; -err_inline: +static void free_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_udata *udata) +{ + hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr); + + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) + hns_roce_disable_dca(hr_dev, hr_qp); +} + +static int alloc_qp_wqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, unsigned long addr) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_buf_attr buf_attr = {}; + bool dca_en; + int ret; + + dca_en = check_dca_is_enable(hr_dev, !!udata, addr); + ret = set_wqe_buf_attr(hr_dev, hr_qp, dca_en, &buf_attr); + if (ret) { + ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret); + return ret; + } + + ret = alloc_wqe_buf(hr_dev, hr_qp, dca_en, &buf_attr, udata, addr); + if (ret) + ibdev_err(ibdev, "failed to alloc WQE buf, ret = %d.\n", ret);
return ret; }
-static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +static void free_qp_wqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_udata *udata) { - hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr); + free_wqe_buf(hr_dev, hr_qp, udata); }
static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev, @@ -1097,18 +1159,18 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, } }
- ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr); - if (ret) { - ibdev_err(ibdev, "failed to alloc QP buffer, ret = %d.\n", ret); - goto err_buf; - } - ret = alloc_qpn(hr_dev, hr_qp); if (ret) { ibdev_err(ibdev, "failed to alloc QPN, ret = %d.\n", ret); goto err_qpn; }
+ ret = alloc_qp_wqe(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr); + if (ret) { + ibdev_err(ibdev, "failed to alloc QP buffer, ret = %d.\n", ret); + goto err_buf; + } + ret = alloc_qp_db(hr_dev, hr_qp, init_attr, udata, &ucmd, &resp); if (ret) { ibdev_err(ibdev, "failed to alloc QP doorbell, ret = %d.\n", @@ -1159,10 +1221,10 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, err_qpc: free_qp_db(hr_dev, hr_qp, udata); err_db: + free_qp_wqe(hr_dev, hr_qp, udata); +err_buf: free_qpn(hr_dev, hr_qp); err_qpn: - free_qp_buf(hr_dev, hr_qp); -err_buf: free_kernel_wrid(hr_qp); return ret; } @@ -1176,7 +1238,7 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
free_qpc(hr_dev, hr_qp); free_qpn(hr_dev, hr_qp); - free_qp_buf(hr_dev, hr_qp); + free_qp_wqe(hr_dev, hr_qp, udata); free_kernel_wrid(hr_qp); free_qp_db(hr_dev, hr_qp, udata);
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 925d68df7ee9..ce7165073bbf 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -77,6 +77,7 @@ enum hns_roce_qp_cap_flags { HNS_ROCE_QP_CAP_RQ_RECORD_DB = 1 << 0, HNS_ROCE_QP_CAP_SQ_RECORD_DB = 1 << 1, HNS_ROCE_QP_CAP_OWNER_DB = 1 << 2, + HNS_ROCE_QP_CAP_DCA = 1 << 4, HNS_ROCE_QP_CAP_DIRECT_WQE = 1 << 5, };
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
If a uQP works in DCA mode, the userspace driver needs to configure the WQE buffer by calling the 'HNS_IB_METHOD_DCA_MEM_ATTACH' method before filling the WQE. This method allocates a group of pages from the DCA memory pool and writes the addressing configuration to the QPC.
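At a high level the attach path looks like the sketch below. The outer function is hypothetical, the helpers are the ones added in this patch, and locking plus error reporting are omitted.

static int dca_attach_outline(struct hns_roce_dev *hr_dev,
			      struct hns_roce_qp *hr_qp,
			      struct hns_dca_attach_attr *attr)
{
	struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_qp);
	u32 buf_id;

	/* 1. Reserve enough free pages in the pool and tag them with a
	 *    buffer id derived from the QPN and the attach count.
	 */
	buf_id = alloc_buf_from_dca_mem(hr_qp, ctx);
	if (buf_id == HNS_DCA_INVALID_BUF_ID)
		return 0; /* pool too small; userspace may grow it and retry */

	/* 2. Map the reserved pages into the QP's MTT and write the new
	 *    SQ/SGE/RQ offsets to the QPC so the ROCEE can address them.
	 */
	return active_alloced_buf(hr_qp, ctx, attr, buf_id);
}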
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_dca.c | 466 +++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_dca.h | 25 ++ drivers/infiniband/hw/hns/hns_roce_device.h | 13 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 15 +- include/uapi/rdma/hns-abi.h | 11 + 5 files changed, 523 insertions(+), 7 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c index c2c7fcf3be04..21b360be4996 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.c +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -35,11 +35,53 @@ struct dca_mem_attr { u32 size; };
+static inline void set_dca_page_to_free(struct hns_dca_page_state *state) +{ + state->buf_id = HNS_DCA_INVALID_BUF_ID; + state->active = 0; + state->lock = 0; +} + +static inline void lock_dca_page_to_attach(struct hns_dca_page_state *state, + u32 buf_id) +{ + state->buf_id = HNS_DCA_ID_MASK & buf_id; + state->active = 0; + state->lock = 1; +} + +static inline void unlock_dca_page_to_active(struct hns_dca_page_state *state, + u32 buf_id) +{ + state->buf_id = HNS_DCA_ID_MASK & buf_id; + state->active = 1; + state->lock = 0; +} + static inline bool dca_page_is_free(struct hns_dca_page_state *state) { return state->buf_id == HNS_DCA_INVALID_BUF_ID; }
+static inline bool dca_page_is_attached(struct hns_dca_page_state *state, + u32 buf_id) +{ + /* only the own bit needs to be matched. */ + return (HNS_DCA_OWN_MASK & buf_id) == + (HNS_DCA_OWN_MASK & state->buf_id); +} + +static inline bool dca_page_is_allocated(struct hns_dca_page_state *state, + u32 buf_id) +{ + return dca_page_is_attached(state, buf_id) && state->lock; +} + +static inline bool dca_page_is_inactive(struct hns_dca_page_state *state) +{ + return !state->lock && !state->active; +} + static inline bool dca_mem_is_available(struct dca_mem *mem) { return mem->flags == (DCA_MEM_FLAGS_ALLOCED | DCA_MEM_FLAGS_REGISTERED); @@ -354,11 +396,366 @@ static void free_dca_mem(struct dca_mem *mem) spin_unlock(&mem->lock); }
+static inline struct hns_roce_dca_ctx *hr_qp_to_dca_ctx(struct hns_roce_qp *qp) +{ + return to_hr_dca_ctx(to_hr_ucontext(qp->ibqp.pd->uobject->context)); +} + +struct dca_page_clear_attr { + u32 buf_id; + u32 max_pages; + u32 clear_pages; +}; + +static int clear_dca_pages_proc(struct dca_mem *mem, int index, void *param) +{ + struct hns_dca_page_state *state = &mem->states[index]; + struct dca_page_clear_attr *attr = param; + + if (dca_page_is_attached(state, attr->buf_id)) { + set_dca_page_to_free(state); + attr->clear_pages++; + } + + if (attr->clear_pages >= attr->max_pages) + return DCA_MEM_STOP_ITERATE; + else + return 0; +} + +static void clear_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count) +{ + struct dca_page_clear_attr attr = {}; + + attr.buf_id = buf_id; + attr.max_pages = count; + travel_dca_pages(ctx, &attr, clear_dca_pages_proc); +} + +struct dca_page_assign_attr { + u32 buf_id; + int unit; + int total; + int max; +}; + +static bool dca_page_is_allocable(struct hns_dca_page_state *state, bool head) +{ + bool is_free = dca_page_is_free(state) || dca_page_is_inactive(state); + + return head ? is_free : is_free && !state->head; +} + +static int assign_dca_pages_proc(struct dca_mem *mem, int index, void *param) +{ + struct dca_page_assign_attr *attr = param; + struct hns_dca_page_state *state; + int checked_pages = 0; + int start_index = 0; + int free_pages = 0; + int i; + + /* Check the continuous pages count is not smaller than unit count */ + for (i = index; free_pages < attr->unit && i < mem->page_count; i++) { + checked_pages++; + state = &mem->states[i]; + if (dca_page_is_allocable(state, free_pages == 0)) { + if (free_pages == 0) + start_index = i; + + free_pages++; + } else { + free_pages = 0; + } + } + + if (free_pages < attr->unit) + return DCA_MEM_NEXT_ITERATE; + + for (i = 0; i < free_pages; i++) { + state = &mem->states[start_index + i]; + lock_dca_page_to_attach(state, attr->buf_id); + attr->total++; + } + + if (attr->total >= attr->max) + return DCA_MEM_STOP_ITERATE; + + return checked_pages; +} + +static u32 assign_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count, + u32 unit) +{ + struct dca_page_assign_attr attr = {}; + + attr.buf_id = buf_id; + attr.unit = unit; + attr.max = count; + travel_dca_pages(ctx, &attr, assign_dca_pages_proc); + return attr.total; +} + +struct dca_page_active_attr { + u32 buf_id; + u32 max_pages; + u32 alloc_pages; + u32 dirty_mems; +}; + +static int active_dca_pages_proc(struct dca_mem *mem, int index, void *param) +{ + struct dca_page_active_attr *attr = param; + struct hns_dca_page_state *state; + bool changed = false; + bool stop = false; + int i, free_pages; + + free_pages = 0; + for (i = 0; !stop && i < mem->page_count; i++) { + state = &mem->states[i]; + if (dca_page_is_free(state)) { + free_pages++; + } else if (dca_page_is_allocated(state, attr->buf_id)) { + free_pages++; + /* Change matched pages state */ + unlock_dca_page_to_active(state, attr->buf_id); + changed = true; + attr->alloc_pages++; + if (attr->alloc_pages == attr->max_pages) + stop = true; + } + } + + for (; changed && i < mem->page_count; i++) + if (dca_page_is_free(state)) + free_pages++; + + /* Clean mem changed to dirty */ + if (changed && free_pages == mem->page_count) + attr->dirty_mems++; + + return stop ? 
DCA_MEM_STOP_ITERATE : DCA_MEM_NEXT_ITERATE; +} + +static u32 active_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count) +{ + struct dca_page_active_attr attr = {}; + unsigned long flags; + + attr.buf_id = buf_id; + attr.max_pages = count; + travel_dca_pages(ctx, &attr, active_dca_pages_proc); + + /* Update free size */ + spin_lock_irqsave(&ctx->pool_lock, flags); + ctx->free_mems -= attr.dirty_mems; + ctx->free_size -= attr.alloc_pages << HNS_HW_PAGE_SHIFT; + spin_unlock_irqrestore(&ctx->pool_lock, flags); + + return attr.alloc_pages; +} + +struct dca_get_alloced_pages_attr { + u32 buf_id; + dma_addr_t *pages; + u32 total; + u32 max; +}; + +static int get_alloced_umem_proc(struct dca_mem *mem, int index, void *param) + +{ + struct dca_get_alloced_pages_attr *attr = param; + struct hns_dca_page_state *states = mem->states; + struct ib_umem *umem = mem->pages; + struct ib_block_iter biter; + u32 i = 0; + + rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, + HNS_HW_PAGE_SIZE) { + if (dca_page_is_allocated(&states[i], attr->buf_id)) { + attr->pages[attr->total++] = + rdma_block_iter_dma_address(&biter); + if (attr->total >= attr->max) + return DCA_MEM_STOP_ITERATE; + } + i++; + } + + return DCA_MEM_NEXT_ITERATE; +} + +static int apply_dca_cfg(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct hns_dca_attach_attr *attach_attr) +{ + struct hns_roce_dca_attr attr; + + if (hr_dev->hw->set_dca_buf) { + attr.sq_offset = attach_attr->sq_offset; + attr.sge_offset = attach_attr->sge_offset; + attr.rq_offset = attach_attr->rq_offset; + return hr_dev->hw->set_dca_buf(hr_dev, hr_qp, &attr); + } + + return 0; +} + +static int setup_dca_buf_to_hw(struct hns_roce_dca_ctx *ctx, + struct hns_roce_qp *hr_qp, u32 buf_id, + struct hns_dca_attach_attr *attach_attr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + struct dca_get_alloced_pages_attr attr = {}; + struct ib_device *ibdev = &hr_dev->ib_dev; + u32 count = hr_qp->dca_cfg.npages; + dma_addr_t *pages; + int ret; + + /* Alloc a tmp array to store buffer's dma address */ + pages = kvcalloc(count, sizeof(dma_addr_t), GFP_NOWAIT); + if (!pages) + return -ENOMEM; + + attr.buf_id = buf_id; + attr.pages = pages; + attr.max = count; + + travel_dca_pages(ctx, &attr, get_alloced_umem_proc); + if (attr.total != count) { + ibdev_err(ibdev, "failed to get DCA page %u != %u.\n", + attr.total, count); + ret = -ENOMEM; + goto done; + } + + /* Update MTT for ROCEE addressing */ + ret = hns_roce_mtr_map(hr_dev, &hr_qp->mtr, pages, count); + if (ret) { + ibdev_err(ibdev, "failed to map DCA pages, ret = %d.\n", ret); + goto done; + } + + /* Apply the changes for WQE address */ + ret = apply_dca_cfg(hr_dev, hr_qp, attach_attr); + if (ret) + ibdev_err(ibdev, "failed to apply DCA cfg, ret = %d.\n", ret); + +done: + /* Drop tmp array */ + kvfree(pages); + return ret; +} + +static u32 alloc_buf_from_dca_mem(struct hns_roce_qp *hr_qp, + struct hns_roce_dca_ctx *ctx) +{ + u32 buf_pages, unit_pages, alloc_pages; + u32 buf_id; + + buf_pages = hr_qp->dca_cfg.npages; + /* Gen new buf id */ + buf_id = HNS_DCA_TO_BUF_ID(hr_qp->qpn, hr_qp->dca_cfg.attach_count); + + /* Assign pages from free pages */ + unit_pages = hr_qp->mtr.hem_cfg.is_direct ? 
buf_pages : 1; + alloc_pages = assign_dca_pages(ctx, buf_id, buf_pages, unit_pages); + if (buf_pages != alloc_pages) { + if (alloc_pages > 0) + clear_dca_pages(ctx, buf_id, alloc_pages); + return HNS_DCA_INVALID_BUF_ID; + } + + return buf_id; +} + +static int active_alloced_buf(struct hns_roce_qp *hr_qp, + struct hns_roce_dca_ctx *ctx, + struct hns_dca_attach_attr *attr, u32 buf_id) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + struct ib_device *ibdev = &hr_dev->ib_dev; + u32 active_pages, alloc_pages; + int ret; + + ret = setup_dca_buf_to_hw(ctx, hr_qp, buf_id, attr); + if (ret) { + ibdev_err(ibdev, "failed to setup DCA buf, ret = %d.\n", ret); + goto active_fail; + } + + alloc_pages = hr_qp->dca_cfg.npages; + active_pages = active_dca_pages(ctx, buf_id, alloc_pages); + if (active_pages != alloc_pages) { + ibdev_err(ibdev, "failed to active DCA pages, %u != %u.\n", + active_pages, alloc_pages); + ret = -ENOBUFS; + goto active_fail; + } + + return 0; + +active_fail: + clear_dca_pages(ctx, buf_id, alloc_pages); + return ret; +} + +static int attach_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_dca_attach_attr *attr, + struct hns_dca_attach_resp *resp) +{ + struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_qp); + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + u32 buf_id; + int ret; + + resp->alloc_flags = 0; + spin_lock(&cfg->lock); + buf_id = cfg->buf_id; + /* Already attached */ + if (buf_id != HNS_DCA_INVALID_BUF_ID) { + resp->alloc_pages = cfg->npages; + spin_unlock(&cfg->lock); + return 0; + } + + /* Start to new attach */ + resp->alloc_pages = 0; + buf_id = alloc_buf_from_dca_mem(hr_qp, ctx); + if (buf_id == HNS_DCA_INVALID_BUF_ID) { + spin_unlock(&cfg->lock); + /* No report fail, need try again after the pool increased */ + return 0; + } + + ret = active_alloced_buf(hr_qp, ctx, attr, buf_id); + if (ret) { + spin_unlock(&cfg->lock); + ibdev_err(&hr_dev->ib_dev, + "failed to active DCA buf for QP-%lu, ret = %d.\n", + hr_qp->qpn, ret); + return ret; + } + + /* Attach ok */ + cfg->buf_id = buf_id; + cfg->attach_count++; + spin_unlock(&cfg->lock); + + resp->alloc_flags |= HNS_IB_ATTACH_FLAGS_NEW_BUFFER; + resp->alloc_pages = cfg->npages; + + return 0; +} + void hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
+ spin_lock_init(&cfg->lock); cfg->buf_id = HNS_DCA_INVALID_BUF_ID; + cfg->npages = hr_qp->buff_size >> HNS_HW_PAGE_SHIFT; }
void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, @@ -488,11 +885,78 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u64), UA_MANDATORY), UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_MEMS, UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); + +static inline struct hns_roce_qp * +uverbs_attr_to_hr_qp(struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, 1U << UVERBS_ID_NS_SHIFT); + + if (uobj_get_object_id(uobj) == UVERBS_OBJECT_QP) + return to_hr_qp(uobj->object); + + return NULL; +} + +static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_ATTACH)( + struct uverbs_attr_bundle *attrs) +{ + struct hns_roce_qp *hr_qp = uverbs_attr_to_hr_qp(attrs); + struct hns_dca_attach_attr attr = {}; + struct hns_dca_attach_resp resp = {}; + int ret; + + if (!hr_qp) + return -EINVAL; + + ret = uverbs_copy_from(&attr.sq_offset, attrs, + HNS_IB_ATTR_DCA_MEM_ATTACH_SQ_OFFSET); + if (!ret) + ret = uverbs_copy_from(&attr.sge_offset, attrs, + HNS_IB_ATTR_DCA_MEM_ATTACH_SGE_OFFSET); + if (!ret) + ret = uverbs_copy_from(&attr.rq_offset, attrs, + HNS_IB_ATTR_DCA_MEM_ATTACH_RQ_OFFSET); + if (ret) + return ret; + + ret = attach_dca_mem(to_hr_dev(hr_qp->ibqp.device), hr_qp, &attr, + &resp); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_FLAGS, + &resp.alloc_flags, sizeof(resp.alloc_flags)); + if (!ret) + ret = uverbs_copy_to(attrs, + HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_PAGES, + &resp.alloc_pages, + sizeof(resp.alloc_pages)); + + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + HNS_IB_METHOD_DCA_MEM_ATTACH, + UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_ATTACH_HANDLE, UVERBS_OBJECT_QP, + UVERBS_ACCESS_WRITE, UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_ATTACH_SQ_OFFSET, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_ATTACH_SGE_OFFSET, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_ATTACH_RQ_OFFSET, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_FLAGS, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_PAGES, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); + DECLARE_UVERBS_NAMED_OBJECT(HNS_IB_OBJECT_DCA_MEM, UVERBS_TYPE_ALLOC_IDR(dca_cleanup), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_REG), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DEREG), - &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_SHRINK)); + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_SHRINK), + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_ATTACH));
static bool dca_is_supported(struct ib_device *device) { diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h index c930b46160b2..f9eea9beb092 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.h +++ b/drivers/infiniband/hw/hns/hns_roce_dca.h @@ -25,6 +25,31 @@ struct hns_dca_shrink_resp {
#define HNS_DCA_INVALID_BUF_ID 0UL
+/* + * buffer id(29b) = tag(7b) + owner(22b) + * [28:22] tag : indicate the QP config update times. + * [21: 0] owner: indicate the QP to which the page belongs. + */ +#define HNS_DCA_ID_MASK GENMASK(28, 0) +#define HNS_DCA_TAG_MASK GENMASK(28, 22) +#define HNS_DCA_OWN_MASK GENMASK(21, 0) + +#define HNS_DCA_BUF_ID_TO_TAG(buf_id) (((buf_id) & HNS_DCA_TAG_MASK) >> 22) +#define HNS_DCA_BUF_ID_TO_QPN(buf_id) ((buf_id) & HNS_DCA_OWN_MASK) +#define HNS_DCA_TO_BUF_ID(qpn, tag) (((qpn) & HNS_DCA_OWN_MASK) | \ + (((tag) << 22) & HNS_DCA_TAG_MASK)) + +struct hns_dca_attach_attr { + u32 sq_offset; + u32 sge_offset; + u32 rq_offset; +}; + +struct hns_dca_attach_resp { + u32 alloc_flags; + u32 alloc_pages; +}; + void hns_roce_register_udca(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx); void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 90fb613d8f9c..a58b7d598f4f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -315,7 +315,17 @@ struct hns_roce_mtr { };
struct hns_roce_dca_cfg { + spinlock_t lock; u32 buf_id; + u16 attach_count; + u32 npages; +}; + +/* DCA attr for setting WQE buffer */ +struct hns_roce_dca_attr { + u32 sq_offset; + u32 sge_offset; + u32 rq_offset; };
struct hns_roce_mw { @@ -899,6 +909,9 @@ struct hns_roce_hw { int (*clear_hem)(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, int obj, u32 step_idx); + int (*set_dca_buf)(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_dca_attr *attr); int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b8b57950432e..443487c1408e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4573,12 +4573,15 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, upper_32_bits(to_hr_hw_page_addr(mtts[0]))); hr_reg_clear(qpc_mask, QPC_RQ_CUR_BLK_ADDR_H);
- context->rq_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1])); - qpc_mask->rq_nxt_blk_addr = 0; - - hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, - upper_32_bits(to_hr_hw_page_addr(mtts[1]))); - hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + /* The rq next block address is only valid for HIP08 QPC. */ + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + context->rq_nxt_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(mtts[1])); + qpc_mask->rq_nxt_blk_addr = 0; + hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, + upper_32_bits(to_hr_hw_page_addr(mtts[1]))); + hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + }
return 0; } diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index ce7165073bbf..ebaf917a30bc 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -143,6 +143,7 @@ enum hns_ib_dca_mem_methods { HNS_IB_METHOD_DCA_MEM_REG = (1U << UVERBS_ID_NS_SHIFT), HNS_IB_METHOD_DCA_MEM_DEREG, HNS_IB_METHOD_DCA_MEM_SHRINK, + HNS_IB_METHOD_DCA_MEM_ATTACH, };
enum hns_ib_dca_mem_reg_attrs { @@ -163,4 +164,14 @@ enum hns_ib_dca_mem_shrink_attrs { HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_MEMS, };
+#define HNS_IB_ATTACH_FLAGS_NEW_BUFFER 1U + +enum hns_ib_dca_mem_attach_attrs { + HNS_IB_ATTR_DCA_MEM_ATTACH_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + HNS_IB_ATTR_DCA_MEM_ATTACH_SQ_OFFSET, + HNS_IB_ATTR_DCA_MEM_ATTACH_SGE_OFFSET, + HNS_IB_ATTR_DCA_MEM_ATTACH_RQ_OFFSET, + HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_FLAGS, + HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_PAGES, +}; #endif /* HNS_ABI_USER_H */
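The buffer id layout added to hns_roce_dca.h above (a 29-bit id split into a 7-bit attach tag and a 22-bit owner QPN) can be exercised with a small stand-alone program. This is only an illustrative sketch in plain user-space C; the mask values mirror GENMASK(28, 22) and GENMASK(21, 0) from the header, and the QPN and attach count are made up.

/* illustrative only: encode/decode a DCA buffer id (tag + owner QPN) */
#include <stdint.h>
#include <stdio.h>

#define DCA_TAG_MASK 0x1fc00000u	/* bits [28:22], GENMASK(28, 22) */
#define DCA_OWN_MASK 0x003fffffu	/* bits [21:0],  GENMASK(21, 0)  */

static uint32_t dca_to_buf_id(uint32_t qpn, uint32_t tag)
{
	return (qpn & DCA_OWN_MASK) | ((tag << 22) & DCA_TAG_MASK);
}

int main(void)
{
	uint32_t buf_id = dca_to_buf_id(0x123, 5);	/* hypothetical QPN/attach count */

	printf("buf_id=0x%08x qpn=0x%x tag=%u\n", buf_id,
	       buf_id & DCA_OWN_MASK, (buf_id & DCA_TAG_MASK) >> 22);
	return 0;
}

The tag comes from the QP's attach counter, so ids produced by different attachments of the same QP stay distinct.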
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
Add a new command to update the WQE buffer address configuration in the QPC when a QP works in DCA mode.
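The new command reuses the usual QPC context/mask pair: the mask is first filled with all ones and config_qp_rq_buf()/config_qp_sq_buf() clear only the bits of the buffer address fields, so the other QPC fields are left untouched. Below is a minimal sketch of that value/mask convention, assuming the common "cleared mask bit selects the field to update" semantics; it is plain user-space C for illustration, not driver code.

/* illustrative only: apply a context/mask update to one 32-bit field */
#include <stdint.h>
#include <stdio.h>

static uint32_t apply_masked_update(uint32_t old, uint32_t ctx, uint32_t mask)
{
	/* mask bit 1: keep the old value; mask bit 0: take the new context value */
	return (old & mask) | (ctx & ~mask);
}

int main(void)
{
	uint32_t old = 0xaabbccdd;	/* current QPC field (made-up value) */
	uint32_t ctx = 0x11223344;	/* new buffer address bits (made-up) */
	uint32_t mask = 0xffff0000;	/* only the low 16 bits get updated */

	printf("updated = 0x%08x\n", apply_masked_update(old, ctx, mask));
	return 0;
}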
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 153 +++++++++++++++++++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1 + 2 files changed, 140 insertions(+), 14 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 443487c1408e..1e9fa0f14268 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3104,6 +3104,16 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) free_dip_list(hr_dev); }
+static inline void mbox_desc_init(struct hns_roce_post_mbox *mb, + struct hns_roce_mbox_msg *mbox_msg) +{ + mb->in_param_l = cpu_to_le32(mbox_msg->in_param); + mb->in_param_h = cpu_to_le32(mbox_msg->in_param >> 32); + mb->out_param_l = cpu_to_le32(mbox_msg->out_param); + mb->out_param_h = cpu_to_le32(mbox_msg->out_param >> 32); + mb->cmd_tag = cpu_to_le32(mbox_msg->tag << 8 | mbox_msg->cmd); +} + static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, struct hns_roce_mbox_msg *mbox_msg) { @@ -3112,17 +3122,34 @@ static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev,
hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_POST_MB, false);
- mb->in_param_l = cpu_to_le32(mbox_msg->in_param); - mb->in_param_h = cpu_to_le32(mbox_msg->in_param >> 32); - mb->out_param_l = cpu_to_le32(mbox_msg->out_param); - mb->out_param_h = cpu_to_le32(mbox_msg->out_param >> 32); - mb->cmd_tag = cpu_to_le32(mbox_msg->tag << 8 | mbox_msg->cmd); + mbox_desc_init(mb, mbox_msg); mb->token_event_en = cpu_to_le32(mbox_msg->event_en << 16 | mbox_msg->token);
return hns_roce_cmq_send(hr_dev, &desc, 1); }
+static int hns_roce_mbox_send(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_post_mbox *mb = (struct hns_roce_post_mbox *)desc.data; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_SYNC_MB, false); + + mbox_desc_init(mb, mbox_msg); + + /* The hardware doesn't care about the token fields when working in + * sync mode. + */ + mb->token_event_en = 0; + + /* The cmdq send returns 0 indicates that the hardware has already + * finished the operation defined in this mbox. + */ + return hns_roce_cmq_send(hr_dev, &desc, 1); +} + static int v2_wait_mbox_complete(struct hns_roce_dev *hr_dev, u32 timeout, u8 *complete_status) { @@ -4515,15 +4542,16 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *qpc_mask, + struct hns_roce_dca_attr *dca_attr) { u64 mtts[MTT_MIN_COUNT] = { 0 }; u64 wqe_sge_ba; int count;
/* Search qp buf's mtts */ - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, mtts, - MTT_MIN_COUNT, &wqe_sge_ba); + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, dca_attr->rq_offset, + mtts, ARRAY_SIZE(mtts), &wqe_sge_ba); if (hr_qp->rq.wqe_cnt && count < 1) { ibdev_err(&hr_dev->ib_dev, "failed to find RQ WQE, QPN = 0x%lx.\n", hr_qp->qpn); @@ -4589,7 +4617,8 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, static int config_qp_sq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *qpc_mask, + struct hns_roce_dca_attr *dca_attr) { struct ib_device *ibdev = &hr_dev->ib_dev; u64 sge_cur_blk = 0; @@ -4597,7 +4626,8 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev, int count;
/* search qp buf's mtts */ - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL); + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, dca_attr->sq_offset, + &sq_cur_blk, 1, NULL); if (count < 1) { ibdev_err(ibdev, "failed to find QP(0x%lx) SQ buf.\n", hr_qp->qpn); @@ -4605,8 +4635,8 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev, } if (hr_qp->sge.sge_cnt > 0) { count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, - hr_qp->sge.offset, - &sge_cur_blk, 1, NULL); + dca_attr->sge_offset, &sge_cur_blk, 1, + NULL); if (count < 1) { ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf.\n", hr_qp->qpn); @@ -4664,6 +4694,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_dca_attr dca_attr = {}; dma_addr_t trrl_ba; dma_addr_t irrl_ba; enum ib_mtu ib_mtu; @@ -4675,7 +4706,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, int port; int ret;
- ret = config_qp_rq_buf(hr_dev, hr_qp, context, qpc_mask); + dca_attr.rq_offset = hr_qp->rq.offset; + ret = config_qp_rq_buf(hr_dev, hr_qp, context, qpc_mask, &dca_attr); if (ret) { ibdev_err(ibdev, "failed to config rq buf, ret = %d.\n", ret); return ret; @@ -4821,6 +4853,7 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_dca_attr dca_attr = {}; int ret;
/* Not support alternate path and path migration */ @@ -4829,7 +4862,9 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, return -EINVAL; }
- ret = config_qp_sq_buf(hr_dev, hr_qp, context, qpc_mask); + dca_attr.sq_offset = hr_qp->sq.offset; + dca_attr.sge_offset = hr_qp->sge.offset; + ret = config_qp_sq_buf(hr_dev, hr_qp, context, qpc_mask, &dca_attr); if (ret) { ibdev_err(ibdev, "failed to config sq buf, ret = %d.\n", ret); return ret; @@ -5491,6 +5526,95 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, return ret; }
+static int init_dca_buf_attr(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_dca_attr *init_attr, + struct hns_roce_dca_attr *dca_attr) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + + if (hr_qp->sq.wqe_cnt > 0) { + dca_attr->sq_offset = hr_qp->sq.offset + init_attr->sq_offset; + if (dca_attr->sq_offset >= hr_qp->sge.offset) { + ibdev_err(ibdev, "failed to check SQ offset = %u\n", + init_attr->sq_offset); + return -EINVAL; + } + } + + if (hr_qp->sge.sge_cnt > 0) { + dca_attr->sge_offset = hr_qp->sge.offset + init_attr->sge_offset; + if (dca_attr->sge_offset >= hr_qp->rq.offset) { + ibdev_err(ibdev, "failed to check exSGE offset = %u\n", + init_attr->sge_offset); + return -EINVAL; + } + } + + if (hr_qp->rq.wqe_cnt > 0) { + dca_attr->rq_offset = hr_qp->rq.offset + init_attr->rq_offset; + if (dca_attr->rq_offset >= hr_qp->buff_size) { + ibdev_err(ibdev, "failed to check RQ offset = %u\n", + init_attr->rq_offset); + return -EINVAL; + } + } + + return 0; +} + +static int hns_roce_v2_set_dca_buf(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_dca_attr *init_attr) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_v2_qp_context *qpc, *msk; + struct hns_roce_dca_attr dca_attr = {}; + struct hns_roce_mbox_msg mbox_msg = {}; + dma_addr_t dma_handle; + int qpc_sz; + int ret; + + ret = init_dca_buf_attr(hr_dev, hr_qp, init_attr, &dca_attr); + if (ret) { + ibdev_err(ibdev, "failed to init DCA attr, ret = %d.\n", ret); + return ret; + } + + qpc_sz = hr_dev->caps.qpc_sz; + WARN_ON(2 * qpc_sz > HNS_ROCE_MAILBOX_SIZE); + qpc = dma_pool_alloc(hr_dev->cmd.pool, GFP_NOWAIT, &dma_handle); + if (!qpc) + return -ENOMEM; + + msk = (struct hns_roce_v2_qp_context *)((void *)qpc + qpc_sz); + memset(msk, 0xff, qpc_sz); + + ret = config_qp_rq_buf(hr_dev, hr_qp, qpc, msk, &dca_attr); + if (ret) { + ibdev_err(ibdev, "failed to config rq qpc, ret = %d.\n", ret); + goto done; + } + + ret = config_qp_sq_buf(hr_dev, hr_qp, qpc, msk, &dca_attr); + if (ret) { + ibdev_err(ibdev, "failed to config sq qpc, ret = %d.\n", ret); + goto done; + } + + mbox_msg.in_param = dma_handle; + mbox_msg.tag = hr_qp->qpn; + mbox_msg.cmd = HNS_ROCE_CMD_MODIFY_QPC; + ret = hns_roce_mbox_send(hr_dev, &mbox_msg); + if (ret) + ibdev_err(ibdev, "failed to modify DCA buf, ret = %d.\n", ret); + +done: + dma_pool_free(hr_dev->cmd.pool, qpc, dma_handle); + + return ret; +} + static int to_ib_qp_st(enum hns_roce_v2_qp_state state) { static const enum ib_qp_state map[] = { @@ -6866,6 +6990,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .write_cqc = hns_roce_v2_write_cqc, .set_hem = hns_roce_v2_set_hem, .clear_hem = hns_roce_v2_clear_hem, + .set_dca_buf = hns_roce_v2_set_dca_buf, .modify_qp = hns_roce_v2_modify_qp, .dereg_mr = hns_roce_v2_dereg_mr, .qp_flow_control_init = hns_roce_v2_qp_flow_control_init, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 6e3d96caa5c1..e01d24f95933 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -250,6 +250,7 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_QUERY_VF_RES = 0x850e, HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f, HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, + HNS_ROCE_OPC_SYNC_MB = 0x8511, HNS_ROCE_OPC_EXT_CFG = 0x8512, HNS_ROCE_QUERY_RAM_ECC = 0x8513, HNS_SWITCH_PARAMETER_CFG = 0x1033,
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
If a uQP works in DCA mode, the userspace driver needs to drop the WQE buffer by calling the 'HNS_IB_METHOD_DCA_MEM_DETACH' method when the QP's CI is equal to its PI. At that point the hns ROCEE will no longer access the WQE buffer, so the userspace driver can free it.
This method queues a delayed work in kernel space to recycle the WQE buffer. If the WQE buffer is indeed no longer being accessed by the hns ROCEE, the worker marks the pages as free in the DCA memory pool.
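The worker only recycles the buffer once the hardware is really done with it. The check added in hns_roce_v2_chk_dca_buf_inactive() below treats the QP as inactive when it is in RESET or ERROR state, or when the QP has no RQ and the SQ index derived from the retry_msg_msn field of the QPC matches the SQ producer index recorded at detach time. The following is a stand-alone sketch of that decision in plain C, using illustrative field names and state values rather than the real QPC layout.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { QP_ST_RST, QP_ST_RTS, QP_ST_ERR };	/* illustrative states */

struct qp_snapshot {
	int state;		/* QP state read back from the QPC */
	uint32_t rq_wqe_cnt;	/* RQ depth */
	uint32_t sq_wqe_cnt;	/* SQ depth, power of two */
	uint32_t retry_msn;	/* retry_msg_msn field from the QPC */
	uint32_t detach_sq_idx;	/* SQ producer index passed to DETACH */
};

static bool dca_buf_is_inactive(const struct qp_snapshot *qp)
{
	if (qp->state == QP_ST_ERR || qp->state == QP_ST_RST)
		return true;

	/* a non-empty RQ keeps the buffer active until the QP stops */
	if (qp->rq_wqe_cnt > 0)
		return false;

	/* the SQ is drained once the retried MSN reaches the detach index */
	if (qp->sq_wqe_cnt > 0 &&
	    (qp->retry_msn & (qp->sq_wqe_cnt - 1)) != qp->detach_sq_idx)
		return false;

	return true;
}

int main(void)
{
	struct qp_snapshot qp = {
		.state = QP_ST_RTS, .sq_wqe_cnt = 64,
		.retry_msn = 130, .detach_sq_idx = 2,
	};

	printf("inactive: %d\n", dca_buf_is_inactive(&qp));	/* 130 & 63 == 2 */
	return 0;
}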
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_dca.c | 164 +++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_dca.h | 7 +- drivers/infiniband/hw/hns/hns_roce_device.h | 4 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 50 ++++++ drivers/infiniband/hw/hns/hns_roce_qp.c | 4 +- include/uapi/rdma/hns-abi.h | 6 + 6 files changed, 230 insertions(+), 5 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c index 21b360be4996..1be0e9822c9a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.c +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -15,6 +15,9 @@ #define UVERBS_MODULE_NAME hns_ib #include <rdma/uverbs_named_ioctl.h>
+/* DCA mem ageing interval time */ +#define DCA_MEM_AGEING_MSES 1000 + /* DCA memory */ struct dca_mem { #define DCA_MEM_FLAGS_ALLOCED BIT(0) @@ -42,6 +45,12 @@ static inline void set_dca_page_to_free(struct hns_dca_page_state *state) state->lock = 0; }
+static inline void set_dca_page_to_inactive(struct hns_dca_page_state *state) +{ + state->active = 0; + state->lock = 0; +} + static inline void lock_dca_page_to_attach(struct hns_dca_page_state *state, u32 buf_id) { @@ -710,7 +719,10 @@ static int attach_dca_mem(struct hns_roce_dev *hr_dev, u32 buf_id; int ret;
+ /* Stop DCA mem ageing worker */ + cancel_delayed_work(&cfg->dwork); resp->alloc_flags = 0; + spin_lock(&cfg->lock); buf_id = cfg->buf_id; /* Already attached */ @@ -749,20 +761,140 @@ static int attach_dca_mem(struct hns_roce_dev *hr_dev, return 0; }
+struct dca_page_free_buf_attr { + u32 buf_id; + u32 max_pages; + u32 free_pages; + u32 clean_mems; +}; + +static int free_buffer_pages_proc(struct dca_mem *mem, int index, void *param) +{ + struct dca_page_free_buf_attr *attr = param; + struct hns_dca_page_state *state; + bool changed = false; + bool stop = false; + int i, free_pages; + + free_pages = 0; + for (i = 0; !stop && i < mem->page_count; i++) { + state = &mem->states[i]; + /* Change matched pages state */ + if (dca_page_is_attached(state, attr->buf_id)) { + set_dca_page_to_free(state); + changed = true; + attr->free_pages++; + if (attr->free_pages == attr->max_pages) + stop = true; + } + + if (dca_page_is_free(state)) + free_pages++; + } + + for (; changed && i < mem->page_count; i++) + if (dca_page_is_free(state)) + free_pages++; + + if (changed && free_pages == mem->page_count) + attr->clean_mems++; + + return stop ? DCA_MEM_STOP_ITERATE : DCA_MEM_NEXT_ITERATE; +} + +static void free_buf_from_dca_mem(struct hns_roce_dca_ctx *ctx, + struct hns_roce_dca_cfg *cfg) +{ + struct dca_page_free_buf_attr attr = {}; + unsigned long flags; + u32 buf_id; + + spin_lock(&cfg->lock); + buf_id = cfg->buf_id; + cfg->buf_id = HNS_DCA_INVALID_BUF_ID; + spin_unlock(&cfg->lock); + if (buf_id == HNS_DCA_INVALID_BUF_ID) + return; + + attr.buf_id = buf_id; + attr.max_pages = cfg->npages; + travel_dca_pages(ctx, &attr, free_buffer_pages_proc); + + /* Update free size */ + spin_lock_irqsave(&ctx->pool_lock, flags); + ctx->free_mems += attr.clean_mems; + ctx->free_size += attr.free_pages << HNS_HW_PAGE_SHIFT; + spin_unlock_irqrestore(&ctx->pool_lock, flags); +} + +static void kick_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_dca_cfg *cfg, + struct hns_roce_ucontext *uctx) +{ + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx); + + /* Stop ageing worker and free DCA buffer from pool */ + cancel_delayed_work_sync(&cfg->dwork); + free_buf_from_dca_mem(ctx, cfg); +} + +static void dca_mem_ageing_work(struct work_struct *work) +{ + struct hns_roce_qp *hr_qp = container_of(work, struct hns_roce_qp, + dca_cfg.dwork.work); + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_qp); + bool hw_is_inactive; + + hw_is_inactive = hr_dev->hw->chk_dca_buf_inactive && + hr_dev->hw->chk_dca_buf_inactive(hr_dev, hr_qp); + if (hw_is_inactive) + free_buf_from_dca_mem(ctx, &hr_qp->dca_cfg); +} + +void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +{ + struct hns_roce_ucontext *uctx; + + if (hr_qp->ibqp.uobject && hr_qp->ibqp.pd->uobject) { + uctx = to_hr_ucontext(hr_qp->ibqp.pd->uobject->context); + kick_dca_mem(hr_dev, &hr_qp->dca_cfg, uctx); + } +} + +static void detach_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_dca_detach_attr *attr) +{ + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + + /* Start an ageing worker to free buffer */ + cancel_delayed_work(&cfg->dwork); + spin_lock(&cfg->lock); + cfg->sq_idx = attr->sq_idx; + queue_delayed_work(hr_dev->irq_workq, &cfg->dwork, + msecs_to_jiffies(DCA_MEM_AGEING_MSES)); + spin_unlock(&cfg->lock); +} + void hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
spin_lock_init(&cfg->lock); + INIT_DELAYED_WORK(&cfg->dwork, dca_mem_ageing_work); cfg->buf_id = HNS_DCA_INVALID_BUF_ID; cfg->npages = hr_qp->buff_size >> HNS_HW_PAGE_SHIFT; }
void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp) + struct hns_roce_qp *hr_qp, struct ib_udata *udata) { + struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, + struct hns_roce_ucontext, ibucontext); struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
+ kick_dca_mem(hr_dev, cfg, uctx); cfg->buf_id = HNS_DCA_INVALID_BUF_ID; }
@@ -951,12 +1083,40 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_PAGES, UVERBS_ATTR_TYPE(u32), UA_MANDATORY));
+static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_DETACH)( + struct uverbs_attr_bundle *attrs) +{ + struct hns_roce_qp *hr_qp = uverbs_attr_to_hr_qp(attrs); + struct hns_dca_detach_attr attr = {}; + int ret; + + if (!hr_qp) + return -EINVAL; + + ret = uverbs_copy_from(&attr.sq_idx, attrs, + HNS_IB_ATTR_DCA_MEM_DETACH_SQ_INDEX); + if (ret) + return ret; + + detach_dca_mem(to_hr_dev(hr_qp->ibqp.device), hr_qp, &attr); + + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD( + HNS_IB_METHOD_DCA_MEM_DETACH, + UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_DETACH_HANDLE, UVERBS_OBJECT_QP, + UVERBS_ACCESS_WRITE, UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_DETACH_SQ_INDEX, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); + DECLARE_UVERBS_NAMED_OBJECT(HNS_IB_OBJECT_DCA_MEM, UVERBS_TYPE_ALLOC_IDR(dca_cleanup), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_REG), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DEREG), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_SHRINK), - &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_ATTACH)); + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_ATTACH), + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DETACH));
static bool dca_is_supported(struct ib_device *device) { diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h index f9eea9beb092..fdc3aaa4b10b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.h +++ b/drivers/infiniband/hw/hns/hns_roce_dca.h @@ -50,6 +50,10 @@ struct hns_dca_attach_resp { u32 alloc_pages; };
+struct hns_dca_detach_attr { + u32 sq_idx; +}; + void hns_roce_register_udca(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx); void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, @@ -58,5 +62,6 @@ void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, void hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp); + struct hns_roce_qp *hr_qp, struct ib_udata *udata); +void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index a58b7d598f4f..ac9dcdf59887 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -319,6 +319,8 @@ struct hns_roce_dca_cfg { u32 buf_id; u16 attach_count; u32 npages; + u32 sq_idx; + struct delayed_work dwork; };
/* DCA attr for setting WQE buffer */ @@ -912,6 +914,8 @@ struct hns_roce_hw { int (*set_dca_buf)(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_dca_attr *attr); + bool (*chk_dca_buf_inactive)(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp); int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 1e9fa0f14268..f14a8e41aafa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -47,6 +47,7 @@ #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" +#include "hns_roce_dca.h" #include "hns_roce_hw_v2.h"
enum { @@ -370,6 +371,11 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, return 0; }
+static inline bool check_qp_dca_enable(struct hns_roce_qp *hr_qp) +{ + return !!(hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA); +} + static int check_send_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { @@ -5522,6 +5528,10 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, if (new_state == IB_QPS_RESET && !ibqp->uobject) clear_qp(hr_qp);
+ if (check_qp_dca_enable(hr_qp) && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + hns_roce_dca_kick(hr_dev, hr_qp); + out: return ret; } @@ -5754,6 +5764,45 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, return ret; }
+static bool hns_roce_v2_chk_dca_buf_inactive(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + struct hns_roce_v2_qp_context context = {}; + struct ib_device *ibdev = &hr_dev->ib_dev; + u32 tmp, sq_idx; + int state; + int ret; + + ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context); + if (ret) { + ibdev_err(ibdev, "failed to query DCA QPC, ret = %d.\n", ret); + return false; + } + + state = hr_reg_read(&context, QPC_QP_ST); + if (state == HNS_ROCE_QP_ST_ERR || state == HNS_ROCE_QP_ST_RST) + return true; + + /* If RQ is not empty, the buffer is always active until the QP stops + * working. + */ + if (hr_qp->rq.wqe_cnt > 0) + return false; + + if (hr_qp->sq.wqe_cnt > 0) { + tmp = (u32)hr_reg_read(&context, QPC_RETRY_MSG_MSN); + sq_idx = tmp & (hr_qp->sq.wqe_cnt - 1); + /* If SQ-PI equals to retry_msg_msn in QPC, the QP is + * inactive. + */ + if (sq_idx != cfg->sq_idx) + return false; + } + + return true; +} + static inline int modify_qp_is_ok(struct hns_roce_qp *hr_qp) { return ((hr_qp->ibqp.qp_type == IB_QPT_RC || @@ -6991,6 +7040,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .set_hem = hns_roce_v2_set_hem, .clear_hem = hns_roce_v2_clear_hem, .set_dca_buf = hns_roce_v2_set_dca_buf, + .chk_dca_buf_inactive = hns_roce_v2_chk_dca_buf_inactive, .modify_qp = hns_roce_v2_modify_qp, .dereg_mr = hns_roce_v2_dereg_mr, .qp_flow_control_init = hns_roce_v2_qp_flow_control_init, diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index e42461d5ea4c..36868618cd51 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -791,7 +791,7 @@ static int alloc_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, if (ret) { ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret); if (dca_en) - hns_roce_disable_dca(hr_dev, hr_qp); + hns_roce_disable_dca(hr_dev, hr_qp, udata); }
return ret; @@ -803,7 +803,7 @@ static void free_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr);
if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) - hns_roce_disable_dca(hr_dev, hr_qp); + hns_roce_disable_dca(hr_dev, hr_qp, udata); }
static int alloc_qp_wqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index ebaf917a30bc..96daeccfb7dd 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -144,6 +144,7 @@ enum hns_ib_dca_mem_methods { HNS_IB_METHOD_DCA_MEM_DEREG, HNS_IB_METHOD_DCA_MEM_SHRINK, HNS_IB_METHOD_DCA_MEM_ATTACH, + HNS_IB_METHOD_DCA_MEM_DETACH, };
enum hns_ib_dca_mem_reg_attrs { @@ -174,4 +175,9 @@ enum hns_ib_dca_mem_attach_attrs { HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_FLAGS, HNS_IB_ATTR_DCA_MEM_ATTACH_OUT_ALLOC_PAGES, }; + +enum hns_ib_dca_mem_detach_attrs { + HNS_IB_ATTR_DCA_MEM_DETACH_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + HNS_IB_ATTR_DCA_MEM_DETACH_SQ_INDEX, +}; #endif /* HNS_ABI_USER_H */
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
If a uQP works in DCA mode, the userspace driver needs to get the buffer's address in the DCA memory pool by calling the 'HNS_IB_METHOD_DCA_MEM_QUERY' method after the QP has been attached by calling the 'HNS_IB_METHOD_DCA_MEM_ATTACH' method.
This method returns the DCA mem object's key and an offset, from which the userspace driver can compute the WQE's virtual address in the DCA memory pool.
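In other words, the userspace side is expected to keep a table mapping each registered DCA mem's key to the virtual address of the buffer it registered, and the QUERY output is just (key, byte offset, contiguous page count). The helper below is a hypothetical user-space sketch of that lookup, not rdma-core or hns provider API; all names and values here are illustrative.

/* hypothetical user-space helper, not rdma-core or hns provider API */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct dca_mem_entry {
	uint64_t key;	/* key identifying one registered DCA mem */
	void *base;	/* userspace virtual address of that buffer */
};

static void *dca_wqe_addr(const struct dca_mem_entry *tbl, size_t n,
			  uint64_t mem_key, uint32_t byte_offset)
{
	for (size_t i = 0; i < n; i++)
		if (tbl[i].key == mem_key)
			return (char *)tbl[i].base + byte_offset;

	return NULL;	/* key not registered by this process */
}

int main(void)
{
	static char buf[4096 * 8];	/* pretend this was registered for DCA */
	struct dca_mem_entry tbl[] = { { .key = 0x1, .base = buf } };

	/* key/offset values the QUERY method might have returned (made up) */
	printf("wqe at %p\n", dca_wqe_addr(tbl, 1, 0x1, 2 * 4096));
	return 0;
}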
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_dca.c | 105 ++++++++++++++++++++++- include/uapi/rdma/hns-abi.h | 10 +++ 2 files changed, 114 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c index 1be0e9822c9a..97450d8dcd88 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.c +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -80,6 +80,14 @@ static inline bool dca_page_is_attached(struct hns_dca_page_state *state, (HNS_DCA_OWN_MASK & state->buf_id); }
+static inline bool dca_page_is_active(struct hns_dca_page_state *state, + u32 buf_id) +{ + /* all buf id bits must be matched */ + return (HNS_DCA_ID_MASK & buf_id) == state->buf_id && + !state->lock && state->active; +} + static inline bool dca_page_is_allocated(struct hns_dca_page_state *state, u32 buf_id) { @@ -761,6 +769,47 @@ static int attach_dca_mem(struct hns_roce_dev *hr_dev, return 0; }
+struct dca_page_query_active_attr { + u32 buf_id; + u32 curr_index; + u32 start_index; + u32 page_index; + u32 page_count; + u64 mem_key; +}; + +static int query_dca_active_pages_proc(struct dca_mem *mem, int index, + void *param) +{ + struct hns_dca_page_state *state = &mem->states[index]; + struct dca_page_query_active_attr *attr = param; + + if (!dca_page_is_active(state, attr->buf_id)) + return 0; + + if (attr->curr_index < attr->start_index) { + attr->curr_index++; + return 0; + } else if (attr->curr_index > attr->start_index) { + return DCA_MEM_STOP_ITERATE; + } + + /* Search first page in DCA mem */ + attr->page_index = index; + attr->mem_key = mem->key; + /* Search active pages in continuous addresses */ + while (index < mem->page_count) { + state = &mem->states[index]; + if (!dca_page_is_active(state, attr->buf_id)) + break; + + index++; + attr->page_count++; + } + + return DCA_MEM_STOP_ITERATE; +} + struct dca_page_free_buf_attr { u32 buf_id; u32 max_pages; @@ -1110,13 +1159,67 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_DETACH_SQ_INDEX, UVERBS_ATTR_TYPE(u32), UA_MANDATORY));
+static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + struct hns_roce_qp *hr_qp = uverbs_attr_to_hr_qp(attrs); + struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_qp); + struct dca_page_query_active_attr active_attr = {}; + u32 page_idx, page_ofs; + int ret; + + if (!hr_qp) + return -EINVAL; + + ret = uverbs_copy_from(&page_idx, attrs, + HNS_IB_ATTR_DCA_MEM_QUERY_PAGE_INDEX); + if (ret) + return ret; + + active_attr.buf_id = hr_qp->dca_cfg.buf_id; + active_attr.start_index = page_idx; + travel_dca_pages(ctx, &active_attr, query_dca_active_pages_proc); + page_ofs = active_attr.page_index << HNS_HW_PAGE_SHIFT; + + if (!active_attr.page_count) + return -ENOMEM; + + ret = uverbs_copy_to(attrs, HNS_IB_ATTR_DCA_MEM_QUERY_OUT_KEY, + &active_attr.mem_key, sizeof(active_attr.mem_key)); + if (!ret) + ret = uverbs_copy_to(attrs, + HNS_IB_ATTR_DCA_MEM_QUERY_OUT_OFFSET, + &page_ofs, sizeof(page_ofs)); + if (!ret) + ret = uverbs_copy_to(attrs, + HNS_IB_ATTR_DCA_MEM_QUERY_OUT_PAGE_COUNT, + &active_attr.page_count, + sizeof(active_attr.page_count)); + + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + HNS_IB_METHOD_DCA_MEM_QUERY, + UVERBS_ATTR_IDR(HNS_IB_ATTR_DCA_MEM_QUERY_HANDLE, UVERBS_OBJECT_QP, + UVERBS_ACCESS_READ, UA_MANDATORY), + UVERBS_ATTR_PTR_IN(HNS_IB_ATTR_DCA_MEM_QUERY_PAGE_INDEX, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_QUERY_OUT_KEY, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_QUERY_OUT_OFFSET, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(HNS_IB_ATTR_DCA_MEM_QUERY_OUT_PAGE_COUNT, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); + DECLARE_UVERBS_NAMED_OBJECT(HNS_IB_OBJECT_DCA_MEM, UVERBS_TYPE_ALLOC_IDR(dca_cleanup), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_REG), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DEREG), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_SHRINK), &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_ATTACH), - &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DETACH)); + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_DETACH), + &UVERBS_METHOD(HNS_IB_METHOD_DCA_MEM_QUERY));
static bool dca_is_supported(struct ib_device *device) { diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 96daeccfb7dd..3429df900cdf 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -145,6 +145,7 @@ enum hns_ib_dca_mem_methods { HNS_IB_METHOD_DCA_MEM_SHRINK, HNS_IB_METHOD_DCA_MEM_ATTACH, HNS_IB_METHOD_DCA_MEM_DETACH, + HNS_IB_METHOD_DCA_MEM_QUERY, };
enum hns_ib_dca_mem_reg_attrs { @@ -180,4 +181,13 @@ enum hns_ib_dca_mem_detach_attrs { HNS_IB_ATTR_DCA_MEM_DETACH_HANDLE = (1U << UVERBS_ID_NS_SHIFT), HNS_IB_ATTR_DCA_MEM_DETACH_SQ_INDEX, }; + +enum hns_ib_dca_mem_query_attrs { + HNS_IB_ATTR_DCA_MEM_QUERY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + HNS_IB_ATTR_DCA_MEM_QUERY_PAGE_INDEX, + HNS_IB_ATTR_DCA_MEM_QUERY_OUT_KEY, + HNS_IB_ATTR_DCA_MEM_QUERY_OUT_OFFSET, + HNS_IB_ATTR_DCA_MEM_QUERY_OUT_PAGE_COUNT, +}; + #endif /* HNS_ABI_USER_H */
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
This patch adds DCA support for kernel space.
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_dca.c | 1326 ++++++++++++------- drivers/infiniband/hw/hns/hns_roce_dca.h | 25 +- drivers/infiniband/hw/hns/hns_roce_device.h | 45 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 214 +-- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 4 + drivers/infiniband/hw/hns/hns_roce_main.c | 32 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 12 +- drivers/infiniband/hw/hns/hns_roce_qp.c | 32 +- include/uapi/rdma/hns-abi.h | 3 +- 9 files changed, 1088 insertions(+), 605 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c index 97450d8dcd88..f33a59ef3bc3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.c +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -104,25 +104,71 @@ static inline bool dca_mem_is_available(struct dca_mem *mem) return mem->flags == (DCA_MEM_FLAGS_ALLOCED | DCA_MEM_FLAGS_REGISTERED); }
-static void *alloc_dca_pages(struct hns_roce_dev *hr_dev, struct dca_mem *mem, - struct dca_mem_attr *attr) +static void free_dca_pages(struct hns_roce_dev *hr_dev, bool is_user, + void *pages) +{ + if (is_user) + ib_umem_release(pages); + else + hns_roce_buf_free(hr_dev, pages); +} + +static void *alloc_dca_pages(struct hns_roce_dev *hr_dev, bool is_user, + struct dca_mem *mem, struct dca_mem_attr *attr) { struct ib_device *ibdev = &hr_dev->ib_dev; - struct ib_umem *umem; + struct hns_roce_buf *kmem; + + if (is_user) { + struct ib_umem *umem; + + umem = ib_umem_get(ibdev, attr->addr, attr->size, 0); + if (IS_ERR(umem)) { + ibdev_err(ibdev, "failed to get uDCA pages, ret = %ld.\n", + PTR_ERR(umem)); + return NULL; + }
- umem = ib_umem_get(ibdev, attr->addr, attr->size, 0); - if (IS_ERR(umem)) { - ibdev_err(ibdev, "failed to get uDCA pages, ret = %ld.\n", - PTR_ERR(umem)); + mem->page_count = ib_umem_num_dma_blocks(umem, + HNS_HW_PAGE_SIZE); + return umem; + } + + kmem = hns_roce_buf_alloc(hr_dev, attr->size, HNS_HW_PAGE_SHIFT, + HNS_ROCE_BUF_NOSLEEP | HNS_ROCE_BUF_NOFAIL); + if (IS_ERR(kmem)) { + ibdev_err(ibdev, "failed to alloc kDCA pages, ret = %ld.\n", + PTR_ERR(kmem)); return NULL; }
- mem->page_count = ib_umem_num_dma_blocks(umem, HNS_HW_PAGE_SIZE); + mem->page_count = kmem->npages; + /* Override the attr->size by actually alloced size */ + attr->size = kmem->ntrunks << kmem->trunk_shift; + return kmem; + +} + +static void init_dca_kmem_states(struct hns_roce_dev *hr_dev, + struct hns_dca_page_state *states, int count, + struct hns_roce_buf *kmem) +{ + dma_addr_t cur_addr; + dma_addr_t pre_addr; + int i; + + pre_addr = 0; + for (i = 0; i < kmem->npages && i < count; i++) { + cur_addr = hns_roce_buf_page(kmem, i); + if (cur_addr - pre_addr != HNS_HW_PAGE_SIZE) + states[i].head = 1;
- return umem; + pre_addr = cur_addr; + } }
-static void init_dca_umem_states(struct hns_dca_page_state *states, int count, +static void init_dca_umem_states(struct hns_roce_dev *hr_dev, + struct hns_dca_page_state *states, int count, struct ib_umem *umem) { struct ib_block_iter biter; @@ -144,7 +190,9 @@ static void init_dca_umem_states(struct hns_dca_page_state *states, int count, } }
-static struct hns_dca_page_state *alloc_dca_states(void *pages, int count) +static struct hns_dca_page_state *alloc_dca_states(struct hns_roce_dev *hr_dev, + void *pages, int count, + bool is_user) { struct hns_dca_page_state *states;
@@ -152,7 +200,10 @@ static struct hns_dca_page_state *alloc_dca_states(void *pages, int count) if (!states) return NULL;
- init_dca_umem_states(states, count, pages); + if (is_user) + init_dca_umem_states(hr_dev, states, count, pages); + else + init_dca_kmem_states(hr_dev, states, count, pages);
return states; } @@ -192,17 +243,143 @@ static void travel_dca_pages(struct hns_roce_dca_ctx *ctx, void *param, spin_unlock_irqrestore(&ctx->pool_lock, flags); }
-/* user DCA is managed by ucontext */ +struct dca_get_alloced_pages_attr { + u32 buf_id; + dma_addr_t *pages; + u32 total; + u32 max; +}; + +static int get_alloced_kmem_proc(struct dca_mem *mem, int index, void *param) + +{ + struct dca_get_alloced_pages_attr *attr = param; + struct hns_dca_page_state *states = mem->states; + struct hns_roce_buf *kmem = mem->pages; + u32 i; + + for (i = 0; i < kmem->npages; i++) { + if (dca_page_is_allocated(&states[i], attr->buf_id)) { + attr->pages[attr->total++] = hns_roce_buf_page(kmem, i); + if (attr->total >= attr->max) + return DCA_MEM_STOP_ITERATE; + } + } + + return DCA_MEM_NEXT_ITERATE; +} + +static int get_alloced_umem_proc(struct dca_mem *mem, int index, void *param) + +{ + struct dca_get_alloced_pages_attr *attr = param; + struct hns_dca_page_state *states = mem->states; + struct ib_umem *umem = mem->pages; + struct ib_block_iter biter; + u32 i = 0; + + rdma_for_each_block(umem->sg_head.sgl, &biter, + umem->sg_head.nents, HNS_HW_PAGE_SIZE) { + if (dca_page_is_allocated(&states[i], attr->buf_id)) { + attr->pages[attr->total++] = + rdma_block_iter_dma_address(&biter); + if (attr->total >= attr->max) + return DCA_MEM_STOP_ITERATE; + } + i++; + } + + return DCA_MEM_NEXT_ITERATE; +} + +/* user DCA is managed by ucontext, kernel DCA is managed by device */ +static inline struct hns_roce_dca_ctx * +to_hr_dca_ctx(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx) +{ + return uctx ? &uctx->dca_ctx : &hr_dev->dca_ctx; +} + static inline struct hns_roce_dca_ctx * -to_hr_dca_ctx(struct hns_roce_ucontext *uctx) +hr_qp_to_dca_ctx(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +{ + struct hns_roce_ucontext *uctx = NULL; + + if (hr_qp->ibqp.pd->uobject) + uctx = to_hr_ucontext(hr_qp->ibqp.pd->uobject->context); + + return to_hr_dca_ctx(hr_dev, uctx); +} + +static int config_dca_qpc(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, dma_addr_t *pages, + int page_count) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_mtr *mtr = &hr_qp->mtr; + int ret; + + ret = hns_roce_mtr_map(hr_dev, mtr, pages, page_count); + if (ret) { + ibdev_err(ibdev, "failed to map DCA pages, ret = %d.\n", ret); + return ret; + } + + if (hr_dev->hw->set_dca_buf) { + ret = hr_dev->hw->set_dca_buf(hr_dev, hr_qp); + if (ret) { + ibdev_err(ibdev, "failed to set DCA to HW, ret = %d.\n", + ret); + return ret; + } + } + + return 0; +} + +static int setup_dca_buf_to_hw(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_dca_ctx *ctx, u32 buf_id, + u32 count) { - return &uctx->dca_ctx; + struct dca_get_alloced_pages_attr attr = {}; + dma_addr_t *pages; + int ret; + + /* alloc a tmp array to store buffer's dma address */ + pages = kvcalloc(count, sizeof(dma_addr_t), GFP_ATOMIC); + if (!pages) + return -ENOMEM; + + attr.buf_id = buf_id; + attr.pages = pages; + attr.max = count; + + if (hr_qp->ibqp.uobject) + travel_dca_pages(ctx, &attr, get_alloced_umem_proc); + else + travel_dca_pages(ctx, &attr, get_alloced_kmem_proc); + + if (attr.total != count) { + ibdev_err(&hr_dev->ib_dev, "failed to get DCA page %u != %u.\n", + attr.total, count); + ret = -ENOMEM; + goto err_get_pages; + } + + ret = config_dca_qpc(hr_dev, hr_qp, pages, count); +err_get_pages: + /* drop tmp array */ + kvfree(pages); + + return ret; }
-static void unregister_dca_mem(struct hns_roce_ucontext *uctx, +static void unregister_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx, struct dca_mem *mem) { - struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx); + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); + bool is_user = !!uctx; unsigned long flags; void *states, *pages;
@@ -224,24 +401,25 @@ static void unregister_dca_mem(struct hns_roce_ucontext *uctx, spin_unlock_irqrestore(&ctx->pool_lock, flags);
kfree(states); - ib_umem_release(pages); + free_dca_pages(hr_dev, is_user, pages); }
static int register_dca_mem(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx, struct dca_mem *mem, struct dca_mem_attr *attr) { - struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx); + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); + bool is_user = !!uctx; void *states, *pages; unsigned long flags;
- pages = alloc_dca_pages(hr_dev, mem, attr); + pages = alloc_dca_pages(hr_dev, is_user, mem, attr); if (!pages) return -ENOMEM;
- states = alloc_dca_states(pages, mem->page_count); + states = alloc_dca_states(hr_dev, pages, mem->page_count, is_user); if (!states) { - ib_umem_release(pages); + free_dca_pages(hr_dev, is_user, pages); return -ENOMEM; }
@@ -263,266 +441,358 @@ static int register_dca_mem(struct hns_roce_dev *hr_dev, return 0; }
-struct dca_mem_shrink_attr { - u64 shrink_key; - u32 shrink_mems; +struct dca_page_clear_attr { + u32 buf_id; + u32 max_pages; + u32 clear_pages; };
-static int shrink_dca_page_proc(struct dca_mem *mem, int index, void *param) +static int clear_dca_pages_proc(struct dca_mem *mem, int index, void *param) { - struct dca_mem_shrink_attr *attr = param; - struct hns_dca_page_state *state; - int i, free_pages; - - free_pages = 0; - for (i = 0; i < mem->page_count; i++) { - state = &mem->states[i]; - if (dca_page_is_free(state)) - free_pages++; - } - - /* No pages are in use */ - if (free_pages == mem->page_count) { - /* unregister first empty DCA mem */ - if (!attr->shrink_mems) { - mem->flags &= ~DCA_MEM_FLAGS_REGISTERED; - attr->shrink_key = mem->key; - } + struct hns_dca_page_state *state = &mem->states[index]; + struct dca_page_clear_attr *attr = param;
- attr->shrink_mems++; + if (dca_page_is_attached(state, attr->buf_id)) { + set_dca_page_to_free(state); + attr->clear_pages++; }
- if (attr->shrink_mems > 1) + if (attr->clear_pages >= attr->max_pages) return DCA_MEM_STOP_ITERATE; else - return DCA_MEM_NEXT_ITERATE; + return 0; }
-static void shrink_dca_mem(struct hns_roce_dev *hr_dev, - struct hns_roce_ucontext *uctx, u64 reserved_size, - struct hns_dca_shrink_resp *resp) +static void clear_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count) { - struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx); - struct dca_mem_shrink_attr attr = {}; - unsigned long flags; - bool need_shink; - - spin_lock_irqsave(&ctx->pool_lock, flags); - need_shink = ctx->free_mems > 0 && ctx->free_size > reserved_size; - spin_unlock_irqrestore(&ctx->pool_lock, flags); - if (!need_shink) - return; + struct dca_page_clear_attr attr = {};
- travel_dca_pages(ctx, &attr, shrink_dca_page_proc); - resp->free_mems = attr.shrink_mems; - resp->free_key = attr.shrink_key; + attr.buf_id = buf_id; + attr.max_pages = count; + travel_dca_pages(ctx, &attr, clear_dca_pages_proc); }
-static void init_dca_context(struct hns_roce_dca_ctx *ctx) +struct dca_page_assign_attr { + u32 buf_id; + int unit; + int total; + int max; +}; + +static bool dca_page_is_allocable(struct hns_dca_page_state *state, bool head) { - INIT_LIST_HEAD(&ctx->pool); - spin_lock_init(&ctx->pool_lock); - ctx->total_size = 0; + bool is_free = dca_page_is_free(state) || dca_page_is_inactive(state); + + return head ? is_free : is_free && !state->head; }
-static void cleanup_dca_context(struct hns_roce_dev *hr_dev, - struct hns_roce_dca_ctx *ctx) +static int assign_dca_pages_proc(struct dca_mem *mem, int index, void *param) { - struct dca_mem *mem, *tmp; - unsigned long flags; + struct dca_page_assign_attr *attr = param; + struct hns_dca_page_state *state; + int checked_pages = 0; + int start_index = 0; + int free_pages = 0; + int i;
- spin_lock_irqsave(&ctx->pool_lock, flags); - list_for_each_entry_safe(mem, tmp, &ctx->pool, list) { - list_del(&mem->list); - mem->flags = 0; - spin_unlock_irqrestore(&ctx->pool_lock, flags); + /* Check the continuous pages count is not smaller than unit count */ + for (i = index; free_pages < attr->unit && i < mem->page_count; i++) { + checked_pages++; + state = &mem->states[i]; + if (dca_page_is_allocable(state, free_pages == 0)) { + if (free_pages == 0) + start_index = i;
- kfree(mem->states); - ib_umem_release(mem->pages); - kfree(mem); + free_pages++; + } else { + free_pages = 0; + } + }
- spin_lock_irqsave(&ctx->pool_lock, flags); + if (free_pages < attr->unit) + return DCA_MEM_NEXT_ITERATE; + + for (i = 0; i < free_pages; i++) { + state = &mem->states[start_index + i]; + lock_dca_page_to_attach(state, attr->buf_id); + attr->total++; } - ctx->total_size = 0; - spin_unlock_irqrestore(&ctx->pool_lock, flags); -}
-void hns_roce_register_udca(struct hns_roce_dev *hr_dev, - struct hns_roce_ucontext *uctx) -{ - if (!(uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) - return; + if (attr->total >= attr->max) + return DCA_MEM_STOP_ITERATE;
- init_dca_context(&uctx->dca_ctx); + return checked_pages; }
-void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, - struct hns_roce_ucontext *uctx) +static u32 assign_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count, + u32 unit) { - if (!(uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) - return; + struct dca_page_assign_attr attr = {};
- cleanup_dca_context(hr_dev, &uctx->dca_ctx); + attr.buf_id = buf_id; + attr.unit = unit; + attr.max = count; + travel_dca_pages(ctx, &attr, assign_dca_pages_proc); + return attr.total; }
-static struct dca_mem *alloc_dca_mem(struct hns_roce_dca_ctx *ctx) -{ - struct dca_mem *mem, *tmp, *found = NULL; - unsigned long flags; - - spin_lock_irqsave(&ctx->pool_lock, flags); - list_for_each_entry_safe(mem, tmp, &ctx->pool, list) { - spin_lock(&mem->lock); - if (!mem->flags) { - found = mem; - mem->flags |= DCA_MEM_FLAGS_ALLOCED; - spin_unlock(&mem->lock); - break; +struct dca_page_active_attr { + u32 buf_id; + u32 max_pages; + u32 alloc_pages; + u32 dirty_mems; +}; + +static int active_dca_pages_proc(struct dca_mem *mem, int index, void *param) +{ + struct dca_page_active_attr *attr = param; + struct hns_dca_page_state *state; + bool changed = false; + bool stop = false; + int i, free_pages; + + free_pages = 0; + for (i = 0; !stop && i < mem->page_count; i++) { + state = &mem->states[i]; + if (dca_page_is_free(state)) { + free_pages++; + } else if (dca_page_is_allocated(state, attr->buf_id)) { + free_pages++; + /* Change matched pages state */ + unlock_dca_page_to_active(state, attr->buf_id); + changed = true; + attr->alloc_pages++; + if (attr->alloc_pages == attr->max_pages) + stop = true; } - spin_unlock(&mem->lock); } - spin_unlock_irqrestore(&ctx->pool_lock, flags);
- if (found) - return found; + for (; changed && i < mem->page_count; i++) + if (dca_page_is_free(state)) + free_pages++;
- mem = kzalloc(sizeof(*mem), GFP_NOWAIT); - if (!mem) - return NULL; + /* Clean mem changed to dirty */ + if (changed && free_pages == mem->page_count) + attr->dirty_mems++;
- spin_lock_init(&mem->lock); - INIT_LIST_HEAD(&mem->list); + return stop ? DCA_MEM_STOP_ITERATE : DCA_MEM_NEXT_ITERATE; +}
- mem->flags |= DCA_MEM_FLAGS_ALLOCED; +static u32 active_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count) +{ + struct dca_page_active_attr attr = {}; + unsigned long flags;
+ attr.buf_id = buf_id; + attr.max_pages = count; + travel_dca_pages(ctx, &attr, active_dca_pages_proc); + + /* Update free size */ spin_lock_irqsave(&ctx->pool_lock, flags); - list_add(&mem->list, &ctx->pool); + ctx->free_mems -= attr.dirty_mems; + ctx->free_size -= attr.alloc_pages << HNS_HW_PAGE_SHIFT; spin_unlock_irqrestore(&ctx->pool_lock, flags);
- return mem; -} - -static void free_dca_mem(struct dca_mem *mem) -{ - /* We cannot hold the whole pool's lock during the DCA is working - * until cleanup the context in cleanup_dca_context(), so we just - * set the DCA mem state as free when destroying DCA mem object. - */ - spin_lock(&mem->lock); - mem->flags = 0; - spin_unlock(&mem->lock); -} - -static inline struct hns_roce_dca_ctx *hr_qp_to_dca_ctx(struct hns_roce_qp *qp) -{ - return to_hr_dca_ctx(to_hr_ucontext(qp->ibqp.pd->uobject->context)); + return attr.alloc_pages; }
-struct dca_page_clear_attr { +struct dca_page_query_active_attr { u32 buf_id; - u32 max_pages; - u32 clear_pages; + u32 curr_index; + u32 start_index; + u32 page_index; + u32 page_count; + u64 mem_key; };
-static int clear_dca_pages_proc(struct dca_mem *mem, int index, void *param) +static int query_dca_active_pages_proc(struct dca_mem *mem, int index, + void *param) { struct hns_dca_page_state *state = &mem->states[index]; - struct dca_page_clear_attr *attr = param; + struct dca_page_query_active_attr *attr = param;
- if (dca_page_is_attached(state, attr->buf_id)) { - set_dca_page_to_free(state); - attr->clear_pages++; - } + if (!dca_page_is_active(state, attr->buf_id)) + return 0;
- if (attr->clear_pages >= attr->max_pages) - return DCA_MEM_STOP_ITERATE; - else + if (attr->curr_index < attr->start_index) { + attr->curr_index++; return 0; + } else if (attr->curr_index > attr->start_index) { + return DCA_MEM_STOP_ITERATE; + } + + /* Search first page in DCA mem */ + attr->page_index = index; + attr->mem_key = mem->key; + /* Search active pages in continuous addresses */ + while (index < mem->page_count) { + state = &mem->states[index]; + if (!dca_page_is_active(state, attr->buf_id)) + break; + + index++; + attr->page_count++; + } + + return DCA_MEM_STOP_ITERATE; }
-static void clear_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count) +static int sync_dca_buf_offset(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_dca_attach_attr *attr) { - struct dca_page_clear_attr attr = {}; + struct ib_device *ibdev = &hr_dev->ib_dev;
- attr.buf_id = buf_id; - attr.max_pages = count; - travel_dca_pages(ctx, &attr, clear_dca_pages_proc); + if (hr_qp->sq.wqe_cnt > 0) { + if (attr->sq_offset >= hr_qp->sge.offset) { + ibdev_err(ibdev, "failed to check SQ offset = %u\n", + attr->sq_offset); + return -EINVAL; + } + hr_qp->sq.wqe_offset = hr_qp->sq.offset + attr->sq_offset; + } + + if (hr_qp->sge.sge_cnt > 0) { + if (attr->sge_offset >= hr_qp->rq.offset) { + ibdev_err(ibdev, "failed to check exSGE offset = %u\n", + attr->sge_offset); + return -EINVAL; + } + hr_qp->sge.wqe_offset = hr_qp->sge.offset + attr->sge_offset; + } + + if (hr_qp->rq.wqe_cnt > 0) { + if (attr->rq_offset >= hr_qp->buff_size) { + ibdev_err(ibdev, "failed to check RQ offset = %u\n", + attr->rq_offset); + return -EINVAL; + } + hr_qp->rq.wqe_offset = hr_qp->rq.offset + attr->rq_offset; + } + + return 0; }
-struct dca_page_assign_attr { +static u32 alloc_buf_from_dca_mem(struct hns_roce_qp *hr_qp, + struct hns_roce_dca_ctx *ctx) +{ + u32 buf_pages, unit_pages, alloc_pages; u32 buf_id; - int unit; - int total; - int max; -};
-static bool dca_page_is_allocable(struct hns_dca_page_state *state, bool head) -{ - bool is_free = dca_page_is_free(state) || dca_page_is_inactive(state); + buf_pages = hr_qp->dca_cfg.npages; + /* Gen new buf id */ + buf_id = HNS_DCA_TO_BUF_ID(hr_qp->qpn, hr_qp->dca_cfg.attach_count);
- return head ? is_free : is_free && !state->head; + /* Assign pages from free pages */ + unit_pages = hr_qp->mtr.hem_cfg.is_direct ? buf_pages : 1; + alloc_pages = assign_dca_pages(ctx, buf_id, buf_pages, unit_pages); + if (buf_pages != alloc_pages) { + if (alloc_pages > 0) + clear_dca_pages(ctx, buf_id, alloc_pages); + return HNS_DCA_INVALID_BUF_ID; + } + return buf_id; }
-static int assign_dca_pages_proc(struct dca_mem *mem, int index, void *param) +static int active_alloced_buf(struct hns_roce_qp *hr_qp, + struct hns_roce_dca_ctx *ctx, + struct hns_dca_attach_attr *attr, u32 buf_id) { - struct dca_page_assign_attr *attr = param; - struct hns_dca_page_state *state; - int checked_pages = 0; - int start_index = 0; - int free_pages = 0; - int i; - - /* Check the continuous pages count is not smaller than unit count */ - for (i = index; free_pages < attr->unit && i < mem->page_count; i++) { - checked_pages++; - state = &mem->states[i]; - if (dca_page_is_allocable(state, free_pages == 0)) { - if (free_pages == 0) - start_index = i; + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + struct ib_device *ibdev = &hr_dev->ib_dev; + u32 active_pages, alloc_pages; + int ret;
- free_pages++; - } else { - free_pages = 0; - } + alloc_pages = hr_qp->dca_cfg.npages; + ret = sync_dca_buf_offset(hr_dev, hr_qp, attr); + if (ret) { + ibdev_err(ibdev, "failed to sync DCA offset, ret = %d\n", ret); + goto active_fail; }
- if (free_pages < attr->unit) - return DCA_MEM_NEXT_ITERATE; + ret = setup_dca_buf_to_hw(hr_dev, hr_qp, ctx, buf_id, alloc_pages); + if (ret) { + ibdev_err(ibdev, "failed to setup DCA buf, ret = %d.\n", ret); + goto active_fail; + }
- for (i = 0; i < free_pages; i++) { - state = &mem->states[start_index + i]; - lock_dca_page_to_attach(state, attr->buf_id); - attr->total++; + active_pages = active_dca_pages(ctx, buf_id, alloc_pages); + if (active_pages != alloc_pages) { + ibdev_err(ibdev, "failed to active DCA pages, %u != %u.\n", + active_pages, alloc_pages); + ret = -ENOBUFS; + goto active_fail; }
- if (attr->total >= attr->max) - return DCA_MEM_STOP_ITERATE; + return 0;
- return checked_pages; +active_fail: + clear_dca_pages(ctx, buf_id, alloc_pages); + return ret; }
-static u32 assign_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count, - u32 unit) +static int attach_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_dca_attach_attr *attr, + struct hns_dca_attach_resp *resp) { - struct dca_page_assign_attr attr = {}; + struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_dev, hr_qp); + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + u32 buf_id; + int ret;
- attr.buf_id = buf_id; - attr.unit = unit; - attr.max = count; - travel_dca_pages(ctx, &attr, assign_dca_pages_proc); - return attr.total; + /* Stop DCA mem ageing worker */ + cancel_delayed_work(&cfg->dwork); + resp->alloc_flags = 0; + + spin_lock(&cfg->lock); + buf_id = cfg->buf_id; + /* Already attached */ + if (buf_id != HNS_DCA_INVALID_BUF_ID) { + resp->alloc_pages = cfg->npages; + spin_unlock(&cfg->lock); + return 0; + } + + /* Start to new attach */ + resp->alloc_pages = 0; + buf_id = alloc_buf_from_dca_mem(hr_qp, ctx); + if (buf_id == HNS_DCA_INVALID_BUF_ID) { + spin_unlock(&cfg->lock); + /* No report fail, need try again after the pool increased */ + return 0; + } + + ret = active_alloced_buf(hr_qp, ctx, attr, buf_id); + if (ret) { + spin_unlock(&cfg->lock); + ibdev_err(&hr_dev->ib_dev, + "failed to active DCA buf for QP-%lu, ret = %d.\n", + hr_qp->qpn, ret); + return ret; + } + + /* Attach ok */ + cfg->buf_id = buf_id; + cfg->attach_count++; + spin_unlock(&cfg->lock); + + resp->alloc_flags |= HNS_DCA_ATTACH_FLAGS_NEW_BUFFER; + resp->alloc_pages = cfg->npages; + + return 0; }
-struct dca_page_active_attr { +struct dca_page_free_buf_attr { u32 buf_id; u32 max_pages; - u32 alloc_pages; - u32 dirty_mems; + u32 free_pages; + u32 clean_mems; };
-static int active_dca_pages_proc(struct dca_mem *mem, int index, void *param) +static int free_buffer_pages_proc(struct dca_mem *mem, int index, void *param) { - struct dca_page_active_attr *attr = param; + struct dca_page_free_buf_attr *attr = param; struct hns_dca_page_state *state; bool changed = false; bool stop = false; @@ -531,360 +801,453 @@ static int active_dca_pages_proc(struct dca_mem *mem, int index, void *param) free_pages = 0; for (i = 0; !stop && i < mem->page_count; i++) { state = &mem->states[i]; - if (dca_page_is_free(state)) { - free_pages++; - } else if (dca_page_is_allocated(state, attr->buf_id)) { - free_pages++; - /* Change matched pages state */ - unlock_dca_page_to_active(state, attr->buf_id); + /* Change matched pages state */ + if (dca_page_is_attached(state, attr->buf_id)) { + set_dca_page_to_free(state); changed = true; - attr->alloc_pages++; - if (attr->alloc_pages == attr->max_pages) + attr->free_pages++; + if (attr->free_pages == attr->max_pages) stop = true; } + + if (dca_page_is_free(state)) + free_pages++; }
for (; changed && i < mem->page_count; i++) if (dca_page_is_free(state)) free_pages++;
- /* Clean mem changed to dirty */ if (changed && free_pages == mem->page_count) - attr->dirty_mems++; + attr->clean_mems++;
return stop ? DCA_MEM_STOP_ITERATE : DCA_MEM_NEXT_ITERATE; }
-static u32 active_dca_pages(struct hns_roce_dca_ctx *ctx, u32 buf_id, u32 count) +static void free_buf_from_dca_mem(struct hns_roce_dca_ctx *ctx, + struct hns_roce_dca_cfg *cfg) { - struct dca_page_active_attr attr = {}; + struct dca_page_free_buf_attr attr = {}; unsigned long flags; + u32 buf_id; + + spin_lock(&cfg->lock); + buf_id = cfg->buf_id; + cfg->buf_id = HNS_DCA_INVALID_BUF_ID; + spin_unlock(&cfg->lock); + if (buf_id == HNS_DCA_INVALID_BUF_ID) + return;
attr.buf_id = buf_id; - attr.max_pages = count; - travel_dca_pages(ctx, &attr, active_dca_pages_proc); + attr.max_pages = cfg->npages; + travel_dca_pages(ctx, &attr, free_buffer_pages_proc);
/* Update free size */ spin_lock_irqsave(&ctx->pool_lock, flags); - ctx->free_mems -= attr.dirty_mems; - ctx->free_size -= attr.alloc_pages << HNS_HW_PAGE_SHIFT; + ctx->free_mems += attr.clean_mems; + ctx->free_size += attr.free_pages << HNS_HW_PAGE_SHIFT; spin_unlock_irqrestore(&ctx->pool_lock, flags); +}
- return attr.alloc_pages; +static void detach_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_dca_detach_attr *attr) +{ + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + + /* Start an ageing worker to free buffer */ + cancel_delayed_work(&cfg->dwork); + spin_lock(&cfg->lock); + cfg->sq_idx = attr->sq_idx; + queue_delayed_work(hr_dev->irq_workq, &cfg->dwork, + msecs_to_jiffies(DCA_MEM_AGEING_MSES)); + spin_unlock(&cfg->lock); }
-struct dca_get_alloced_pages_attr { - u32 buf_id; - dma_addr_t *pages; - u32 total; - u32 max; +struct dca_mem_shrink_attr { + u64 shrink_key; + u32 shrink_mems; };
-static int get_alloced_umem_proc(struct dca_mem *mem, int index, void *param) - +static int shrink_dca_page_proc(struct dca_mem *mem, int index, void *param) { - struct dca_get_alloced_pages_attr *attr = param; - struct hns_dca_page_state *states = mem->states; - struct ib_umem *umem = mem->pages; - struct ib_block_iter biter; - u32 i = 0; + struct dca_mem_shrink_attr *attr = param; + struct hns_dca_page_state *state; + int i, free_pages;
- rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, - HNS_HW_PAGE_SIZE) { - if (dca_page_is_allocated(&states[i], attr->buf_id)) { - attr->pages[attr->total++] = - rdma_block_iter_dma_address(&biter); - if (attr->total >= attr->max) - return DCA_MEM_STOP_ITERATE; + free_pages = 0; + for (i = 0; i < mem->page_count; i++) { + state = &mem->states[i]; + if (dca_page_is_free(state)) + free_pages++; + } + + /* No page in this mem is being used */ + if (free_pages == mem->page_count) { + /* Unregister the first empty DCA mem */ + if (!attr->shrink_mems) { + mem->flags &= ~DCA_MEM_FLAGS_REGISTERED; + attr->shrink_key = mem->key; } - i++; + + attr->shrink_mems++; }
- return DCA_MEM_NEXT_ITERATE; + if (attr->shrink_mems > 1) + return DCA_MEM_STOP_ITERATE; + else + return DCA_MEM_NEXT_ITERATE; }
-static int apply_dca_cfg(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, - struct hns_dca_attach_attr *attach_attr) +struct hns_dca_shrink_resp { + u64 free_key; + u32 free_mems; +}; + +static void shrink_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx, u64 reserved_size, + struct hns_dca_shrink_resp *resp) { - struct hns_roce_dca_attr attr; + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); + struct dca_mem_shrink_attr attr = {}; + unsigned long flags; + bool need_shrink;
- if (hr_dev->hw->set_dca_buf) { - attr.sq_offset = attach_attr->sq_offset; - attr.sge_offset = attach_attr->sge_offset; - attr.rq_offset = attach_attr->rq_offset; - return hr_dev->hw->set_dca_buf(hr_dev, hr_qp, &attr); - } + spin_lock_irqsave(&ctx->pool_lock, flags); + need_shrink = ctx->free_mems > 0 && ctx->free_size > reserved_size; + spin_unlock_irqrestore(&ctx->pool_lock, flags); + if (!need_shrink) + return;
- return 0; + travel_dca_pages(ctx, &attr, shrink_dca_page_proc); + resp->free_mems = attr.shrink_mems; + resp->free_key = attr.shrink_key; }
-static int setup_dca_buf_to_hw(struct hns_roce_dca_ctx *ctx, - struct hns_roce_qp *hr_qp, u32 buf_id, - struct hns_dca_attach_attr *attach_attr) +static void init_dca_context(struct hns_roce_dca_ctx *ctx) { - struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); - struct dca_get_alloced_pages_attr attr = {}; - struct ib_device *ibdev = &hr_dev->ib_dev; - u32 count = hr_qp->dca_cfg.npages; - dma_addr_t *pages; - int ret; + INIT_LIST_HEAD(&ctx->pool); + spin_lock_init(&ctx->pool_lock); + ctx->total_size = 0; +}
- /* Alloc a tmp array to store buffer's dma address */ - pages = kvcalloc(count, sizeof(dma_addr_t), GFP_NOWAIT); - if (!pages) - return -ENOMEM; +static void cleanup_dca_context(struct hns_roce_dev *hr_dev, + struct hns_roce_dca_ctx *ctx) +{ + struct dca_mem *mem, *tmp; + unsigned long flags; + bool is_user;
- attr.buf_id = buf_id; - attr.pages = pages; - attr.max = count; + is_user = (ctx != &hr_dev->dca_ctx); + spin_lock_irqsave(&ctx->pool_lock, flags); + list_for_each_entry_safe(mem, tmp, &ctx->pool, list) { + list_del(&mem->list); + spin_lock(&mem->lock); + mem->flags = 0; + spin_unlock(&mem->lock); + spin_unlock_irqrestore(&ctx->pool_lock, flags);
- travel_dca_pages(ctx, &attr, get_alloced_umem_proc); - if (attr.total != count) { - ibdev_err(ibdev, "failed to get DCA page %u != %u.\n", - attr.total, count); - ret = -ENOMEM; - goto done; - } + kfree(mem->states); + free_dca_pages(hr_dev, is_user, mem->pages); + kfree(mem);
- /* Update MTT for ROCEE addressing */ - ret = hns_roce_mtr_map(hr_dev, &hr_qp->mtr, pages, count); - if (ret) { - ibdev_err(ibdev, "failed to map DCA pages, ret = %d.\n", ret); - goto done; + spin_lock_irqsave(&ctx->pool_lock, flags); } + ctx->total_size = 0; + spin_unlock_irqrestore(&ctx->pool_lock, flags); +}
- /* Apply the changes for WQE address */ - ret = apply_dca_cfg(hr_dev, hr_qp, attach_attr); - if (ret) - ibdev_err(ibdev, "failed to apply DCA cfg, ret = %d.\n", ret); +#define DCA_MAX_MEM_SIZE ~0UL
-done: - /* Drop tmp array */ - kvfree(pages); - return ret; +static uint dca_unit_size; +static ulong dca_min_size = DCA_MAX_MEM_SIZE; +static ulong dca_max_size = DCA_MAX_MEM_SIZE; + +static void config_kdca_context(struct hns_roce_dca_ctx *ctx) +{ + unsigned int unit_size; + + unit_size = ALIGN(dca_unit_size, PAGE_SIZE); + ctx->unit_size = unit_size; + if (!unit_size) + return; + + if (dca_max_size == DCA_MAX_MEM_SIZE || dca_max_size == 0) + ctx->max_size = DCA_MAX_MEM_SIZE; + else + ctx->max_size = roundup(dca_max_size, unit_size); + + if (dca_min_size == DCA_MAX_MEM_SIZE) + ctx->min_size = ctx->max_size; + else + ctx->min_size = roundup(dca_min_size, unit_size); }
-static u32 alloc_buf_from_dca_mem(struct hns_roce_qp *hr_qp, - struct hns_roce_dca_ctx *ctx) +void hns_roce_init_dca(struct hns_roce_dev *hr_dev) { - u32 buf_pages, unit_pages, alloc_pages; - u32 buf_id; + init_dca_context(&hr_dev->dca_ctx); + + config_kdca_context(&hr_dev->dca_ctx); +} + +void hns_roce_cleanup_dca(struct hns_roce_dev *hr_dev) +{ + cleanup_dca_context(hr_dev, &hr_dev->dca_ctx); +} + +void hns_roce_register_udca(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx) +{ + if (!(uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) + return; + + init_dca_context(&uctx->dca_ctx); +} + +void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx) +{ + if (!(uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) + return; + + cleanup_dca_context(hr_dev, &uctx->dca_ctx); +} + +static struct dca_mem *key_to_dca_mem(struct list_head *head, u64 key) +{ + struct dca_mem *mem; + + list_for_each_entry(mem, head, list) + if (mem->key == key) + return mem; + + return NULL; +} + +static bool add_dca_mem_enabled(struct hns_roce_dca_ctx *ctx, u32 alloc_size) +{ + unsigned long flags; + bool enable; + + spin_lock_irqsave(&ctx->pool_lock, flags); + + /* Pool size no limit */ + if (ctx->max_size == DCA_MAX_MEM_SIZE) + enable = true; + else /* Pool size not exceed max size */ + enable = (ctx->total_size + alloc_size) < ctx->max_size; + + spin_unlock_irqrestore(&ctx->pool_lock, flags);
- buf_pages = hr_qp->dca_cfg.npages; - /* Gen new buf id */ - buf_id = HNS_DCA_TO_BUF_ID(hr_qp->qpn, hr_qp->dca_cfg.attach_count); + return enable; +}
- /* Assign pages from free pages */ - unit_pages = hr_qp->mtr.hem_cfg.is_direct ? buf_pages : 1; - alloc_pages = assign_dca_pages(ctx, buf_id, buf_pages, unit_pages); - if (buf_pages != alloc_pages) { - if (alloc_pages > 0) - clear_dca_pages(ctx, buf_id, alloc_pages); - return HNS_DCA_INVALID_BUF_ID; - } +static bool shrink_dca_mem_enabled(struct hns_roce_dca_ctx *ctx) +{ + unsigned long flags; + bool enable;
- return buf_id; + spin_lock_irqsave(&ctx->pool_lock, flags); + enable = ctx->total_size > 0 && ctx->min_size < ctx->max_size; + spin_unlock_irqrestore(&ctx->pool_lock, flags); + + return enable; }
-static int active_alloced_buf(struct hns_roce_qp *hr_qp, - struct hns_roce_dca_ctx *ctx, - struct hns_dca_attach_attr *attr, u32 buf_id) +static struct dca_mem *alloc_dca_mem(struct hns_roce_dca_ctx *ctx) { - struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); - struct ib_device *ibdev = &hr_dev->ib_dev; - u32 active_pages, alloc_pages; - int ret; + struct dca_mem *mem, *tmp, *found = NULL; + unsigned long flags;
- ret = setup_dca_buf_to_hw(ctx, hr_qp, buf_id, attr); - if (ret) { - ibdev_err(ibdev, "failed to setup DCA buf, ret = %d.\n", ret); - goto active_fail; + spin_lock_irqsave(&ctx->pool_lock, flags); + list_for_each_entry_safe(mem, tmp, &ctx->pool, list) { + spin_lock(&mem->lock); + if (!mem->flags) { + found = mem; + mem->flags |= DCA_MEM_FLAGS_ALLOCED; + spin_unlock(&mem->lock); + break; + } + spin_unlock(&mem->lock); } + spin_unlock_irqrestore(&ctx->pool_lock, flags);
- alloc_pages = hr_qp->dca_cfg.npages; - active_pages = active_dca_pages(ctx, buf_id, alloc_pages); - if (active_pages != alloc_pages) { - ibdev_err(ibdev, "failed to active DCA pages, %u != %u.\n", - active_pages, alloc_pages); - ret = -ENOBUFS; - goto active_fail; - } + if (found) + return found;
- return 0; + mem = kzalloc(sizeof(*mem), GFP_ATOMIC); + if (!mem) + return NULL;
-active_fail: - clear_dca_pages(ctx, buf_id, alloc_pages); - return ret; + spin_lock_init(&mem->lock); + INIT_LIST_HEAD(&mem->list); + + mem->flags |= DCA_MEM_FLAGS_ALLOCED; + + spin_lock_irqsave(&ctx->pool_lock, flags); + list_add(&mem->list, &ctx->pool); + spin_unlock_irqrestore(&ctx->pool_lock, flags); + return mem; }
-static int attach_dca_mem(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, - struct hns_dca_attach_attr *attr, - struct hns_dca_attach_resp *resp) +static void free_dca_mem(struct dca_mem *mem) { - struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_qp); - struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; - u32 buf_id; - int ret; + /* When iterating all DCA mems in travel_dca_pages(), the pool's lock is + * NOT held, so here we only set the DCA mem to the free state while DCA is + * working, until the DCA context is cleaned up in hns_roce_cleanup_dca(). + */ + spin_lock(&mem->lock); + mem->flags = 0; + spin_unlock(&mem->lock); +}
- /* Stop DCA mem ageing worker */ - cancel_delayed_work(&cfg->dwork); - resp->alloc_flags = 0; +static int add_dca_mem(struct hns_roce_dev *hr_dev, u32 new_size) +{ + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, NULL); + struct dca_mem_attr attr = {}; + struct dca_mem *mem = NULL; + int ret;
- spin_lock(&cfg->lock); - buf_id = cfg->buf_id; - /* Already attached */ - if (buf_id != HNS_DCA_INVALID_BUF_ID) { - resp->alloc_pages = cfg->npages; - spin_unlock(&cfg->lock); - return 0; - } + if (!add_dca_mem_enabled(ctx, new_size)) + return -ENOMEM;
- /* Start to new attach */ - resp->alloc_pages = 0; - buf_id = alloc_buf_from_dca_mem(hr_qp, ctx); - if (buf_id == HNS_DCA_INVALID_BUF_ID) { - spin_unlock(&cfg->lock); - /* No report fail, need try again after the pool increased */ - return 0; - } + /* Add new DCA mem */ + mem = alloc_dca_mem(ctx); + if (!mem) + return -ENOMEM;
- ret = active_alloced_buf(hr_qp, ctx, attr, buf_id); + attr.key = (u64)mem; + attr.size = roundup(new_size, ctx->unit_size); + ret = register_dca_mem(hr_dev, NULL, mem, &attr); if (ret) { - spin_unlock(&cfg->lock); + free_dca_mem(mem); ibdev_err(&hr_dev->ib_dev, - "failed to active DCA buf for QP-%lu, ret = %d.\n", - hr_qp->qpn, ret); - return ret; + "failed to register DCA mem, ret = %d.\n", ret); }
- /* Attach ok */ - cfg->buf_id = buf_id; - cfg->attach_count++; - spin_unlock(&cfg->lock); - - resp->alloc_flags |= HNS_IB_ATTACH_FLAGS_NEW_BUFFER; - resp->alloc_pages = cfg->npages; - - return 0; + return ret; }
-struct dca_page_query_active_attr { +struct dca_page_get_active_buf_attr { u32 buf_id; - u32 curr_index; - u32 start_index; - u32 page_index; - u32 page_count; - u64 mem_key; + void **buf_list; + u32 total; + u32 max; };
-static int query_dca_active_pages_proc(struct dca_mem *mem, int index, - void *param) +static int get_active_kbuf_proc(struct dca_mem *mem, int index, void *param) { - struct hns_dca_page_state *state = &mem->states[index]; - struct dca_page_query_active_attr *attr = param; - - if (!dca_page_is_active(state, attr->buf_id)) - return 0; - - if (attr->curr_index < attr->start_index) { - attr->curr_index++; - return 0; - } else if (attr->curr_index > attr->start_index) { - return DCA_MEM_STOP_ITERATE; + struct dca_page_get_active_buf_attr *attr = param; + struct hns_dca_page_state *states = mem->states; + struct hns_roce_buf *kmem = mem->pages; + void *buf; + u32 i; + + for (i = 0; i < kmem->npages; i++) { + if (!dca_page_is_active(&states[i], attr->buf_id)) + continue; + + buf = hns_roce_buf_offset(kmem, i << HNS_HW_PAGE_SHIFT); + attr->buf_list[attr->total++] = buf; + if (attr->total >= attr->max) + return DCA_MEM_STOP_ITERATE; }
- /* Search first page in DCA mem */ - attr->page_index = index; - attr->mem_key = mem->key; - /* Search active pages in continuous addresses */ - while (index < mem->page_count) { - state = &mem->states[index]; - if (!dca_page_is_active(state, attr->buf_id)) - break; + return DCA_MEM_NEXT_ITERATE; +}
- index++; - attr->page_count++; - } +static int setup_dca_buf_list(struct hns_roce_dca_ctx *ctx, + struct hns_roce_dca_cfg *cfg) +{ + struct dca_page_get_active_buf_attr attr = {};
- return DCA_MEM_STOP_ITERATE; -} + attr.buf_id = cfg->buf_id; + attr.buf_list = cfg->buf_list; + attr.max = cfg->npages; + travel_dca_pages(ctx, &attr, get_active_kbuf_proc);
-struct dca_page_free_buf_attr { - u32 buf_id; - u32 max_pages; - u32 free_pages; - u32 clean_mems; -}; + return attr.total == attr.max ? 0 : -ENOMEM; +}
-static int free_buffer_pages_proc(struct dca_mem *mem, int index, void *param) +#define DCA_EXPAND_MEM_TRY_TIMES 3 +int hns_roce_dca_attach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct hns_dca_attach_attr *attr) { - struct dca_page_free_buf_attr *attr = param; - struct hns_dca_page_state *state; - bool changed = false; - bool stop = false; - int i, free_pages; + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + struct hns_dca_attach_resp resp = {}; + bool is_new_buf = true; + int try_times = 0; + int ret;
- free_pages = 0; - for (i = 0; !stop && i < mem->page_count; i++) { - state = &mem->states[i]; - /* Change matched pages state */ - if (dca_page_is_attached(state, attr->buf_id)) { - set_dca_page_to_free(state); - changed = true; - attr->free_pages++; - if (attr->free_pages == attr->max_pages) - stop = true; + do { + resp.alloc_flags = 0; + ret = attach_dca_mem(hr_dev, hr_qp, attr, &resp); + if (ret) + break; + + if (resp.alloc_pages >= cfg->npages) { + is_new_buf = !!(resp.alloc_flags & + HNS_DCA_ATTACH_FLAGS_NEW_BUFFER); + break; }
- if (dca_page_is_free(state)) - free_pages++; - } + ret = add_dca_mem(hr_dev, hr_qp->buff_size); + if (ret) + break; + } while (try_times++ < DCA_EXPAND_MEM_TRY_TIMES);
- for (; changed && i < mem->page_count; i++) - if (dca_page_is_free(state)) - free_pages++; + if (ret || resp.alloc_pages < cfg->npages) { + ibdev_err(&hr_dev->ib_dev, + "failed to attach buf %u != %u, try %d, ret = %d.\n", + cfg->npages, resp.alloc_pages, try_times, ret); + return -ENOMEM; + }
- if (changed && free_pages == mem->page_count) - attr->clean_mems++; + /* DCA config not changed */ + if (!is_new_buf && cfg->buf_list[0]) + return 0;
- return stop ? DCA_MEM_STOP_ITERATE : DCA_MEM_NEXT_ITERATE; + return setup_dca_buf_list(hr_qp_to_dca_ctx(hr_dev, hr_qp), cfg); }
-static void free_buf_from_dca_mem(struct hns_roce_dca_ctx *ctx, - struct hns_roce_dca_cfg *cfg) +static void remove_unused_dca_mem(struct hns_roce_dev *hr_dev) { - struct dca_page_free_buf_attr attr = {}; + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, NULL); + struct hns_dca_shrink_resp resp = {}; + struct dca_mem *mem; unsigned long flags; - u32 buf_id; - - spin_lock(&cfg->lock); - buf_id = cfg->buf_id; - cfg->buf_id = HNS_DCA_INVALID_BUF_ID; - spin_unlock(&cfg->lock); - if (buf_id == HNS_DCA_INVALID_BUF_ID) - return; - - attr.buf_id = buf_id; - attr.max_pages = cfg->npages; - travel_dca_pages(ctx, &attr, free_buffer_pages_proc);
- /* Update free size */ - spin_lock_irqsave(&ctx->pool_lock, flags); - ctx->free_mems += attr.clean_mems; - ctx->free_size += attr.free_pages << HNS_HW_PAGE_SHIFT; - spin_unlock_irqrestore(&ctx->pool_lock, flags); + while (shrink_dca_mem_enabled(ctx)) { + resp.free_mems = 0; + shrink_dca_mem(hr_dev, NULL, ctx->min_size, &resp); + if (resp.free_mems < 1) + break; + spin_lock_irqsave(&ctx->pool_lock, flags); + mem = key_to_dca_mem(&ctx->pool, resp.free_key); + spin_unlock_irqrestore(&ctx->pool_lock, flags); + if (!mem) + break; + unregister_dca_mem(hr_dev, NULL, mem); + free_dca_mem(mem); + /* No more free memory */ + if (resp.free_mems <= 1) + break; + } }
static void kick_dca_mem(struct hns_roce_dev *hr_dev, struct hns_roce_dca_cfg *cfg, struct hns_roce_ucontext *uctx) { - struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(uctx); + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx);
/* Stop ageing worker and free DCA buffer from pool */ cancel_delayed_work_sync(&cfg->dwork); free_buf_from_dca_mem(ctx, cfg); + + /* Shrink kernel DCA mem */ + if (!uctx) + remove_unused_dca_mem(hr_dev); }
static void dca_mem_ageing_work(struct work_struct *work) @@ -892,41 +1255,36 @@ static void dca_mem_ageing_work(struct work_struct *work) struct hns_roce_qp *hr_qp = container_of(work, struct hns_roce_qp, dca_cfg.dwork.work); struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); - struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_qp); + struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_dev, hr_qp); bool hw_is_inactive;
hw_is_inactive = hr_dev->hw->chk_dca_buf_inactive && hr_dev->hw->chk_dca_buf_inactive(hr_dev, hr_qp); if (hw_is_inactive) free_buf_from_dca_mem(ctx, &hr_qp->dca_cfg); + + /* Shrink kernel DCA mem */ + if (!hr_qp->ibqp.uobject) + remove_unused_dca_mem(hr_dev); }
-void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +void hns_roce_dca_detach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct hns_dca_detach_attr *attr) { - struct hns_roce_ucontext *uctx; - - if (hr_qp->ibqp.uobject && hr_qp->ibqp.pd->uobject) { - uctx = to_hr_ucontext(hr_qp->ibqp.pd->uobject->context); - kick_dca_mem(hr_dev, &hr_qp->dca_cfg, uctx); - } + detach_dca_mem(hr_dev, hr_qp, attr); }
-static void detach_dca_mem(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, - struct hns_dca_detach_attr *attr) +void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_udata *udata) { - struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, + struct hns_roce_ucontext, ibucontext);
- /* Start an ageing worker to free buffer */ - cancel_delayed_work(&cfg->dwork); - spin_lock(&cfg->lock); - cfg->sq_idx = attr->sq_idx; - queue_delayed_work(hr_dev->irq_workq, &cfg->dwork, - msecs_to_jiffies(DCA_MEM_AGEING_MSES)); - spin_unlock(&cfg->lock); + kick_dca_mem(hr_dev, &hr_qp->dca_cfg, uctx); }
-void hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +int hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_udata *udata) { struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
@@ -934,6 +1292,16 @@ void hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) INIT_DELAYED_WORK(&cfg->dwork, dca_mem_ageing_work); cfg->buf_id = HNS_DCA_INVALID_BUF_ID; cfg->npages = hr_qp->buff_size >> HNS_HW_PAGE_SHIFT; + + /* DCA page list for kernel QP */ + if (!udata && cfg->npages) { + cfg->buf_list = kcalloc(cfg->npages, sizeof(void *), + GFP_KERNEL); + if (!cfg->buf_list) + return -ENOMEM; + } + + return 0; }
void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, @@ -944,7 +1312,12 @@ void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
kick_dca_mem(hr_dev, cfg, uctx); - cfg->buf_id = HNS_DCA_INVALID_BUF_ID; + + /* Free kernel DCA buffer list */ + if (!udata && cfg->buf_list) { + kfree(cfg->buf_list); + cfg->buf_list = NULL; + } }
static inline struct hns_roce_ucontext * @@ -976,7 +1349,7 @@ static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_REG)( if (ret) return ret;
- mem = alloc_dca_mem(to_hr_dca_ctx(uctx)); + mem = alloc_dca_mem(to_hr_dca_ctx(hr_dev, uctx)); if (!mem) return -ENOMEM;
@@ -1005,7 +1378,7 @@ static int dca_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why, return 0;
mem = uobject->object; - unregister_dca_mem(uctx, mem); + unregister_dca_mem(to_hr_dev(uctx->ibucontext.device), uctx, mem); free_dca_mem(mem);
return 0; @@ -1163,7 +1536,8 @@ static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_QUERY)( struct uverbs_attr_bundle *attrs) { struct hns_roce_qp *hr_qp = uverbs_attr_to_hr_qp(attrs); - struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_qp); + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_dev, hr_qp); struct dca_page_query_active_attr active_attr = {}; u32 page_idx, page_ofs; int ret; @@ -1234,3 +1608,7 @@ const struct uapi_definition hns_roce_dca_uapi_defs[] = { UAPI_DEF_IS_OBJ_SUPPORTED(dca_is_supported)), {} }; + +module_param(dca_unit_size, uint, 0444); +module_param(dca_max_size, ulong, 0444); +module_param(dca_min_size, ulong, 0444); diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h index fdc3aaa4b10b..f378102778e3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.h +++ b/drivers/infiniband/hw/hns/hns_roce_dca.h @@ -18,11 +18,6 @@ struct hns_dca_page_state {
extern const struct uapi_definition hns_roce_dca_uapi_defs[];
-struct hns_dca_shrink_resp { - u64 free_key; /* free buffer's key which registered by the user */ - u32 free_mems; /* free buffer count which no any QP be using */ -}; - #define HNS_DCA_INVALID_BUF_ID 0UL
/* @@ -46,6 +41,7 @@ struct hns_dca_attach_attr { };
struct hns_dca_attach_resp { +#define HNS_DCA_ATTACH_FLAGS_NEW_BUFFER BIT(0) u32 alloc_flags; u32 alloc_pages; }; @@ -54,14 +50,27 @@ struct hns_dca_detach_attr { u32 sq_idx; };
+typedef int (*hns_dca_enum_callback)(struct hns_dca_page_state *, u32, void *); + +void hns_roce_init_dca(struct hns_roce_dev *hr_dev); +void hns_roce_cleanup_dca(struct hns_roce_dev *hr_dev); + void hns_roce_register_udca(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx); void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx);
-void hns_roce_enable_dca(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp); +int hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_udata *udata); void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_udata *udata); -void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); + +int hns_roce_dca_attach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct hns_dca_attach_attr *attr); +void hns_roce_dca_detach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct hns_dca_detach_attr *attr); + +void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_udata *udata); + #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ac9dcdf59887..b1c1f640a7a0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -130,6 +130,15 @@ enum hns_roce_event { HNS_ROCE_EVENT_TYPE_INVALID_XRCETH = 0x17, };
+/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. + * + * These flags are intended for internal use by the hns driver, and they + * rely on the range reserved for that use in the ib_qp_create_flags enum. + */ +enum hns_roce_qp_create_flags { + HNS_ROCE_QP_CREATE_DCA_EN = IB_QP_CREATE_RESERVED_START, +}; + enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), @@ -210,6 +219,9 @@ struct hns_roce_dca_ctx { unsigned int free_mems; /* free mem num in pool */ size_t free_size; /* free mem size in pool */ size_t total_size; /* total size in pool */ + size_t max_size; /* max size the pool can expand to */ + size_t min_size; /* shrink if @free_size > @min_size */ + unsigned int unit_size; /* unit size per DCA mem */ };
struct hns_roce_ucontext { @@ -314,20 +326,15 @@ struct hns_roce_mtr { struct hns_roce_hem_cfg hem_cfg; /* config for hardware addressing */ };
+/* DCA config */ struct hns_roce_dca_cfg { - spinlock_t lock; - u32 buf_id; - u16 attach_count; - u32 npages; - u32 sq_idx; - struct delayed_work dwork; -}; - -/* DCA attr for setting WQE buffer */ -struct hns_roce_dca_attr { - u32 sq_offset; - u32 sge_offset; - u32 rq_offset; + spinlock_t lock; + u32 buf_id; + u16 attach_count; + void **buf_list; + u32 npages; + u32 sq_idx; + struct delayed_work dwork; };
struct hns_roce_mw { @@ -367,6 +374,7 @@ struct hns_roce_wq { u32 max_gs; u32 rsv_sge; u32 offset; + int wqe_offset; u32 wqe_shift; /* WQE size */ u32 head; u32 tail; @@ -378,6 +386,7 @@ struct hns_roce_sge { unsigned int sge_cnt; /* SGE num */ u32 offset; u32 sge_shift; /* SGE size */ + int wqe_offset; };
struct hns_roce_buf_list { @@ -912,8 +921,7 @@ struct hns_roce_hw { struct hns_roce_hem_table *table, int obj, u32 step_idx); int (*set_dca_buf)(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, - struct hns_roce_dca_attr *attr); + struct hns_roce_qp *hr_qp); bool (*chk_dca_buf_inactive)(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr, @@ -941,6 +949,11 @@ struct hns_roce_dev { struct ib_device ib_dev; struct pci_dev *pci_dev; struct device *dev; + void *dbgfs; /* debugfs for this dev */ + + struct list_head uctx_list; /* list of all uctx on this dev */ + spinlock_t uctx_list_lock; /* protect @uctx_list */ + struct hns_roce_uar priv_uar; const char *irq_names[HNS_ROCE_MAX_IRQ_NUM]; spinlock_t sm_lock; @@ -963,6 +976,8 @@ struct hns_roce_dev { struct hns_roce_caps caps; struct xarray qp_table_xa;
+ struct hns_roce_dca_ctx dca_ctx; + unsigned char dev_addr[HNS_ROCE_MAX_PORTS][ETH_ALEN]; u64 sys_image_guid; u32 vendor_id; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f14a8e41aafa..c4b61266a242 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -376,11 +376,64 @@ static inline bool check_qp_dca_enable(struct hns_roce_qp *hr_qp) return !!(hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA); }
+static int dca_attach_qp_buf(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp) +{ + struct hns_dca_attach_attr attr = {}; + unsigned long flags_sq, flags_rq; + u32 idx; + + spin_lock_irqsave(&hr_qp->sq.lock, flags_sq); + spin_lock_irqsave(&hr_qp->rq.lock, flags_rq); + + if (hr_qp->sq.wqe_cnt > 0) { + idx = hr_qp->sq.head & (hr_qp->sq.wqe_cnt - 1); + attr.sq_offset = idx << hr_qp->sq.wqe_shift; + } + + if (hr_qp->sge.sge_cnt > 0) { + idx = hr_qp->next_sge & (hr_qp->sge.sge_cnt - 1); + attr.sge_offset = idx << hr_qp->sge.sge_shift; + } + + if (hr_qp->rq.wqe_cnt > 0) { + idx = hr_qp->rq.head & (hr_qp->rq.wqe_cnt - 1); + attr.rq_offset = idx << hr_qp->rq.wqe_shift; + } + + spin_unlock_irqrestore(&hr_qp->rq.lock, flags_rq); + spin_unlock_irqrestore(&hr_qp->sq.lock, flags_sq); + + return hns_roce_dca_attach(hr_dev, hr_qp, &attr); +} + +static void dca_detach_qp_buf(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp) +{ + struct hns_dca_detach_attr attr = {}; + unsigned long flags_sq, flags_rq; + bool is_empty; + + spin_lock_irqsave(&hr_qp->sq.lock, flags_sq); + spin_lock_irqsave(&hr_qp->rq.lock, flags_rq); + is_empty = hr_qp->sq.head == hr_qp->sq.tail && + hr_qp->rq.head == hr_qp->rq.tail; + if (is_empty && hr_qp->sq.wqe_cnt > 0) + attr.sq_idx = hr_qp->sq.head & (hr_qp->sq.wqe_cnt - 1); + + spin_unlock_irqrestore(&hr_qp->rq.lock, flags_rq); + spin_unlock_irqrestore(&hr_qp->sq.lock, flags_sq); + + if (is_empty) + hns_roce_dca_detach(hr_dev, hr_qp, &attr); +} + static int check_send_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { struct ib_device *ibdev = &hr_dev->ib_dev; struct ib_qp *ibqp = &hr_qp->ibqp; + int ret;
if (unlikely(ibqp->qp_type != IB_QPT_RC && ibqp->qp_type != IB_QPT_GSI && @@ -400,6 +453,16 @@ static int check_send_valid(struct hns_roce_dev *hr_dev, return -EIO; }
+ if (check_qp_dca_enable(hr_qp)) { + ret = dca_attach_qp_buf(hr_dev, hr_qp); + if (unlikely(ret)) { + ibdev_err(&hr_dev->ib_dev, + "failed to attach DCA for QP-%ld send!\n", + hr_qp->qpn); + return ret; + } + } + return 0; }
@@ -586,6 +649,14 @@ static int set_rc_opcode(struct hns_roce_dev *hr_dev, return ret; }
+static inline void fill_dca_fields(struct hns_roce_qp *hr_qp, + struct hns_roce_v2_rc_send_wqe *wqe) +{ + hr_reg_write(wqe, RC_SEND_WQE_SQPN_L, hr_qp->qpn); + hr_reg_write(wqe, RC_SEND_WQE_SQPN_H, + hr_qp->qpn >> V2_RC_SEND_WQE_BYTE_4_SQPN_L_W); +} + static inline int set_rc_wqe(struct hns_roce_qp *qp, const struct ib_send_wr *wr, void *wqe, unsigned int *sge_idx, @@ -622,6 +693,9 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe, &curr_idx, valid_num_sge);
+ if (qp->en_flags & HNS_ROCE_QP_CAP_DCA) + fill_dca_fields(qp, rc_sq_wqe); + /* * The pipeline can sequentially post all valid WQEs into WQ buffer, * including new WQEs waiting for the doorbell to update the PI again. @@ -706,12 +780,26 @@ static void write_dwqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, hns_roce_write512(hr_dev, wqe, qp->sq.db_reg); }
+static int check_sq_enabled(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, + const struct ib_send_wr *wr, int nreq) +{ + if (hns_roce_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) + return -ENOMEM; + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + ibdev_err(&hr_dev->ib_dev, "num_sge=%d > qp->sq.max_gs=%u\n", + wr->num_sge, qp->sq.max_gs); + return -EINVAL; + } + + return 0; +} + static int hns_roce_v2_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); - struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_qp *qp = to_hr_qp(ibqp); unsigned long flags = 0; unsigned int owner_bit; @@ -721,34 +809,25 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, u32 nreq; int ret;
- spin_lock_irqsave(&qp->sq.lock, flags);
ret = check_send_valid(hr_dev, qp); if (unlikely(ret)) { *bad_wr = wr; - nreq = 0; - goto out; + return ret; }
+ spin_lock_irqsave(&qp->sq.lock, flags); sge_idx = qp->next_sge;
for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (hns_roce_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { - ret = -ENOMEM; + ret = check_sq_enabled(hr_dev, qp, wr, nreq); + if (unlikely(ret)) { *bad_wr = wr; goto out; }
wqe_idx = (qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1);
- if (unlikely(wr->num_sge > qp->sq.max_gs)) { - ibdev_err(ibdev, "num_sge = %d > qp->sq.max_gs = %u.\n", - wr->num_sge, qp->sq.max_gs); - ret = -EINVAL; - *bad_wr = wr; - goto out; - } - wqe = hns_roce_get_send_wqe(qp, wqe_idx); qp->sq.wrid[wqe_idx] = wr->wr_id; owner_bit = @@ -787,6 +866,7 @@ static int check_recv_valid(struct hns_roce_dev *hr_dev, { struct ib_device *ibdev = &hr_dev->ib_dev; struct ib_qp *ibqp = &hr_qp->ibqp; + int ret;
if (unlikely(ibqp->qp_type != IB_QPT_RC && ibqp->qp_type != IB_QPT_GSI && @@ -802,6 +882,16 @@ static int check_recv_valid(struct hns_roce_dev *hr_dev, if (hr_qp->state == IB_QPS_RESET) return -EINVAL;
+ if (check_qp_dca_enable(hr_qp)) { + ret = dca_attach_qp_buf(hr_dev, hr_qp); + if (unlikely(ret)) { + ibdev_err(ibdev, + "failed to attach DCA for QP-%lu recv!\n", + hr_qp->qpn); + return ret; + } + } + return 0; }
@@ -852,15 +942,15 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, unsigned long flags; int ret;
- spin_lock_irqsave(&hr_qp->rq.lock, flags);
ret = check_recv_valid(hr_dev, hr_qp); if (unlikely(ret)) { *bad_wr = wr; - nreq = 0; - goto out; + return ret; }
+ spin_lock_irqsave(&hr_qp->rq.lock, flags); + max_sge = hr_qp->rq.max_gs - hr_qp->rq.rsv_sge; for (nreq = 0; wr; ++nreq, wr = wr->next) { if (unlikely(hns_roce_wq_overflow(&hr_qp->rq, nreq, @@ -2088,7 +2178,8 @@ static void set_default_caps(struct hns_roce_dev *hr_dev)
if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { caps->flags |= HNS_ROCE_CAP_FLAG_STASH | - HNS_ROCE_CAP_FLAG_DIRECT_WQE; + HNS_ROCE_CAP_FLAG_DIRECT_WQE | + HNS_ROCE_CAP_FLAG_DCA_MODE; caps->max_sq_inline = HNS_ROCE_V3_MAX_SQ_INLINE; } else { caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE; @@ -4162,6 +4253,7 @@ static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries, struct hns_roce_qp *cur_qp = NULL; unsigned long flags; int npolled; + int ret;
spin_lock_irqsave(&hr_cq->lock, flags);
@@ -4178,7 +4270,10 @@ static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries, }
for (npolled = 0; npolled < num_entries; ++npolled) { - if (hns_roce_v2_poll_one(hr_cq, &cur_qp, wc + npolled)) + ret = hns_roce_v2_poll_one(hr_cq, &cur_qp, wc + npolled); + if (cur_qp && check_qp_dca_enable(cur_qp)) + dca_detach_qp_buf(hr_dev, cur_qp); + if (ret) break; }
@@ -4548,15 +4643,14 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask, - struct hns_roce_dca_attr *dca_attr) + struct hns_roce_v2_qp_context *qpc_mask) { u64 mtts[MTT_MIN_COUNT] = { 0 }; u64 wqe_sge_ba; int count;
/* Search qp buf's mtts */ - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, dca_attr->rq_offset, + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.wqe_offset, mtts, ARRAY_SIZE(mtts), &wqe_sge_ba); if (hr_qp->rq.wqe_cnt && count < 1) { ibdev_err(&hr_dev->ib_dev, @@ -4623,8 +4717,7 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, static int config_qp_sq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask, - struct hns_roce_dca_attr *dca_attr) + struct hns_roce_v2_qp_context *qpc_mask) { struct ib_device *ibdev = &hr_dev->ib_dev; u64 sge_cur_blk = 0; @@ -4632,7 +4725,7 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev, int count;
/* search qp buf's mtts */ - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, dca_attr->sq_offset, + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->sq.wqe_offset, &sq_cur_blk, 1, NULL); if (count < 1) { ibdev_err(ibdev, "failed to find QP(0x%lx) SQ buf.\n", @@ -4641,8 +4734,8 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev, } if (hr_qp->sge.sge_cnt > 0) { count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, - dca_attr->sge_offset, &sge_cur_blk, 1, - NULL); + hr_qp->sge.wqe_offset, &sge_cur_blk, + 1, NULL); if (count < 1) { ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf.\n", hr_qp->qpn); @@ -4700,7 +4793,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct ib_device *ibdev = &hr_dev->ib_dev; - struct hns_roce_dca_attr dca_attr = {}; dma_addr_t trrl_ba; dma_addr_t irrl_ba; enum ib_mtu ib_mtu; @@ -4712,8 +4804,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, int port; int ret;
- dca_attr.rq_offset = hr_qp->rq.offset; - ret = config_qp_rq_buf(hr_dev, hr_qp, context, qpc_mask, &dca_attr); + hr_qp->rq.wqe_offset = hr_qp->rq.offset; + ret = config_qp_rq_buf(hr_dev, hr_qp, context, qpc_mask); if (ret) { ibdev_err(ibdev, "failed to config rq buf, ret = %d.\n", ret); return ret; @@ -4859,7 +4951,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct ib_device *ibdev = &hr_dev->ib_dev; - struct hns_roce_dca_attr dca_attr = {}; int ret;
/* Not support alternate path and path migration */ @@ -4868,9 +4959,9 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, return -EINVAL; }
- dca_attr.sq_offset = hr_qp->sq.offset; - dca_attr.sge_offset = hr_qp->sge.offset; - ret = config_qp_sq_buf(hr_dev, hr_qp, context, qpc_mask, &dca_attr); + hr_qp->sq.wqe_offset = hr_qp->sq.offset; + hr_qp->sge.wqe_offset = hr_qp->sge.offset; + ret = config_qp_sq_buf(hr_dev, hr_qp, context, qpc_mask); if (ret) { ibdev_err(ibdev, "failed to config sq buf, ret = %d.\n", ret); return ret; @@ -5530,83 +5621,38 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
if (check_qp_dca_enable(hr_qp) && (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) - hns_roce_dca_kick(hr_dev, hr_qp); + hns_roce_dca_kick(hr_dev, hr_qp, udata);
out: return ret; }
-static int init_dca_buf_attr(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, - struct hns_roce_dca_attr *init_attr, - struct hns_roce_dca_attr *dca_attr) -{ - struct ib_device *ibdev = &hr_dev->ib_dev; - - if (hr_qp->sq.wqe_cnt > 0) { - dca_attr->sq_offset = hr_qp->sq.offset + init_attr->sq_offset; - if (dca_attr->sq_offset >= hr_qp->sge.offset) { - ibdev_err(ibdev, "failed to check SQ offset = %u\n", - init_attr->sq_offset); - return -EINVAL; - } - } - - if (hr_qp->sge.sge_cnt > 0) { - dca_attr->sge_offset = hr_qp->sge.offset + init_attr->sge_offset; - if (dca_attr->sge_offset >= hr_qp->rq.offset) { - ibdev_err(ibdev, "failed to check exSGE offset = %u\n", - init_attr->sge_offset); - return -EINVAL; - } - } - - if (hr_qp->rq.wqe_cnt > 0) { - dca_attr->rq_offset = hr_qp->rq.offset + init_attr->rq_offset; - if (dca_attr->rq_offset >= hr_qp->buff_size) { - ibdev_err(ibdev, "failed to check RQ offset = %u\n", - init_attr->rq_offset); - return -EINVAL; - } - } - - return 0; -} - static int hns_roce_v2_set_dca_buf(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, - struct hns_roce_dca_attr *init_attr) + struct hns_roce_qp *hr_qp) { struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_v2_qp_context *qpc, *msk; - struct hns_roce_dca_attr dca_attr = {}; struct hns_roce_mbox_msg mbox_msg = {}; dma_addr_t dma_handle; int qpc_sz; int ret;
- ret = init_dca_buf_attr(hr_dev, hr_qp, init_attr, &dca_attr); - if (ret) { - ibdev_err(ibdev, "failed to init DCA attr, ret = %d.\n", ret); - return ret; - } - qpc_sz = hr_dev->caps.qpc_sz; WARN_ON(2 * qpc_sz > HNS_ROCE_MAILBOX_SIZE); - qpc = dma_pool_alloc(hr_dev->cmd.pool, GFP_NOWAIT, &dma_handle); + qpc = dma_pool_alloc(hr_dev->cmd.pool, GFP_ATOMIC, &dma_handle); if (!qpc) return -ENOMEM;
msk = (struct hns_roce_v2_qp_context *)((void *)qpc + qpc_sz); memset(msk, 0xff, qpc_sz);
- ret = config_qp_rq_buf(hr_dev, hr_qp, qpc, msk, &dca_attr); + ret = config_qp_rq_buf(hr_dev, hr_qp, qpc, msk); if (ret) { ibdev_err(ibdev, "failed to config rq qpc, ret = %d.\n", ret); goto done; }
- ret = config_qp_sq_buf(hr_dev, hr_qp, qpc, msk, &dca_attr); + ret = config_qp_sq_buf(hr_dev, hr_qp, qpc, msk); if (ret) { ibdev_err(ibdev, "failed to config sq qpc, ret = %d.\n", ret); goto done; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index e01d24f95933..28381993278f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -920,6 +920,8 @@ struct hns_roce_v2_rc_send_wqe { #define RC_SEND_WQE_OPCODE RC_SEND_WQE_FIELD_LOC(4, 0) #define RC_SEND_WQE_DB_SL_L RC_SEND_WQE_FIELD_LOC(6, 5) #define RC_SEND_WQE_DB_SL_H RC_SEND_WQE_FIELD_LOC(14, 13) +#define RC_SEND_WQE_SQPN_L RC_SEND_WQE_FIELD_LOC(6, 5) +#define RC_SEND_WQE_SQPN_H RC_SEND_WQE_FIELD_LOC(30, 13) #define RC_SEND_WQE_OWNER RC_SEND_WQE_FIELD_LOC(7, 7) #define RC_SEND_WQE_CQE RC_SEND_WQE_FIELD_LOC(8, 8) #define RC_SEND_WQE_FENCE RC_SEND_WQE_FIELD_LOC(9, 9) @@ -933,6 +935,8 @@ struct hns_roce_v2_rc_send_wqe { #define RC_SEND_WQE_MSG_START_SGE_IDX RC_SEND_WQE_FIELD_LOC(151, 128) #define RC_SEND_WQE_INL_TYPE RC_SEND_WQE_FIELD_LOC(159, 159)
+#define V2_RC_SEND_WQE_BYTE_4_SQPN_L_W 2 + struct hns_roce_wqe_frmr_seg { __le32 pbl_size; __le32 byte_40; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d9d787fbc70b..d14eaecdbf15 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -945,6 +945,14 @@ static int hns_roce_init_hem(struct hns_roce_dev *hr_dev) return ret; }
+static void hns_roce_teardown_hca(struct hns_roce_dev *hr_dev) +{ + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE) + hns_roce_cleanup_dca(hr_dev); + + hns_roce_cleanup_bitmap(hr_dev); +} + /** * hns_roce_setup_hca - setup host channel adapter * @hr_dev: pointer to hns roce device @@ -957,6 +965,14 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
spin_lock_init(&hr_dev->sm_lock);
+ INIT_LIST_HEAD(&hr_dev->qp_list); + spin_lock_init(&hr_dev->qp_list_lock); + INIT_LIST_HEAD(&hr_dev->dip_list); + spin_lock_init(&hr_dev->dip_list_lock); + + INIT_LIST_HEAD(&hr_dev->uctx_list); + spin_lock_init(&hr_dev->uctx_list_lock); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) { INIT_LIST_HEAD(&hr_dev->pgdir_list); @@ -990,6 +1006,9 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) hns_roce_init_srq_table(hr_dev); }
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE) + hns_roce_init_dca(hr_dev); + return 0;
err_uar_table_free: @@ -1014,7 +1033,7 @@ static void check_and_get_armed_cq(struct list_head *cq_list, struct ib_cq *cq)
void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev) { - struct hns_roce_qp *hr_qp; + struct hns_roce_qp *hr_qp, *hr_qp_next; struct hns_roce_cq *hr_cq; struct list_head cq_list; unsigned long flags_qp; @@ -1023,7 +1042,7 @@ void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev) INIT_LIST_HEAD(&cq_list);
spin_lock_irqsave(&hr_dev->qp_list_lock, flags); - list_for_each_entry(hr_qp, &hr_dev->qp_list, node) { + list_for_each_entry_safe(hr_qp, hr_qp_next, &hr_dev->qp_list, node) { spin_lock_irqsave(&hr_qp->sq.lock, flags_qp); if (hr_qp->sq.tail != hr_qp->sq.head) check_and_get_armed_cq(&cq_list, hr_qp->ibqp.send_cq); @@ -1102,11 +1121,6 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) } }
- INIT_LIST_HEAD(&hr_dev->qp_list); - spin_lock_init(&hr_dev->qp_list_lock); - INIT_LIST_HEAD(&hr_dev->dip_list); - spin_lock_init(&hr_dev->dip_list_lock); - ret = hns_roce_register_device(hr_dev); if (ret) goto error_failed_register_device; @@ -1118,7 +1132,7 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) hr_dev->hw->hw_exit(hr_dev);
error_failed_engine_init: - hns_roce_cleanup_bitmap(hr_dev); + hns_roce_teardown_hca(hr_dev);
error_failed_setup_hca: hns_roce_cleanup_hem(hr_dev); @@ -1144,7 +1158,7 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev)
if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); - hns_roce_cleanup_bitmap(hr_dev); + hns_roce_teardown_hca(hr_dev); hns_roce_cleanup_hem(hr_dev);
if (hr_dev->cmd_mod) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 30c2f5e8e84a..111a397544d7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -652,16 +652,12 @@ static inline int mtr_check_direct_pages(dma_addr_t *pages, int page_count, static void mtr_free_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr) { /* release user buffers */ - if (mtr->umem) { - ib_umem_release(mtr->umem); - mtr->umem = NULL; - } + ib_umem_release(mtr->umem); + mtr->umem = NULL;
/* release kernel buffers */ - if (mtr->kmem) { - hns_roce_buf_free(hr_dev, mtr->kmem); - mtr->kmem = NULL; - } + hns_roce_buf_free(hr_dev, mtr->kmem); + mtr->kmem = NULL; }
static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 36868618cd51..302f2ea75749 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -639,7 +639,9 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, return 0; }
-static bool check_dca_is_enable(struct hns_roce_dev *hr_dev, bool is_user, +static bool check_dca_is_enable(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct ib_qp_init_attr *init_attr, bool is_user, unsigned long addr) { if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE)) @@ -649,6 +651,12 @@ static bool check_dca_is_enable(struct hns_roce_dev *hr_dev, bool is_user, if (is_user) return !addr;
+ /* Only RC and XRC support DCA for kernel QP */ + if (hr_dev->dca_ctx.max_size > 0 && + (init_attr->qp_type == IB_QPT_RC || + init_attr->qp_type == IB_QPT_XRC_INI)) + return !!(init_attr->create_flags & HNS_ROCE_QP_CREATE_DCA_EN); + return false; }
@@ -772,8 +780,13 @@ static int alloc_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, int ret;
if (dca_en) { - /* DCA must be enabled after the buffer size is configured. */ - hns_roce_enable_dca(hr_dev, hr_qp); + /* DCA must be enabled after the buffer attr is configured. */ + ret = hns_roce_enable_dca(hr_dev, hr_qp, udata); + if (ret) { + ibdev_err(ibdev, "failed to enable DCA, ret = %d.\n", + ret); + return ret; + }
hr_qp->en_flags |= HNS_ROCE_QP_CAP_DCA; } else { @@ -815,7 +828,7 @@ static int alloc_qp_wqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, bool dca_en; int ret;
- dca_en = check_dca_is_enable(hr_dev, !!udata, addr); + dca_en = check_dca_is_enable(hr_dev, hr_qp, init_attr, !!udata, addr); ret = set_wqe_buf_attr(hr_dev, hr_qp, dca_en, &buf_attr); if (ret) { ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret); @@ -1509,9 +1522,18 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, } }
+static inline void *dca_buf_offset(struct hns_roce_dca_cfg *dca_cfg, u32 offset) +{ + return (char *)(dca_cfg->buf_list[offset >> HNS_HW_PAGE_SHIFT]) + + (offset & ((1 << HNS_HW_PAGE_SHIFT) - 1)); +} + static inline void *get_wqe(struct hns_roce_qp *hr_qp, u32 offset) { - return hns_roce_buf_offset(hr_qp->mtr.kmem, offset); + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) + return dca_buf_offset(&hr_qp->dca_cfg, offset); + else + return hns_roce_buf_offset(hr_qp->mtr.kmem, offset); }
void *hns_roce_get_recv_wqe(struct hns_roce_qp *hr_qp, unsigned int n) diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 3429df900cdf..1faa11b8060b 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -150,6 +150,7 @@ enum hns_ib_dca_mem_methods {
enum hns_ib_dca_mem_reg_attrs { HNS_IB_ATTR_DCA_MEM_REG_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + HNS_IB_ATTR_DCA_MEM_REG_FLAGS, HNS_IB_ATTR_DCA_MEM_REG_LEN, HNS_IB_ATTR_DCA_MEM_REG_ADDR, HNS_IB_ATTR_DCA_MEM_REG_KEY, @@ -166,8 +167,6 @@ enum hns_ib_dca_mem_shrink_attrs { HNS_IB_ATTR_DCA_MEM_SHRINK_OUT_FREE_MEMS, };
-#define HNS_IB_ATTACH_FLAGS_NEW_BUFFER 1U - enum hns_ib_dca_mem_attach_attrs { HNS_IB_ATTR_DCA_MEM_ATTACH_HANDLE = (1U << UVERBS_ID_NS_SHIFT), HNS_IB_ATTR_DCA_MEM_ATTACH_SQ_OFFSET,
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
This patch synchronizes the DCA code from CI and is based on the RFC v2 series from the community, including DCA kernel support and debugfs support.
Add a group of debugfs files for DCA memory pool statistics.
The debugfs entries for DCA memory statistics include:
hns_roce/<ibdev_name>/dca/qp       : show all DCA QPs for each device.
hns_roce/<ibdev_name>/dca/pool     : show all DCA mem for each device.
hns_roce/<ibdev_name>/<pid>/qp     : show all active DCA QPs for one process.
hns_roce/<ibdev_name>/<pid>/mstats : show DCA mem info for one process.
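As an illustrative usage sketch (not part of this patch), the per-device pool
statistics could be read from userspace as shown below. It assumes debugfs is
mounted at /sys/kernel/debug; the device name "hns_0" is only a placeholder,
and the column layout follows dca_stats_dev_pool_in_seqfile() in this patch.

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	/* The ibdev name is an assumption; pass the real one as argv[1]. */
	const char *ibdev = argc > 1 ? argv[1] : "hns_0";
	char path[256];
	char line[256];
	FILE *fp;

	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/hns_roce/%s/dca/pool", ibdev);
	fp = fopen(path, "r");
	if (!fp) {
		perror(path);
		return EXIT_FAILURE;
	}

	/* Columns: PID, Total(kB), Free(kB), Clean(BLK), Loading */
	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);

	fclose(fp);
	return EXIT_SUCCESS;
}

The other entries listed above can be read in the same way.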
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/Makefile | 2 +- drivers/infiniband/hw/hns/hns_roce_dca.c | 26 + drivers/infiniband/hw/hns/hns_roce_dca.h | 2 + drivers/infiniband/hw/hns/hns_roce_debugfs.c | 590 +++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_debugfs.h | 16 + drivers/infiniband/hw/hns/hns_roce_device.h | 3 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 + drivers/infiniband/hw/hns/hns_roce_main.c | 19 + 8 files changed, 660 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/hns/hns_roce_debugfs.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_debugfs.h
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index a55bcceeef98..c92dd14dc717 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -8,7 +8,7 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_bond.o hns_roce_dca.o + hns_roce_bond.o hns_roce_dca.o hns_roce_debugfs.o
ifdef CONFIG_INFINIBAND_HNS_HIP08 hns-roce-hw-v2-objs := hns_roce_hw_v2.o $(hns-roce-objs) diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c index f33a59ef3bc3..2c13c619ee32 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.c +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -1609,6 +1609,32 @@ const struct uapi_definition hns_roce_dca_uapi_defs[] = { {} };
+/* enum DCA pool */ +struct dca_mem_enum_attr { + void *param; + hns_dca_enum_callback enum_fn; +}; + +static int enum_dca_pool_proc(struct dca_mem *mem, int index, void *param) +{ + struct dca_mem_enum_attr *attr = param; + int ret; + + ret = attr->enum_fn(mem->states, mem->page_count, attr->param); + + return ret ? DCA_MEM_STOP_ITERATE : DCA_MEM_NEXT_ITERATE; +} + +void hns_roce_enum_dca_pool(struct hns_roce_dca_ctx *dca_ctx, void *param, + hns_dca_enum_callback cb) +{ + struct dca_mem_enum_attr attr; + + attr.enum_fn = cb; + attr.param = param; + travel_dca_pages(dca_ctx, &attr, enum_dca_pool_proc); +} + module_param(dca_unit_size, uint, 0444); module_param(dca_max_size, ulong, 0444); module_param(dca_min_size, ulong, 0444); diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h index f378102778e3..11bade706bd7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.h +++ b/drivers/infiniband/hw/hns/hns_roce_dca.h @@ -73,4 +73,6 @@ void hns_roce_dca_detach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_udata *udata);
+void hns_roce_enum_dca_pool(struct hns_roce_dca_ctx *dca_ctx, void *param, + hns_dca_enum_callback cb); #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c new file mode 100644 index 000000000000..eedb24ee103e --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c @@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2022 Hisilicon Limited. + */ + +#include <linux/debugfs.h> +#include <linux/device.h> + +#include "hns_roce_common.h" +#include "hns_roce_device.h" +#include "hns_roce_dca.h" +#include "hns_roce_debugfs.h" + +static struct dentry *hns_roce_dbgfs_root; + +#define KB 1024 + +/* debugfs seqfile */ +struct hns_debugfs_seqfile { + struct dentry *entry; + int (*read)(struct seq_file *seq, void *data); + void *data; +}; + +static int hns_debugfs_seqfile_open(struct inode *inode, struct file *f) +{ + struct hns_debugfs_seqfile *seqfile = inode->i_private; + + return single_open(f, seqfile->read, seqfile->data); +} + +static const struct file_operations hns_debugfs_seqfile_fops = { + .owner = THIS_MODULE, + .open = hns_debugfs_seqfile_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek +}; + +static void init_debugfs_seqfile(struct hns_debugfs_seqfile *seq, + const char *name, struct dentry *parent, + int (*read_fn)(struct seq_file *, void *), + void *data) +{ + struct dentry *entry; + + entry = debugfs_create_file(name, 0400, parent, seq, + &hns_debugfs_seqfile_fops); + if (IS_ERR(entry)) + return; + + seq->read = read_fn; + seq->data = data; + seq->entry = entry; +} + +static void cleanup_debugfs_seqfile(struct hns_debugfs_seqfile *seq) +{ + debugfs_remove(seq->entry); + seq->entry = NULL; +} + +/* DCA debugfs */ +struct hns_dca_ctx_debugfs { + struct dentry *root; /* pool debugfs entry */ + struct hns_debugfs_seqfile mem; /* mems in pool */ + struct hns_debugfs_seqfile qp; /* QPs stats in pool */ +}; + +struct hns_dca_debugfs { + struct dentry *root; /* dev debugfs entry */ + struct hns_debugfs_seqfile pool; /* pools stats on device */ + struct hns_debugfs_seqfile qp; /* QPs stats on device */ + struct hns_dca_ctx_debugfs kctx; /* kDCA context */ +}; + +/* Debugfs for device */ +struct hns_roce_dev_debugfs { + struct dentry *root; + struct hns_dca_debugfs *dca_root; +}; + +struct dca_mem_stats { + unsigned int total_mems; + unsigned int clean_mems; + size_t free_size; + size_t total_size; + size_t active_size; + size_t locked_size; +}; + +#define DCA_CTX_PID_LEN 10 +#define DCA_CTX_STATE_LEN 22 + +#define LOADING_PERCENT_SCALE 100 +#define LOADING_PERCENT_SHIFT 2 + +static int stats_dca_pool_proc(struct hns_dca_page_state *states, u32 count, + void *param) +{ + struct dca_mem_stats *stats = param; + struct hns_dca_page_state *s; + int i, free_pages; + + free_pages = 0; + for (i = 0; i < count; i++) { + s = &states[i]; + if (s->buf_id == HNS_DCA_INVALID_BUF_ID) { + free_pages++; + stats->free_size += HNS_HW_PAGE_SIZE; + } else { + if (s->lock) + stats->locked_size += HNS_HW_PAGE_SIZE; + + if (s->active) + stats->active_size += HNS_HW_PAGE_SIZE; + } + } + + stats->total_size += (count * HNS_HW_PAGE_SIZE); + stats->total_mems++; + if (free_pages == count) + stats->clean_mems++; + + return 0; +} + +/* stats QPs in DCA pool */ +struct dca_stats_qp_attr { + unsigned long *qpn_bitmap; + unsigned int qpn_max; +}; + +static int stats_dca_qp_proc(struct hns_dca_page_state *states, u32 count, + void *param) +{ + struct dca_stats_qp_attr *attr = param; + struct 
hns_dca_page_state *s; + u32 qpn; + int i; + + for (i = 0; i < count; i++) { + s = &states[i]; + if (s->buf_id == HNS_DCA_INVALID_BUF_ID || s->lock || + !s->active) + continue; + + qpn = HNS_DCA_BUF_ID_TO_QPN(s->buf_id); + if (qpn < attr->qpn_max) + set_bit(qpn, attr->qpn_bitmap); + } + + return 0; +} + +static void dca_ctx_stats_qp(struct hns_roce_dca_ctx *ctx, + unsigned long *qpn_bitmap, unsigned int qpn_max) +{ + struct dca_stats_qp_attr attr; + + attr.qpn_bitmap = qpn_bitmap; + attr.qpn_max = qpn_max; + hns_roce_enum_dca_pool(ctx, &attr, stats_dca_qp_proc); +} + +static void dca_ctx_stats_mem(struct hns_roce_dca_ctx *ctx, + struct dca_mem_stats *stats) +{ + hns_roce_enum_dca_pool(ctx, stats, stats_dca_pool_proc); +} + +static void dca_setup_pool_name(pid_t pid, bool is_kdca, char *name, int size) +{ + if (is_kdca) + snprintf(name, size, "kernel"); + else + snprintf(name, size, "%d", pid); +} + +static u64 calc_loading_percent(size_t total, size_t free, u32 *out_rem) +{ + u32 all_pages, used_pages, free_pages, scale; + u64 percent = 0; + u32 rem = 0; + + all_pages = total >> HNS_HW_PAGE_SHIFT; + free_pages = free >> HNS_HW_PAGE_SHIFT; + if (all_pages >= free_pages) { + used_pages = all_pages - free_pages; + scale = LOADING_PERCENT_SCALE * LOADING_PERCENT_SCALE; + percent = (used_pages * scale) / all_pages; + percent = div_u64_rem(percent, LOADING_PERCENT_SCALE, &rem); + } + + if (out_rem) + *out_rem = rem; + + return percent; +} + +static void dca_print_pool_stats(struct hns_roce_dca_ctx *ctx, pid_t pid, + bool is_kdca, struct seq_file *file) +{ + char name[DCA_CTX_PID_LEN]; + u64 percent; + u32 rem = 0; + + percent = calc_loading_percent(ctx->total_size, ctx->free_size, &rem); + dca_setup_pool_name(pid, is_kdca, name, sizeof(name)); + seq_printf(file, "%-10s %-16ld %-16ld %-16u %llu.%0*u\n", name, + ctx->total_size / KB, ctx->free_size / KB, ctx->free_mems, + percent, LOADING_PERCENT_SHIFT, rem); +} + +static void dca_stats_dev_pool_in_seqfile(struct hns_roce_dev *hr_dev, + struct seq_file *file) +{ + struct hns_roce_ucontext *uctx, *tmp; + + seq_printf(file, "%-10s %-16s %-16s %-16s %-s\n", "PID", "Total(kB)", + "Free(kB)", "Clean(BLK)", "Loading"); + + /* Write kernel DCA pool stats */ + dca_print_pool_stats(&hr_dev->dca_ctx, 0, true, file); + /* Write user DCA pool stats */ + spin_lock(&hr_dev->uctx_list_lock); + list_for_each_entry_safe(uctx, tmp, &hr_dev->uctx_list, list) { + spin_unlock(&hr_dev->uctx_list_lock); + dca_print_pool_stats(&uctx->dca_ctx, uctx->pid, false, file); + spin_lock(&hr_dev->uctx_list_lock); + } + spin_unlock(&hr_dev->uctx_list_lock); +} + +struct dca_qp_stats { + char name[DCA_CTX_PID_LEN]; + char state[DCA_CTX_STATE_LEN]; + u32 qpn; + u32 total_size; + u32 sq_size; + u32 rq_size; + u32 sge_size; +}; + +static void dca_setup_qp_state(struct hns_roce_qp *hr_qp, char *buf, int size) +{ + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg; + + if (cfg->buf_id == HNS_DCA_INVALID_BUF_ID) + snprintf(buf, size, "detached"); + else if (hr_qp->rq.wqe_cnt > 0) + snprintf(buf, size, "stable"); + else + snprintf(buf, size, "attached-%-u", cfg->attach_count); +} + +static void dca_setup_qp_stats(struct hns_roce_qp *hr_qp, + struct dca_qp_stats *stats) +{ + struct hns_roce_ucontext *uctx = NULL; + + if (!(hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) || !hr_qp->ibqp.pd) + return; + + if (hr_qp->ibqp.pd->uobject) + uctx = to_hr_ucontext(hr_qp->ibqp.pd->uobject->context); + + dca_setup_pool_name(uctx ? 
uctx->pid : 0, !uctx, stats->name, + sizeof(stats->name)); + stats->qpn = (u32)hr_qp->qpn; + stats->total_size = hr_qp->buff_size; + + stats->sq_size = to_hr_hem_entries_size(hr_qp->sq.wqe_cnt, + hr_qp->sq.wqe_shift); + stats->sge_size = to_hr_hem_entries_size(hr_qp->sge.sge_cnt, + hr_qp->sge.sge_shift); + stats->rq_size = to_hr_hem_entries_size(hr_qp->rq.wqe_cnt, + hr_qp->rq.wqe_shift); + + dca_setup_qp_state(hr_qp, stats->state, sizeof(stats->state)); +} + +static void dca_stats_dev_qp_in_seqfile(struct hns_roce_dev *hr_dev, + struct seq_file *file) +{ + struct dca_qp_stats stats; + struct hns_roce_qp *hr_qp; + unsigned long id; + + seq_printf(file, "%-10s %-10s %-10s %s\n", "QPN", "Size(kB)", "PID", + "State"); + + xa_lock(&hr_dev->qp_table_xa); + xa_for_each(&hr_dev->qp_table_xa, id, hr_qp) { + stats.total_size = 0; + dca_setup_qp_stats(hr_qp, &stats); + if (!stats.total_size) + continue; + + xa_unlock(&hr_dev->qp_table_xa); + seq_printf(file, "%-10u %-10u %-10s %-s\n", stats.qpn, + stats.total_size / KB, stats.name, stats.state); + xa_lock(&hr_dev->qp_table_xa); + } + xa_unlock(&hr_dev->qp_table_xa); +} + +static void dca_stats_ctx_qp_in_seqfile(struct hns_roce_dev *hr_dev, + struct hns_roce_dca_ctx *ctx, + struct seq_file *file) +{ + struct dca_qp_stats stats; + struct hns_roce_qp *hr_qp; + unsigned int qpn, nbits; + unsigned long *bitmap; + + nbits = hr_dev->caps.num_qps; + if (nbits < 1) + return; + + bitmap = bitmap_zalloc(nbits, GFP_ATOMIC); + if (!bitmap) + return; + + seq_printf(file, "%-10s %-10s %-10s %-10s %-10s\n", "QPN", "Total(kB)", + "SQ(kB)", "SGE(kB)", "RQ(kB)"); + + dca_ctx_stats_qp(ctx, bitmap, nbits); + for_each_set_bit(qpn, bitmap, nbits) { + stats.total_size = 0; + xa_lock(&hr_dev->qp_table_xa); + hr_qp = __hns_roce_qp_lookup(hr_dev, qpn); + if (hr_qp) + dca_setup_qp_stats(hr_qp, &stats); + xa_unlock(&hr_dev->qp_table_xa); + if (!stats.total_size) + continue; + + seq_printf(file, "%-10u %-10u %-10u %-10u %-10u\n", + stats.qpn, stats.total_size / KB, stats.sq_size / KB, + stats.sge_size / KB, stats.rq_size / KB); + } + bitmap_free(bitmap); +} + +static void dca_stats_ctx_mem_in_seqfile(struct hns_roce_dca_ctx *ctx, + bool is_kdca, struct seq_file *file) +{ + struct dca_mem_stats stats = {}; + u64 percent; + u32 rem = 0; + +#define DCA_STAT_NAME_FMT "%-22s " +#define dca_ctx_print_mem_size(f, n, fmt, v) \ + seq_printf(f, DCA_STAT_NAME_FMT fmt "\n", n, v) + +#define dca_ctx_print_mem_kb(f, n, v) \ + dca_ctx_print_mem_size(f, n, "%-u kB", (u32)((v) / KB)) + + dca_ctx_stats_mem(ctx, &stats); + percent = calc_loading_percent(stats.total_size, stats.free_size, &rem); + seq_printf(file, DCA_STAT_NAME_FMT "%llu.%0*u\n", "Loading:", percent, + LOADING_PERCENT_SHIFT, rem); + dca_ctx_print_mem_kb(file, "Total:", stats.total_size); + dca_ctx_print_mem_kb(file, "Free:", stats.free_size); + dca_ctx_print_mem_kb(file, "Active:", stats.active_size); + dca_ctx_print_mem_kb(file, "Locked:", stats.locked_size); + dca_ctx_print_mem_size(file, "Dirty:", "%-u Blocks", + stats.total_mems - stats.clean_mems); + dca_ctx_print_mem_size(file, "Clean:", "%-u Blocks", stats.clean_mems); + if (is_kdca) { + dca_ctx_print_mem_size(file, "Unit:", "%-u", ctx->unit_size); + dca_ctx_print_mem_size(file, "Max:", "%-zu", ctx->max_size); + dca_ctx_print_mem_size(file, "Min:", "%-zu", ctx->min_size); + } +} + +static int dca_debugfs_pool_show(struct seq_file *file, void *offset) +{ + struct hns_roce_dev *hr_dev = file->private; + + dca_stats_dev_pool_in_seqfile(hr_dev, file); + return 0; +} + +static int 
dca_debugfs_qp_show(struct seq_file *file, void *offset) +{ + struct hns_roce_dev *hr_dev = file->private; + + dca_stats_dev_qp_in_seqfile(hr_dev, file); + return 0; +} + +static int dca_debugfs_kctx_qp_stats_show(struct seq_file *file, void *offset) +{ + struct hns_roce_dev *hr_dev = file->private; + + dca_stats_ctx_qp_in_seqfile(hr_dev, &hr_dev->dca_ctx, file); + return 0; +} + +static int dca_debugfs_uctx_qp_stats_show(struct seq_file *file, void *offset) +{ + struct hns_roce_ucontext *uctx = file->private; + + dca_stats_ctx_qp_in_seqfile(to_hr_dev(uctx->ibucontext.device), + &uctx->dca_ctx, file); + return 0; +} + +static int dca_debugfs_kctx_mem_stats_show(struct seq_file *file, void *offset) +{ + struct hns_roce_dev *hr_dev = file->private; + + dca_stats_ctx_mem_in_seqfile(&hr_dev->dca_ctx, true, file); + return 0; +} + +static int dca_debugfs_uctx_mem_stats_show(struct seq_file *file, void *offset) +{ + struct hns_roce_ucontext *uctx = file->private; + + dca_stats_ctx_mem_in_seqfile(&uctx->dca_ctx, false, file); + return 0; +} + +static void init_dca_ctx_debugfs(struct hns_dca_ctx_debugfs *dbgfs, + struct dentry *parent, + struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx) +{ + char name[DCA_CTX_PID_LEN]; + + if (IS_ERR_OR_NULL(parent)) + return; + + dca_setup_pool_name(uctx ? uctx->pid : 0, !uctx, name, sizeof(name)); + dbgfs->root = debugfs_create_dir(name, parent); + if (IS_ERR_OR_NULL(dbgfs->root)) + return; + + if (uctx) { + init_debugfs_seqfile(&dbgfs->mem, "mstats", dbgfs->root, + dca_debugfs_uctx_mem_stats_show, uctx); + init_debugfs_seqfile(&dbgfs->qp, "qp", dbgfs->root, + dca_debugfs_uctx_qp_stats_show, uctx); + } else { + init_debugfs_seqfile(&dbgfs->mem, "mstats", dbgfs->root, + dca_debugfs_kctx_mem_stats_show, hr_dev); + init_debugfs_seqfile(&dbgfs->qp, "qp", dbgfs->root, + dca_debugfs_kctx_qp_stats_show, hr_dev); + } +} + +static void cleanup_dca_ctx_debugfs(struct hns_dca_ctx_debugfs *ctx_dbgfs) +{ + cleanup_debugfs_seqfile(&ctx_dbgfs->qp); + cleanup_debugfs_seqfile(&ctx_dbgfs->mem); + debugfs_remove_recursive(ctx_dbgfs->root); +} + +static struct hns_dca_debugfs * +create_dca_debugfs(struct hns_roce_dev *hr_dev, struct dentry *parent) +{ + struct hns_dca_debugfs *dbgfs; + + if (IS_ERR(parent)) + return NULL; + + dbgfs = kzalloc(sizeof(*dbgfs), GFP_KERNEL); + if (!dbgfs) + return NULL; + + dbgfs->root = debugfs_create_dir("dca", parent); + if (IS_ERR_OR_NULL(dbgfs->root)) { + kfree(dbgfs); + return NULL; + } + + init_debugfs_seqfile(&dbgfs->pool, "pool", dbgfs->root, + dca_debugfs_pool_show, hr_dev); + init_debugfs_seqfile(&dbgfs->qp, "qp", dbgfs->root, + dca_debugfs_qp_show, hr_dev); + + init_dca_ctx_debugfs(&dbgfs->kctx, dbgfs->root, hr_dev, NULL); + + return dbgfs; +} + +static void destroy_dca_debugfs(struct hns_dca_debugfs *dca_dbgfs) +{ + cleanup_dca_ctx_debugfs(&dca_dbgfs->kctx); + cleanup_debugfs_seqfile(&dca_dbgfs->pool); + cleanup_debugfs_seqfile(&dca_dbgfs->qp); + debugfs_remove_recursive(dca_dbgfs->root); + kfree(dca_dbgfs); +} + +/* debugfs for ucontext */ +void hns_roce_register_uctx_debugfs(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx) +{ + struct hns_roce_dev_debugfs *dev_dbgfs = hr_dev->dbgfs; + struct hns_dca_debugfs *dca_dbgfs; + + if (!dev_dbgfs) + return; + + dca_dbgfs = dev_dbgfs->dca_root; + if (dca_dbgfs) { + uctx->dca_dbgfs = kzalloc(sizeof(struct hns_dca_ctx_debugfs), + GFP_KERNEL); + if (!uctx->dca_dbgfs) + return; + + init_dca_ctx_debugfs(uctx->dca_dbgfs, dca_dbgfs->root, + hr_dev, uctx); + } +} + +void 
hns_roce_unregister_uctx_debugfs(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx) +{ + struct hns_dca_ctx_debugfs *dbgfs = uctx->dca_dbgfs; + + if (dbgfs) { + uctx->dca_dbgfs = NULL; + cleanup_dca_ctx_debugfs(dbgfs); + kfree(dbgfs); + } +} + +/* debugfs for device */ +void hns_roce_register_debugfs(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_dev_debugfs *dbgfs; + + if (IS_ERR_OR_NULL(hns_roce_dbgfs_root)) + return; + + dbgfs = kzalloc(sizeof(*dbgfs), GFP_KERNEL); + if (!dbgfs) + return; + + dbgfs->root = debugfs_create_dir(dev_name(&hr_dev->ib_dev.dev), + hns_roce_dbgfs_root); + if (IS_ERR(dbgfs->root)) { + kfree(dbgfs); + return; + } + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DCA_MODE) + dbgfs->dca_root = create_dca_debugfs(hr_dev, dbgfs->root); + + hr_dev->dbgfs = dbgfs; +} + +void hns_roce_unregister_debugfs(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_dev_debugfs *dbgfs; + + if (IS_ERR_OR_NULL(hns_roce_dbgfs_root)) + return; + + dbgfs = hr_dev->dbgfs; + if (!dbgfs) + return; + + hr_dev->dbgfs = NULL; + + if (dbgfs->dca_root) { + destroy_dca_debugfs(dbgfs->dca_root); + dbgfs->dca_root = NULL; + } + + debugfs_remove_recursive(dbgfs->root); + kfree(dbgfs); +} + +/* debugfs for hns module */ +void hns_roce_init_debugfs(void) +{ + hns_roce_dbgfs_root = debugfs_create_dir("hns_roce", NULL); +} + +void hns_roce_cleanup_debugfs(void) +{ + debugfs_remove_recursive(hns_roce_dbgfs_root); + hns_roce_dbgfs_root = NULL; +} diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.h b/drivers/infiniband/hw/hns/hns_roce_debugfs.h new file mode 100644 index 000000000000..33b911eb1c0c --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +// Copyright (c) 2016-2017 Hisilicon Limited. + +#ifndef __HNS_ROCE_DEBUGFS_H +#define __HNS_ROCE_DEBUGFS_H + +void hns_roce_init_debugfs(void); +void hns_roce_cleanup_debugfs(void); +void hns_roce_register_debugfs(struct hns_roce_dev *hr_dev); +void hns_roce_unregister_debugfs(struct hns_roce_dev *hr_dev); +void hns_roce_register_uctx_debugfs(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx); +void hns_roce_unregister_uctx_debugfs(struct hns_roce_dev *hr_dev, + struct hns_roce_ucontext *uctx); + +#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index b1c1f640a7a0..e8ba256a1885 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -226,12 +226,15 @@ struct hns_roce_dca_ctx {
struct hns_roce_ucontext { struct ib_ucontext ibucontext; + struct list_head list; /* link all uctx to uctx_list on hr_dev */ + pid_t pid; /* process id to which the uctx belongs */ struct hns_roce_uar uar; struct list_head page_list; struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; u32 config; struct hns_roce_dca_ctx dca_ctx; + void *dca_dbgfs; };
struct hns_roce_pd { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c4b61266a242..dfdbcaf6508c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -49,6 +49,7 @@ #include "hns_roce_hem.h" #include "hns_roce_dca.h" #include "hns_roce_hw_v2.h" +#include "hns_roce_debugfs.h"
enum { CMD_RST_PRC_OTHERS, @@ -7419,12 +7420,14 @@ static struct hnae3_client hns_roce_hw_v2_client = {
static int __init hns_roce_hw_v2_init(void) { + hns_roce_init_debugfs(); return hnae3_register_client(&hns_roce_hw_v2_client); }
static void __exit hns_roce_hw_v2_exit(void) { hnae3_unregister_client(&hns_roce_hw_v2_client); + hns_roce_cleanup_debugfs(); }
module_init(hns_roce_hw_v2_init); diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d14eaecdbf15..fe4ad13654cf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -44,6 +44,7 @@ #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" #include "hns_roce_dca.h" +#include "hns_roce_debugfs.h"
static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -401,6 +402,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, if (!hr_dev->active) return -EAGAIN;
+ context->pid = current->pid; + INIT_LIST_HEAD(&context->list); + ret = ib_copy_from_udata(&ucmd, udata, min(udata->inlen, sizeof(ucmd))); if (ret) @@ -450,6 +454,12 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, if (ret) goto error_fail_copy_to_udata;
+ spin_lock(&hr_dev->uctx_list_lock); + list_add(&context->list, &hr_dev->uctx_list); + spin_unlock(&hr_dev->uctx_list_lock); + + hns_roce_register_uctx_debugfs(hr_dev, context); + return 0;
error_fail_copy_to_udata: @@ -469,6 +479,12 @@ static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibcontext->device);
+ spin_lock(&hr_dev->uctx_list_lock); + list_del(&context->list); + spin_unlock(&hr_dev->uctx_list_lock); + + hns_roce_unregister_uctx_debugfs(hr_dev, context); + hns_roce_unregister_udca(hr_dev, context);
hns_roce_dealloc_uar_entry(context); @@ -1125,6 +1141,8 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) if (ret) goto error_failed_register_device;
+ hns_roce_register_debugfs(hr_dev); + return 0;
error_failed_register_device: @@ -1155,6 +1173,7 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) void hns_roce_exit(struct hns_roce_dev *hr_dev) { hns_roce_unregister_device(hr_dev); + hns_roce_unregister_debugfs(hr_dev);
if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev);
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
Use shared memory to store the DCA status. The size of the shared memory is derived from the max QP number passed in the ucontext alloc parameters, and the status area is exposed to userspace via mmap.
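For reference, a minimal userspace-side sketch of how a provider could map and read this status area, assuming the dca_mmap_key/dca_mmap_size fields from the alloc_ucontext response and the layout used in this patch (attach-status bits in the first half of the mapped area, one bit per QP indexed by its dcan). The structure and helper names below are illustrative only, not part of the patch or of rdma-core.

/*
 * Illustrative userspace sketch (not part of this patch): map the DCA
 * status area exported via dca_mmap_key and test a QP's attach bit.
 * Field names follow the uAPI added above; helpers are hypothetical.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

#define ULONG_BITS (8 * sizeof(unsigned long))

struct hns_dca_shmem {
	unsigned long *buf_status;  /* first half: per-QP attach status bits */
	unsigned long *sync_status; /* second half: per-QP free-in-progress bits */
	size_t size;
};

/* cmd_fd: verbs device fd; key/size come from the alloc_ucontext resp */
static int map_dca_status(int cmd_fd, uint64_t dca_mmap_key,
			  uint32_t dca_mmap_size, struct hns_dca_shmem *shm)
{
	void *addr = mmap(NULL, dca_mmap_size, PROT_READ | PROT_WRITE,
			  MAP_SHARED, cmd_fd, dca_mmap_key);

	if (addr == MAP_FAILED)
		return -1;

	shm->buf_status = addr;
	shm->sync_status = (unsigned long *)((char *)addr + dca_mmap_size / 2);
	shm->size = dca_mmap_size;
	return 0;
}

/* One status bit per QP: bit 'dcan' set means the WQE buffer is attached */
static bool dca_buf_attached(const struct hns_dca_shmem *shm, uint32_t dcan)
{
	return shm->buf_status[dcan / ULONG_BITS] &
	       (1UL << (dcan % ULONG_BITS));
}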
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_dca.c | 338 +++++++++++++++---- drivers/infiniband/hw/hns/hns_roce_dca.h | 8 +- drivers/infiniband/hw/hns/hns_roce_debugfs.c | 3 +- drivers/infiniband/hw/hns/hns_roce_device.h | 22 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 30 +- drivers/infiniband/hw/hns/hns_roce_main.c | 66 +++- drivers/infiniband/hw/hns/hns_roce_qp.c | 59 ++-- include/uapi/rdma/hns-abi.h | 14 +- 8 files changed, 435 insertions(+), 105 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.c b/drivers/infiniband/hw/hns/hns_roce_dca.c index 2c13c619ee32..d79c90ef33ed 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.c +++ b/drivers/infiniband/hw/hns/hns_roce_dca.c @@ -143,7 +143,7 @@ static void *alloc_dca_pages(struct hns_roce_dev *hr_dev, bool is_user, }
mem->page_count = kmem->npages; - /* Override the attr->size by actually alloced size */ + /* Overwrite the attr->size by actually alloced size */ attr->size = kmem->ntrunks << kmem->trunk_shift; return kmem;
@@ -731,6 +731,72 @@ static int active_alloced_buf(struct hns_roce_qp *hr_qp, return ret; }
+#define DCAN_TO_SYNC_BIT(n) ((n) * HNS_DCA_BITS_PER_STATUS) +#define DCAN_TO_STAT_BIT(n) DCAN_TO_SYNC_BIT(n) +static bool start_free_dca_buf(struct hns_roce_dca_ctx *ctx, u32 dcan) +{ + unsigned long *st = ctx->sync_status; + + if (st && dcan < ctx->max_qps) + return !test_and_set_bit_lock(DCAN_TO_SYNC_BIT(dcan), st); + + return true; +} + +static void stop_free_dca_buf(struct hns_roce_dca_ctx *ctx, u32 dcan) +{ + unsigned long *st = ctx->sync_status; + + if (st && dcan < ctx->max_qps) + clear_bit_unlock(DCAN_TO_SYNC_BIT(dcan), st); +} + +static void update_dca_buf_status(struct hns_roce_dca_ctx *ctx, u32 dcan, + bool en) +{ + unsigned long *st = ctx->buf_status; + + if (st && dcan < ctx->max_qps) { + if (en) + set_bit(DCAN_TO_STAT_BIT(dcan), st); + else + clear_bit(DCAN_TO_STAT_BIT(dcan), st); + + /* sync status with user-space rdma */ + smp_mb__after_atomic(); + } +} + +static void restart_aging_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_dca_ctx *ctx) +{ + spin_lock(&ctx->aging_lock); + ctx->exit_aging = false; + if (!list_empty(&ctx->aging_new_list)) + queue_delayed_work(hr_dev->irq_workq, &ctx->aging_dwork, + msecs_to_jiffies(DCA_MEM_AGEING_MSES)); + + spin_unlock(&ctx->aging_lock); +} + +static void stop_aging_dca_mem(struct hns_roce_dca_ctx *ctx, + struct hns_roce_dca_cfg *cfg, bool stop_worker) +{ + spin_lock(&ctx->aging_lock); + if (stop_worker) { + ctx->exit_aging = true; + cancel_delayed_work(&ctx->aging_dwork); + } + + spin_lock(&cfg->lock); + + if (!list_empty(&cfg->aging_node)) + list_del_init(&cfg->aging_node); + + spin_unlock(&cfg->lock); + spin_unlock(&ctx->aging_lock); +} + static int attach_dca_mem(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_dca_attach_attr *attr, @@ -741,8 +807,8 @@ static int attach_dca_mem(struct hns_roce_dev *hr_dev, u32 buf_id; int ret;
- /* Stop DCA mem ageing worker */ - cancel_delayed_work(&cfg->dwork); + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DYNAMIC_CTX_DETACH) + stop_aging_dca_mem(ctx, cfg, false); resp->alloc_flags = 0;
spin_lock(&cfg->lock); @@ -779,6 +845,7 @@ static int attach_dca_mem(struct hns_roce_dev *hr_dev,
resp->alloc_flags |= HNS_DCA_ATTACH_FLAGS_NEW_BUFFER; resp->alloc_pages = cfg->npages; + update_dca_buf_status(ctx, cfg->dcan, true);
return 0; } @@ -831,6 +898,7 @@ static void free_buf_from_dca_mem(struct hns_roce_dca_ctx *ctx, unsigned long flags; u32 buf_id;
+ update_dca_buf_status(ctx, cfg->dcan, false); spin_lock(&cfg->lock); buf_id = cfg->buf_id; cfg->buf_id = HNS_DCA_INVALID_BUF_ID; @@ -849,19 +917,22 @@ static void free_buf_from_dca_mem(struct hns_roce_dca_ctx *ctx, spin_unlock_irqrestore(&ctx->pool_lock, flags); }
-static void detach_dca_mem(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, - struct hns_dca_detach_attr *attr) +void hns_roce_dca_detach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct hns_dca_detach_attr *attr) { + struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_dev, hr_qp); struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
- /* Start an ageing worker to free buffer */ - cancel_delayed_work(&cfg->dwork); + stop_aging_dca_mem(ctx, cfg, true); + + spin_lock(&ctx->aging_lock); spin_lock(&cfg->lock); cfg->sq_idx = attr->sq_idx; - queue_delayed_work(hr_dev->irq_workq, &cfg->dwork, - msecs_to_jiffies(DCA_MEM_AGEING_MSES)); + list_add_tail(&cfg->aging_node, &ctx->aging_new_list); spin_unlock(&cfg->lock); + spin_unlock(&ctx->aging_lock); + + restart_aging_dca_mem(hr_dev, ctx); }
struct dca_mem_shrink_attr { @@ -924,11 +995,87 @@ static void shrink_dca_mem(struct hns_roce_dev *hr_dev, resp->free_key = attr.shrink_key; }
-static void init_dca_context(struct hns_roce_dca_ctx *ctx) +static void process_aging_dca_mem(struct hns_roce_dev *hr_dev, + struct hns_roce_dca_ctx *ctx) +{ + struct hns_roce_dca_cfg *cfg, *tmp_cfg; + struct hns_roce_qp *hr_qp; + + spin_lock(&ctx->aging_lock); + list_for_each_entry_safe(cfg, tmp_cfg, &ctx->aging_new_list, aging_node) + list_move(&cfg->aging_node, &ctx->aging_proc_list); + + while (!ctx->exit_aging && !list_empty(&ctx->aging_proc_list)) { + cfg = list_first_entry(&ctx->aging_proc_list, + struct hns_roce_dca_cfg, aging_node); + list_del_init_careful(&cfg->aging_node); + hr_qp = container_of(cfg, struct hns_roce_qp, dca_cfg); + spin_unlock(&ctx->aging_lock); + + if (start_free_dca_buf(ctx, cfg->dcan)) { + if (hr_dev->hw->chk_dca_buf_inactive(hr_dev, hr_qp)) + free_buf_from_dca_mem(ctx, cfg); + + stop_free_dca_buf(ctx, cfg->dcan); + } + + spin_lock(&ctx->aging_lock); + + spin_lock(&cfg->lock); + + if (cfg->buf_id != HNS_DCA_INVALID_BUF_ID) + list_move(&cfg->aging_node, &ctx->aging_new_list); + + spin_unlock(&cfg->lock); + } + spin_unlock(&ctx->aging_lock); +} + +static void udca_mem_aging_work(struct work_struct *work) +{ + struct hns_roce_dca_ctx *ctx = container_of(work, + struct hns_roce_dca_ctx, aging_dwork.work); + struct hns_roce_ucontext *uctx = container_of(ctx, + struct hns_roce_ucontext, dca_ctx); + struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device); + + cancel_delayed_work(&ctx->aging_dwork); + process_aging_dca_mem(hr_dev, ctx); + if (!ctx->exit_aging) + restart_aging_dca_mem(hr_dev, ctx); +} + +static void remove_unused_dca_mem(struct hns_roce_dev *hr_dev); + +static void kdca_mem_aging_work(struct work_struct *work) +{ + struct hns_roce_dca_ctx *ctx = container_of(work, + struct hns_roce_dca_ctx, aging_dwork.work); + struct hns_roce_dev *hr_dev = container_of(ctx, struct hns_roce_dev, + dca_ctx); + + cancel_delayed_work(&ctx->aging_dwork); + process_aging_dca_mem(hr_dev, ctx); + remove_unused_dca_mem(hr_dev); + if (!ctx->exit_aging) + restart_aging_dca_mem(hr_dev, ctx); +} + +static void init_dca_context(struct hns_roce_dca_ctx *ctx, bool is_user) { INIT_LIST_HEAD(&ctx->pool); spin_lock_init(&ctx->pool_lock); ctx->total_size = 0; + + ida_init(&ctx->ida); + INIT_LIST_HEAD(&ctx->aging_new_list); + INIT_LIST_HEAD(&ctx->aging_proc_list); + spin_lock_init(&ctx->aging_lock); + ctx->exit_aging = false; + if (is_user) + INIT_DELAYED_WORK(&ctx->aging_dwork, udca_mem_aging_work); + else + INIT_DELAYED_WORK(&ctx->aging_dwork, kdca_mem_aging_work); }
static void cleanup_dca_context(struct hns_roce_dev *hr_dev, @@ -938,6 +1085,10 @@ static void cleanup_dca_context(struct hns_roce_dev *hr_dev, unsigned long flags; bool is_user;
+ spin_lock(&ctx->aging_lock); + cancel_delayed_work_sync(&ctx->aging_dwork); + spin_unlock(&ctx->aging_lock); + is_user = (ctx != &hr_dev->dca_ctx); spin_lock_irqsave(&ctx->pool_lock, flags); list_for_each_entry_safe(mem, tmp, &ctx->pool, list) { @@ -963,7 +1114,7 @@ static uint dca_unit_size; static ulong dca_min_size = DCA_MAX_MEM_SIZE; static ulong dca_max_size = DCA_MAX_MEM_SIZE;
-static void config_kdca_context(struct hns_roce_dca_ctx *ctx) +static void load_kdca_param(struct hns_roce_dca_ctx *ctx) { unsigned int unit_size;
@@ -985,9 +1136,8 @@ static void config_kdca_context(struct hns_roce_dca_ctx *ctx)
void hns_roce_init_dca(struct hns_roce_dev *hr_dev) { - init_dca_context(&hr_dev->dca_ctx); - - config_kdca_context(&hr_dev->dca_ctx); + load_kdca_param(&hr_dev->dca_ctx); + init_dca_context(&hr_dev->dca_ctx, false); }
void hns_roce_cleanup_dca(struct hns_roce_dev *hr_dev) @@ -995,22 +1145,68 @@ void hns_roce_cleanup_dca(struct hns_roce_dev *hr_dev) cleanup_dca_context(hr_dev, &hr_dev->dca_ctx); }
-void hns_roce_register_udca(struct hns_roce_dev *hr_dev, +static void init_udca_status(struct hns_roce_ucontext *uctx, int udca_max_qps, + unsigned int dev_max_qps) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device); + const unsigned int bits_per_qp = 2 * HNS_DCA_BITS_PER_STATUS; + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); + struct ib_ucontext *ib_uctx = &uctx->ibucontext; + void *kaddr; + size_t size; + + size = BITS_TO_BYTES(udca_max_qps * bits_per_qp); + ctx->status_npage = DIV_ROUND_UP(size, PAGE_SIZE); + + size = ctx->status_npage * PAGE_SIZE; + ctx->max_qps = min_t(unsigned int, dev_max_qps, + size * BITS_PER_BYTE / bits_per_qp); + + kaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); + if (!kaddr) + return; + + ctx->dca_mmap_entry = hns_roce_user_mmap_entry_insert(ib_uctx, + (u64)kaddr, size, HNS_ROCE_MMAP_TYPE_DCA); + if (!ctx->dca_mmap_entry) { + free_pages_exact(kaddr, size); + return; + } + + ctx->buf_status = (unsigned long *)kaddr; + ctx->sync_status = (unsigned long *)(kaddr + size / 2); +} + +void hns_roce_register_udca(struct hns_roce_dev *hr_dev, int max_qps, struct hns_roce_ucontext *uctx) { + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); + if (!(uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) return;
- init_dca_context(&uctx->dca_ctx); + init_dca_context(ctx, true); + if (max_qps > 0) + init_udca_status(uctx, max_qps, hr_dev->caps.num_qps); }
void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx) { + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); + if (!(uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) return;
- cleanup_dca_context(hr_dev, &uctx->dca_ctx); + cleanup_dca_context(hr_dev, ctx); + + if (ctx->buf_status) { + free_pages_exact(ctx->buf_status, + ctx->status_npage * PAGE_SIZE); + ctx->buf_status = NULL; + } + + ida_destroy(&ctx->ida); }
static struct dca_mem *key_to_dca_mem(struct list_head *head, u64 key) @@ -1227,6 +1423,7 @@ static void remove_unused_dca_mem(struct hns_roce_dev *hr_dev) spin_unlock_irqrestore(&ctx->pool_lock, flags); if (!mem) break; + unregister_dca_mem(hr_dev, NULL, mem); free_dca_mem(mem); /* No more free memory */ @@ -1235,52 +1432,56 @@ static void remove_unused_dca_mem(struct hns_roce_dev *hr_dev) } }
-static void kick_dca_mem(struct hns_roce_dev *hr_dev, +static void kick_dca_buf(struct hns_roce_dev *hr_dev, struct hns_roce_dca_cfg *cfg, - struct hns_roce_ucontext *uctx) + struct hns_roce_dca_ctx *ctx) { - struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); - - /* Stop ageing worker and free DCA buffer from pool */ - cancel_delayed_work_sync(&cfg->dwork); + stop_aging_dca_mem(ctx, cfg, true); free_buf_from_dca_mem(ctx, cfg); + restart_aging_dca_mem(hr_dev, ctx);
/* Shrink kenrel DCA mem */ - if (!uctx) + if (ctx == &hr_dev->dca_ctx) remove_unused_dca_mem(hr_dev); }
-static void dca_mem_ageing_work(struct work_struct *work) +static u32 alloc_dca_num(struct hns_roce_dca_ctx *ctx) { - struct hns_roce_qp *hr_qp = container_of(work, struct hns_roce_qp, - dca_cfg.dwork.work); - struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); - struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_dev, hr_qp); - bool hw_is_inactive; + int ret;
- hw_is_inactive = hr_dev->hw->chk_dca_buf_inactive && - hr_dev->hw->chk_dca_buf_inactive(hr_dev, hr_qp); - if (hw_is_inactive) - free_buf_from_dca_mem(ctx, &hr_qp->dca_cfg); + ret = ida_alloc_max(&ctx->ida, ctx->max_qps - 1, GFP_KERNEL); + if (ret < 0) + return HNS_DCA_INVALID_DCA_NUM;
- /* Shrink kenrel DCA mem */ - if (!hr_qp->ibqp.uobject) - remove_unused_dca_mem(hr_dev); + stop_free_dca_buf(ctx, ret); + update_dca_buf_status(ctx, ret, false); + return ret; }
-void hns_roce_dca_detach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, - struct hns_dca_detach_attr *attr) +static void free_dca_num(u32 dcan, struct hns_roce_dca_ctx *ctx) { - detach_dca_mem(hr_dev, hr_qp, attr); + if (dcan == HNS_DCA_INVALID_DCA_NUM) + return; + + ida_free(&ctx->ida, dcan); }
-void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, - struct ib_udata *udata) +static int setup_kdca(struct hns_roce_dca_cfg *cfg) { - struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, - struct hns_roce_ucontext, ibucontext); + if (!cfg->npages) + return -EINVAL; + + cfg->buf_list = kcalloc(cfg->npages, sizeof(void *), GFP_KERNEL); + if (!cfg->buf_list) + return -ENOMEM;
- kick_dca_mem(hr_dev, &hr_qp->dca_cfg, uctx); + return 0; +} + +static void teardown_kdca(struct hns_roce_dca_cfg *cfg) +{ + kfree(cfg->buf_list); + cfg->buf_list = NULL; }
int hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, @@ -1289,17 +1490,16 @@ int hns_roce_enable_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
spin_lock_init(&cfg->lock); - INIT_DELAYED_WORK(&cfg->dwork, dca_mem_ageing_work); + INIT_LIST_HEAD(&cfg->aging_node); cfg->buf_id = HNS_DCA_INVALID_BUF_ID; cfg->npages = hr_qp->buff_size >> HNS_HW_PAGE_SHIFT; + cfg->dcan = HNS_DCA_INVALID_DCA_NUM; + /* Cannot support dynamic detach when rq is not empty */ + if (!hr_qp->rq.wqe_cnt) + hr_qp->en_flags |= HNS_ROCE_QP_CAP_DYNAMIC_CTX_DETACH;
- /* DCA page list for kernel QP */ - if (!udata && cfg->npages) { - cfg->buf_list = kcalloc(cfg->npages, sizeof(void *), - GFP_KERNEL); - if (!cfg->buf_list) - return -ENOMEM; - } + if (!udata) + return setup_kdca(cfg);
return 0; } @@ -1309,14 +1509,32 @@ void hns_roce_disable_dca(struct hns_roce_dev *hr_dev, { struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, ibucontext); + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
- kick_dca_mem(hr_dev, cfg, uctx); + kick_dca_buf(hr_dev, cfg, ctx); + free_dca_num(cfg->dcan, ctx); + cfg->dcan = HNS_DCA_INVALID_DCA_NUM; + + if (!udata) + teardown_kdca(&hr_qp->dca_cfg); +} + +void hns_roce_modify_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_udata *udata) +{ + struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, + struct hns_roce_ucontext, ibucontext); + struct hns_roce_dca_ctx *ctx = to_hr_dca_ctx(hr_dev, uctx); + struct hns_roce_dca_cfg *cfg = &hr_qp->dca_cfg;
- /* Free kenrel DCA buffer list */ - if (!udata && cfg->buf_list) { - kfree(cfg->buf_list); - cfg->buf_list = NULL; + if (hr_qp->state == IB_QPS_RESET || hr_qp->state == IB_QPS_ERR) { + kick_dca_buf(hr_dev, cfg, ctx); + free_dca_num(cfg->dcan, ctx); + cfg->dcan = HNS_DCA_INVALID_DCA_NUM; + } else if (hr_qp->state == IB_QPS_RTR) { + free_dca_num(cfg->dcan, ctx); + cfg->dcan = alloc_dca_num(ctx); } }
@@ -1520,7 +1738,7 @@ static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_DETACH)( if (ret) return ret;
- detach_dca_mem(to_hr_dev(hr_qp->ibqp.device), hr_qp, &attr); + hns_roce_dca_detach(to_hr_dev(hr_qp->ibqp.device), hr_qp, &attr);
return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_dca.h b/drivers/infiniband/hw/hns/hns_roce_dca.h index 11bade706bd7..7733887ce5e1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_dca.h +++ b/drivers/infiniband/hw/hns/hns_roce_dca.h @@ -19,6 +19,7 @@ struct hns_dca_page_state { extern const struct uapi_definition hns_roce_dca_uapi_defs[];
#define HNS_DCA_INVALID_BUF_ID 0UL +#define HNS_DCA_INVALID_DCA_NUM ~0U
/* * buffer id(29b) = tag(7b) + owner(22b) @@ -55,7 +56,7 @@ typedef int (*hns_dca_enum_callback)(struct hns_dca_page_state *, u32, void *); void hns_roce_init_dca(struct hns_roce_dev *hr_dev); void hns_roce_cleanup_dca(struct hns_roce_dev *hr_dev);
-void hns_roce_register_udca(struct hns_roce_dev *hr_dev, +void hns_roce_register_udca(struct hns_roce_dev *hr_dev, int max_qps, struct hns_roce_ucontext *uctx); void hns_roce_unregister_udca(struct hns_roce_dev *hr_dev, struct hns_roce_ucontext *uctx); @@ -69,9 +70,8 @@ int hns_roce_dca_attach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_dca_attach_attr *attr); void hns_roce_dca_detach(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_dca_detach_attr *attr); - -void hns_roce_dca_kick(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, - struct ib_udata *udata); +void hns_roce_modify_dca(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + struct ib_udata *udata);
void hns_roce_enum_dca_pool(struct hns_roce_dca_ctx *dca_ctx, void *param, hns_dca_enum_callback cb); diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c index eedb24ee103e..cacdeb4d9fad 100644 --- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c @@ -259,7 +259,8 @@ static void dca_setup_qp_stats(struct hns_roce_qp *hr_qp, { struct hns_roce_ucontext *uctx = NULL;
- if (!(hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) || !hr_qp->ibqp.pd) + if (!(hr_qp->en_flags & HNS_ROCE_QP_CAP_DYNAMIC_CTX_ATTACH) || + !hr_qp->ibqp.pd) return;
if (hr_qp->ibqp.pd->uobject) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index e8ba256a1885..5b5f6c5920f1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -205,6 +205,7 @@ struct hns_roce_uar { enum hns_roce_mmap_type { HNS_ROCE_MMAP_TYPE_DB = 1, HNS_ROCE_MMAP_TYPE_DWQE, + HNS_ROCE_MMAP_TYPE_DCA, };
struct hns_user_mmap_entry { @@ -222,6 +223,21 @@ struct hns_roce_dca_ctx { size_t max_size; /* max size the pool can expand to */ size_t min_size; /* shrink if @free_size > @min_size */ unsigned int unit_size; /* unit size per DCA mem */ + + unsigned int max_qps; + unsigned int status_npage; + struct ida ida; + +#define HNS_DCA_BITS_PER_STATUS 1 + unsigned long *buf_status; + unsigned long *sync_status; + + bool exit_aging; + struct list_head aging_proc_list; + struct list_head aging_new_list; + spinlock_t aging_lock; + struct delayed_work aging_dwork; + struct hns_user_mmap_entry *dca_mmap_entry; };
struct hns_roce_ucontext { @@ -332,12 +348,14 @@ struct hns_roce_mtr { /* DCA config */ struct hns_roce_dca_cfg { spinlock_t lock; - u32 buf_id; u16 attach_count; + u32 buf_id; + u32 dcan; void **buf_list; u32 npages; u32 sq_idx; - struct delayed_work dwork; + bool aging_enable; + struct list_head aging_node; };
struct hns_roce_mw { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dfdbcaf6508c..728f860ffc99 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -372,9 +372,9 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, return 0; }
-static inline bool check_qp_dca_enable(struct hns_roce_qp *hr_qp) +static bool check_dca_attach_enable(struct hns_roce_qp *hr_qp) { - return !!(hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA); + return hr_qp->en_flags & HNS_ROCE_QP_CAP_DYNAMIC_CTX_ATTACH; }
static int dca_attach_qp_buf(struct hns_roce_dev *hr_dev, @@ -408,6 +408,11 @@ static int dca_attach_qp_buf(struct hns_roce_dev *hr_dev, return hns_roce_dca_attach(hr_dev, hr_qp, &attr); }
+static bool check_dca_detach_enable(struct hns_roce_qp *hr_qp) +{ + return hr_qp->en_flags & HNS_ROCE_QP_CAP_DYNAMIC_CTX_DETACH; +} + static void dca_detach_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { @@ -454,7 +459,7 @@ static int check_send_valid(struct hns_roce_dev *hr_dev, return -EIO; }
- if (check_qp_dca_enable(hr_qp)) { + if (check_dca_attach_enable(hr_qp)) { ret = dca_attach_qp_buf(hr_dev, hr_qp); if (unlikely(ret)) { ibdev_err(&hr_dev->ib_dev, @@ -694,7 +699,7 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe, &curr_idx, valid_num_sge);
- if (qp->en_flags & HNS_ROCE_QP_CAP_DCA) + if (qp->en_flags & HNS_ROCE_QP_CAP_DYNAMIC_CTX_ATTACH) fill_dca_fields(qp, rc_sq_wqe);
/* @@ -883,7 +888,7 @@ static int check_recv_valid(struct hns_roce_dev *hr_dev, if (hr_qp->state == IB_QPS_RESET) return -EINVAL;
- if (check_qp_dca_enable(hr_qp)) { + if (check_dca_attach_enable(hr_qp)) { ret = dca_attach_qp_buf(hr_dev, hr_qp); if (unlikely(ret)) { ibdev_err(ibdev, @@ -4272,7 +4277,7 @@ static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries,
for (npolled = 0; npolled < num_entries; ++npolled) { ret = hns_roce_v2_poll_one(hr_cq, &cur_qp, wc + npolled); - if (cur_qp && check_qp_dca_enable(cur_qp)) + if (cur_qp && check_dca_detach_enable(cur_qp)) dca_detach_qp_buf(hr_dev, cur_qp); if (ret) break; @@ -4842,7 +4847,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_clear(qpc_mask, QPC_TRRL_BA_H);
if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { - if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) { + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DYNAMIC_CTX_ATTACH) { hr_reg_enable(context, QPC_DCA_MODE); hr_reg_clear(qpc_mask, QPC_DCA_MODE); } @@ -5620,9 +5625,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, if (new_state == IB_QPS_RESET && !ibqp->uobject) clear_qp(hr_qp);
- if (check_qp_dca_enable(hr_qp) && - (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) - hns_roce_dca_kick(hr_dev, hr_qp, udata); + if (check_dca_attach_enable(hr_qp)) + hns_roce_modify_dca(hr_dev, hr_qp, udata);
out: return ret; @@ -5831,12 +5835,6 @@ static bool hns_roce_v2_chk_dca_buf_inactive(struct hns_roce_dev *hr_dev, if (state == HNS_ROCE_QP_ST_ERR || state == HNS_ROCE_QP_ST_RST) return true;
- /* If RQ is not empty, the buffer is always active until the QP stops - * working. - */ - if (hr_qp->rq.wqe_cnt > 0) - return false; - if (hr_qp->sq.wqe_cnt > 0) { tmp = (u32)hr_reg_read(&context, QPC_RETRY_MSG_MSN); sq_idx = tmp & (hr_qp->sq.wqe_cnt - 1); diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index fe4ad13654cf..cdfcefb1f660 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -37,6 +37,7 @@ #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> #include <rdma/ib_cache.h> +#include <rdma/uverbs_ioctl.h>
#include "hnae3.h" #include "hns_roce_common.h" @@ -341,6 +342,7 @@ hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address, ucontext, &entry->rdma_entry, length, 0); break; case HNS_ROCE_MMAP_TYPE_DWQE: + case HNS_ROCE_MMAP_TYPE_DCA: ret = rdma_user_mmap_entry_insert_range( ucontext, &entry->rdma_entry, length, 1, U32_MAX); @@ -363,6 +365,9 @@ static void hns_roce_dealloc_uar_entry(struct hns_roce_ucontext *context) if (context->db_mmap_entry) rdma_user_mmap_entry_remove( &context->db_mmap_entry->rdma_entry); + if (context->dca_ctx.dca_mmap_entry) + rdma_user_mmap_entry_remove( + &context->dca_ctx.dca_mmap_entry->rdma_entry); }
static int hns_roce_alloc_uar_entry(struct ib_ucontext *uctx) @@ -382,12 +387,36 @@ static int hns_roce_alloc_uar_entry(struct ib_ucontext *uctx) static void ucontext_set_resp(struct ib_ucontext *uctx, struct hns_roce_ib_alloc_ucontext_resp *resp) { + struct hns_roce_ucontext *context = to_hr_ucontext(uctx); struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); + struct rdma_user_mmap_entry *rdma_entry;
resp->qp_tab_size = hr_dev->caps.num_qps; resp->srq_tab_size = hr_dev->caps.num_srqs; resp->cqe_size = hr_dev->caps.cqe_sz; resp->mac_type = hr_dev->mac_type; + if (context->dca_ctx.dca_mmap_entry) { + resp->dca_qps = context->dca_ctx.max_qps; + resp->dca_mmap_size = PAGE_SIZE * context->dca_ctx.status_npage; + rdma_entry = &context->dca_ctx.dca_mmap_entry->rdma_entry; + resp->dca_mmap_key = rdma_user_mmap_get_offset(rdma_entry); + } +} + +static u32 get_udca_max_qps(struct hns_roce_dev *hr_dev, + struct hns_roce_ib_alloc_ucontext *ucmd) +{ + u32 qp_num; + + if (ucmd->comp & HNS_ROCE_ALLOC_UCTX_COMP_DCA_MAX_QPS) { + qp_num = ucmd->dca_max_qps; + if (!qp_num) + qp_num = hr_dev->caps.num_qps; + } else { + qp_num = 0; + } + + return qp_num; }
static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, @@ -447,7 +476,8 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, mutex_init(&context->page_mutex); }
- hns_roce_register_udca(hr_dev, context); + hns_roce_register_udca(hr_dev, get_udca_max_qps(hr_dev, &ucmd), + context);
ucontext_set_resp(uctx, &resp); ret = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); @@ -492,6 +522,36 @@ static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) ida_free(&hr_dev->uar_ida.ida, (int)context->uar.logic_idx); }
+static int mmap_dca(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct hns_roce_ucontext *uctx = to_hr_ucontext(context); + struct hns_roce_dca_ctx *ctx = &uctx->dca_ctx; + struct page **pages; + unsigned long num; + int ret; + + if ((vma->vm_end - vma->vm_start != (ctx->status_npage * PAGE_SIZE) || + !(vma->vm_flags & VM_SHARED))) + return -EINVAL; + + if (!(vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_EXEC)) + return -EPERM; + + if (!ctx->buf_status) + return -EOPNOTSUPP; + + pages = kcalloc(ctx->status_npage, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (num = 0; num < ctx->status_npage; num++) + pages[num] = virt_to_page(ctx->buf_status + num * PAGE_SIZE); + + ret = vm_insert_pages(vma, vma->vm_start, pages, &num); + kfree(pages); + return ret; +} + static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma) { struct rdma_user_mmap_entry *rdma_entry; @@ -512,6 +572,9 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma) case HNS_ROCE_MMAP_TYPE_DWQE: prot = pgprot_device(vma->vm_page_prot); break; + case HNS_ROCE_MMAP_TYPE_DCA: + ret = mmap_dca(uctx, vma); + goto out; default: return -EINVAL; } @@ -519,6 +582,7 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma) ret = rdma_user_mmap_io(uctx, vma, pfn, rdma_entry->npages * PAGE_SIZE, prot, rdma_entry);
+out: rdma_user_mmap_entry_put(rdma_entry);
return ret; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 302f2ea75749..de70c8637333 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -788,7 +788,7 @@ static int alloc_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return ret; }
- hr_qp->en_flags |= HNS_ROCE_QP_CAP_DCA; + hr_qp->en_flags |= HNS_ROCE_QP_CAP_DYNAMIC_CTX_ATTACH; } else { /* * Because DCA and DWQE share the same fileds in RCWQE buffer, @@ -815,7 +815,7 @@ static void free_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, { hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr);
- if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DYNAMIC_CTX_ATTACH) hns_roce_disable_dca(hr_dev, hr_qp, udata); }
@@ -1408,22 +1408,17 @@ static int hns_roce_check_qp_attr(struct ib_qp *ibqp, struct ib_qp_attr *attr, return 0; }
-int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) +static int check_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_state cur_state, + enum ib_qp_state new_state) { - struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); - struct hns_roce_ib_modify_qp_resp resp = {}; - enum ib_qp_state cur_state, new_state; - int ret = -EINVAL; - - mutex_lock(&hr_qp->mutex); - - if (attr_mask & IB_QP_CUR_STATE && attr->cur_qp_state != hr_qp->state) - goto out; + int ret;
- cur_state = hr_qp->state; - new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + if (attr_mask & IB_QP_CUR_STATE && attr->cur_qp_state != hr_qp->state) { + ibdev_err(ibqp->device, "failed to check modify curr state\n"); + return -EINVAL; + }
if (ibqp->uobject && (attr_mask & IB_QP_STATE) && new_state == IB_QPS_ERR) { @@ -1433,19 +1428,42 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB) hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr); } else { - ibdev_warn(&hr_dev->ib_dev, + ibdev_warn(ibqp->device, "flush cqe is not supported in userspace!\n"); - goto out; + return -EINVAL; } }
if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { - ibdev_err(&hr_dev->ib_dev, "ib_modify_qp_is_ok failed\n"); - goto out; + ibdev_err(ibqp->device, "failed to check modify qp state\n"); + return -EINVAL; }
ret = hns_roce_check_qp_attr(ibqp, attr, attr_mask); + if (ret) { + ibdev_err(ibqp->device, "failed to check modify qp attr\n"); + return ret; + } + + return 0; +} + +int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_ib_modify_qp_resp resp = {}; + enum ib_qp_state cur_state, new_state; + int ret; + + mutex_lock(&hr_qp->mutex); + + cur_state = hr_qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + ret = check_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); if (ret) goto out;
@@ -1460,6 +1478,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (udata && udata->outlen) { resp.tc_mode = hr_qp->tc_mode; resp.priority = hr_qp->sl; + resp.dcan = hr_qp->dca_cfg.dcan; ret = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (ret) @@ -1530,7 +1549,7 @@ static inline void *dca_buf_offset(struct hns_roce_dca_cfg *dca_cfg, u32 offset)
static inline void *get_wqe(struct hns_roce_qp *hr_qp, u32 offset) { - if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DCA) + if (unlikely(hr_qp->dca_cfg.buf_list)) return dca_buf_offset(&hr_qp->dca_cfg, offset); else return hns_roce_buf_offset(hr_qp->mtr.kmem, offset); diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 1faa11b8060b..69508419d3a0 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -77,8 +77,9 @@ enum hns_roce_qp_cap_flags { HNS_ROCE_QP_CAP_RQ_RECORD_DB = 1 << 0, HNS_ROCE_QP_CAP_SQ_RECORD_DB = 1 << 1, HNS_ROCE_QP_CAP_OWNER_DB = 1 << 2, - HNS_ROCE_QP_CAP_DCA = 1 << 4, + HNS_ROCE_QP_CAP_DYNAMIC_CTX_ATTACH = 1 << 4, HNS_ROCE_QP_CAP_DIRECT_WQE = 1 << 5, + HNS_ROCE_QP_CAP_DYNAMIC_CTX_DETACH = 1 << 6, };
struct hns_roce_ib_create_qp_resp { @@ -96,6 +97,8 @@ struct hns_roce_ib_modify_qp_resp { __u8 tc_mode; __u8 priority; __u8 reserved[6]; + __u32 dcan; + __u32 rsv2; };
enum { @@ -121,10 +124,19 @@ struct hns_roce_ib_alloc_ucontext_resp { __u32 max_inline_data; __u8 mac_type; __u8 rsv1[7]; + __u32 dca_qps; + __u32 dca_mmap_size; + __aligned_u64 dca_mmap_key; +}; + +enum hns_roce_uctx_comp_mask { + HNS_ROCE_ALLOC_UCTX_COMP_DCA_MAX_QPS = 1 << 0, };
struct hns_roce_ib_alloc_ucontext { __u32 config; + __u32 comp; /* use hns_roce_uctx_comp_mask */ + __u32 dca_max_qps; __u32 reserved; };
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I63KVU
----------------------------------------------------------
read_poll_timeout() in the MBX path may sleep, and the probability of this becomes higher during reset. In other words, it is not safe to use MBX in an atomic context.

In order to ensure the atomicity of QPC setup, DCA uses locks to protect the QPC setup operation in the DCA ATTACH_MEM phase (i.e. post_send/post_recv). This triggers the above-mentioned problem during reset.

Replace read_poll_timeout() with read_poll_timeout_atomic() so that the MBX operation does not sleep in an atomic context.
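For context, a minimal sketch of the two iopoll helpers involved (both from <linux/iopoll.h>): read_poll_timeout() sleeps between polls, while read_poll_timeout_atomic() busy-waits with udelay() and is therefore safe under a spinlock. The register accessor and status bit below are placeholders, not part of the hns driver.

#include <linux/bits.h>
#include <linux/io.h>
#include <linux/iopoll.h>

#define MY_READY_BIT	BIT(0)	/* placeholder status bit */

static u32 my_read_status(void __iomem *base)
{
	return readl(base);
}

/* Sleeps (usleep_range) between polls: must not run in atomic context */
static int wait_ready_may_sleep(void __iomem *base)
{
	u32 val;

	return read_poll_timeout(my_read_status, val, val & MY_READY_BIT,
				 1000, 1000000, false, base);
}

/* Busy-waits with udelay() between polls: safe to call under a spinlock */
static int wait_ready_atomic(void __iomem *base)
{
	u32 val;

	return read_poll_timeout_atomic(my_read_status, val,
					val & MY_READY_BIT,
					1, 1000000, false, base);
}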
Fixes: 306b8c76257b ("RDMA/hns: Do not destroy QP resources in the hw resetting phase") Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 728f860ffc99..5c58fb2070c4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1150,7 +1150,7 @@ static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev, unsigned long reset_stage) { #define HW_RESET_TIMEOUT_US 1000000 -#define HW_RESET_SLEEP_US 1000 +#define HW_RESET_DELAY_US 1
struct hns_roce_v2_priv *priv = hr_dev->priv; struct hnae3_handle *handle = priv->handle; @@ -1169,8 +1169,8 @@ static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev, */ hr_dev->dis_db = true;
- ret = read_poll_timeout(ops->ae_dev_reset_cnt, val, - val > hr_dev->reset_cnt, HW_RESET_SLEEP_US, + ret = read_poll_timeout_atomic(ops->ae_dev_reset_cnt, val, + val > hr_dev->reset_cnt, HW_RESET_DELAY_US, HW_RESET_TIMEOUT_US, false, handle); if (!ret) hr_dev->is_reset = true;
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit b22a06ea6ff96075d4a443fb4f318f41a9823e08.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 9e702788e90f..11c452b19fad 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3534,7 +3534,7 @@ static int arm_smmu_switch_dirty_log(struct iommu_domain *domain, bool enable,
if (!(smmu->features & ARM_SMMU_FEAT_HD)) return -ENODEV; - if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) return -EINVAL;
if (enable) { @@ -3575,7 +3575,7 @@ static int arm_smmu_sync_dirty_log(struct iommu_domain *domain,
if (!(smmu->features & ARM_SMMU_FEAT_HD)) return -ENODEV; - if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) return -EINVAL;
if (!ops || !ops->sync_dirty_log) { @@ -3604,7 +3604,7 @@ static int arm_smmu_clear_dirty_log(struct iommu_domain *domain,
if (!(smmu->features & ARM_SMMU_FEAT_HD)) return -ENODEV; - if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) return -EINVAL;
if (!ops || !ops->clear_dirty_log) {
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 22f7a4bf1186b3f50b6716b714927e602fa32392.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ee73b1b2e200..2ff6f3ba9f39 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -557,12 +557,8 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) return 0;
ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nested); - if (ret || !nested) { - if (ret) - pr_warn("%s: Get DOMAIN_ATTR_NESTING failed: %d.\n", - __func__, ret); - return 0; - } + if (ret || !nested) + return ret;
mutex_init(&vdev->fault_queue_lock);
@@ -651,12 +647,8 @@ static int vfio_pci_dma_fault_response_init(struct vfio_pci_device *vdev) return 0;
ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nested); - if (ret || !nested) { - if (ret) - pr_warn("%s: Get DOMAIN_ATTR_NESTING failed: %d.\n", - __func__, ret); - return 0; - } + if (ret || !nested) + return ret;
mutex_init(&vdev->fault_response_queue_lock);
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 3afa66c6a1ca51433487ba116455af878ac17227.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d993036c94c2..95320164dcf3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1221,7 +1221,6 @@ iommu_sva_bind_group(struct iommu_group *group, struct mm_struct *mm, return NULL; }
-static inline int iommu_bind_guest_msi(struct iommu_domain *domain, dma_addr_t giova, phys_addr_t gpa, size_t size) {
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit c046c2a2c57587243b1fc53e65061f3e842848e3.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 11c452b19fad..90e6ea871183 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3832,6 +3832,7 @@ arm_smmu_cache_invalidate(struct iommu_domain *domain, struct device *dev, !(granule_size & smmu_domain->domain.pgsize_bitmap)) { tg = __ffs(smmu_domain->domain.pgsize_bitmap); granule_size = 1 << tg; + size = size >> tg; }
arm_smmu_tlb_inv_range_domain(info->addr, size,
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit f7cdf6923af762bfec3d8d7e919cca4de79de73a.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 ----- 1 file changed, 5 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 90e6ea871183..87179d536652 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2297,11 +2297,6 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, cmd->tlbi.tg = (tg - 10) / 2;
/* Determine what level the granule is at */ - if (!(granule & smmu_domain->domain.pgsize_bitmap) || - (granule & (granule - 1))) { - granule = leaf_pgsize; - iova = ALIGN_DOWN(iova, leaf_pgsize); - } cmd->tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
/* Align size with the leaf page size upwards */
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 8f1d8ede3a5b0aa8d73b3932332a2ca39d9a2d2b.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 ----- 1 file changed, 5 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 87179d536652..23208e1cf2d4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2288,10 +2288,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { /* Get the leaf page size */ - size_t leaf_pgsize; - tg = __ffs(smmu_domain->domain.pgsize_bitmap); - leaf_pgsize = 1 << tg;
/* Convert page size of 12,14,16 (log2) to 1,2,3 */ cmd->tlbi.tg = (tg - 10) / 2; @@ -2299,8 +2296,6 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, /* Determine what level the granule is at */ cmd->tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
- /* Align size with the leaf page size upwards */ - size = ALIGN(size, leaf_pgsize); num_pages = size >> tg; }
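Two encodings appear in this hunk: the TG field maps the leaf page-size exponent 12/14/16 to 1/2/3 via (tg - 10) / 2, and the TTL field derives the level of the leaf entry from the granule via 4 - ((ilog2(granule) - 3) / (tg - 3)). The short C program below is only a worked example of that arithmetic for a 4 KiB leaf size, with __builtin_ctzll() standing in for ilog2() on these power-of-two inputs.

#include <stdio.h>
#include <stdint.h>

static unsigned int log2_pow2(uint64_t x)
{
    return __builtin_ctzll(x);   /* ilog2() for power-of-two inputs */
}

int main(void)
{
    unsigned int tg = 12;        /* 4 KiB leaf pages: log2(4096) */
    uint64_t granules[] = { 1ULL << 12, 1ULL << 21, 1ULL << 30 }; /* 4K page, 2M and 1G blocks */

    printf("TG field = %u\n", (tg - 10) / 2);   /* 12,14,16 -> 1,2,3 */

    for (int i = 0; i < 3; i++) {
        unsigned int ttl = 4 - ((log2_pow2(granules[i]) - 3) / (tg - 3));
        printf("granule %llu -> TTL %u\n",
               (unsigned long long)granules[i], ttl);
    }
    return 0;
}

With a 4 KiB leaf size this prints TTL 3 for a 4K page, 2 for a 2M block and 1 for a 1G block, matching the levels those block sizes occupy.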
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 9ed8587a5c4bc58f7136343c4c7930eb35187ea0.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 23208e1cf2d4..e261d7c01adc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -338,7 +338,6 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) case CMDQ_OP_TLBI_NH_ASID: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); fallthrough; - case CMDQ_OP_TLBI_NH_ALL: case CMDQ_OP_TLBI_S12_VMALL: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); break; @@ -3759,7 +3758,7 @@ static int arm_smmu_cache_invalidate(struct iommu_domain *domain, struct device *dev, struct iommu_cache_invalidate_info *inv_info) { - struct arm_smmu_cmdq_ent cmd = {.opcode = CMDQ_OP_TLBI_NH_ALL}; + struct arm_smmu_cmdq_ent cmd = {.opcode = CMDQ_OP_TLBI_NSNH_ALL}; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1dd49bed58df..9abce4732456 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -509,7 +509,6 @@ struct arm_smmu_cmdq_ent { }; } cfgi;
- #define CMDQ_OP_TLBI_NH_ALL 0x10 #define CMDQ_OP_TLBI_NH_ASID 0x11 #define CMDQ_OP_TLBI_NH_VA 0x12 #define CMDQ_OP_TLBI_EL2_ALL 0x20
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 21d56f9c91a0a1b55cc7e5933974d6afcef7b001.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/io-pgtable-arm.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 0969224aff7b..34f6366dcc6c 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -980,6 +980,10 @@ static int arm_lpae_sync_dirty_log(struct io_pgtable_ops *ops, if (WARN_ON(iaext)) return -EINVAL;
+ if (data->iop.fmt != ARM_64_LPAE_S1 && + data->iop.fmt != ARM_32_LPAE_S1) + return -EINVAL; + return __arm_lpae_sync_dirty_log(data, iova, size, lvl, ptep, bitmap, base_iova, bitmap_pgshift); } @@ -1072,6 +1076,10 @@ static int arm_lpae_clear_dirty_log(struct io_pgtable_ops *ops, if (WARN_ON(iaext)) return -EINVAL;
+ if (data->iop.fmt != ARM_64_LPAE_S1 && + data->iop.fmt != ARM_32_LPAE_S1) + return -EINVAL; + return __arm_lpae_clear_dirty_log(data, iova, size, lvl, ptep, bitmap, base_iova, bitmap_pgshift); }
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 524f1339c8a5eba5cc59161679f643f01edd89ec.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/io-pgtable-arm.c | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 34f6366dcc6c..2740a35fe714 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -159,23 +159,6 @@ static inline bool iopte_leaf(arm_lpae_iopte pte, int lvl, return iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_BLOCK; }
-static inline bool arm_lpae_pte_writable(struct arm_lpae_io_pgtable *data, - arm_lpae_iopte pte, int lvl) -{ - if (iopte_leaf(pte, lvl, data->iop.fmt)) { - if (data->iop.fmt == ARM_64_LPAE_S1 || - data->iop.fmt == ARM_32_LPAE_S1) { - if (!(pte & ARM_LPAE_PTE_AP_RDONLY)) - return true; - } else { - if (pte & ARM_LPAE_PTE_HAP_WRITE) - return true; - } - } - - return false; -} - static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr, struct arm_lpae_io_pgtable *data) { @@ -769,7 +752,7 @@ static size_t __arm_lpae_split_block(struct arm_lpae_io_pgtable *data, if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { if (iopte_leaf(pte, lvl, iop->fmt)) { if (lvl == (ARM_LPAE_MAX_LEVELS - 1) || - !arm_lpae_pte_writable(data, pte, lvl)) + (pte & ARM_LPAE_PTE_AP_RDONLY)) return size;
/* We find a writable block, split it. */ @@ -923,7 +906,7 @@ static int __arm_lpae_sync_dirty_log(struct arm_lpae_io_pgtable *data,
if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { if (iopte_leaf(pte, lvl, iop->fmt)) { - if (!arm_lpae_pte_writable(data, pte, lvl)) + if (pte & ARM_LPAE_PTE_AP_RDONLY) return 0;
/* It is writable, set the bitmap */ @@ -944,7 +927,7 @@ static int __arm_lpae_sync_dirty_log(struct arm_lpae_io_pgtable *data, } return 0; } else if (iopte_leaf(pte, lvl, iop->fmt)) { - if (!arm_lpae_pte_writable(data, pte, lvl)) + if (pte & ARM_LPAE_PTE_AP_RDONLY) return 0;
/* Though the size is too small, also set bitmap */ @@ -1011,7 +994,7 @@ static int __arm_lpae_clear_dirty_log(struct arm_lpae_io_pgtable *data,
if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { if (iopte_leaf(pte, lvl, iop->fmt)) { - if (!arm_lpae_pte_writable(data, pte, lvl)) + if (pte & ARM_LPAE_PTE_AP_RDONLY) return 0;
/* Ensure all corresponding bits are set */ @@ -1023,11 +1006,7 @@ static int __arm_lpae_clear_dirty_log(struct arm_lpae_io_pgtable *data, }
/* Race does not exist */ - if ((data->iop.fmt == ARM_64_LPAE_S1) || - (data->iop.fmt == ARM_32_LPAE_S1)) - pte |= ARM_LPAE_PTE_AP_RDONLY; - else - pte &= ~ARM_LPAE_PTE_HAP_WRITE; + pte |= ARM_LPAE_PTE_AP_RDONLY; __arm_lpae_set_pte(ptep, pte, &iop->cfg); return 0; } @@ -1044,7 +1023,7 @@ static int __arm_lpae_clear_dirty_log(struct arm_lpae_io_pgtable *data, return 0; } else if (iopte_leaf(pte, lvl, iop->fmt)) { /* Though the size is too small, it is already clean */ - if (!arm_lpae_pte_writable(data, pte, lvl)) + if (pte & ARM_LPAE_PTE_AP_RDONLY) return 0;
return -EINVAL;
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 97e11307edcc8734359ec7bdcdbffc37633ae716.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 ----- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 -- drivers/iommu/io-pgtable-arm.c | 6 +----- 3 files changed, 1 insertion(+), 12 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e261d7c01adc..457ebc1b90d2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1610,11 +1610,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R);
- if (smmu->features & ARM_SMMU_FEAT_HA) - dst[2] |= cpu_to_le64(STRTAB_STE_2_S2HA); - if (smmu->features & ARM_SMMU_FEAT_HD) - dst[2] |= cpu_to_le64(STRTAB_STE_2_S2HD); - dst[3] = cpu_to_le64(vttbr);
val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 9abce4732456..d0f3181a22c5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -294,8 +294,6 @@ #define STRTAB_STE_2_S2AA64 (1UL << 51) #define STRTAB_STE_2_S2ENDI (1UL << 52) #define STRTAB_STE_2_S2PTW (1UL << 54) -#define STRTAB_STE_2_S2HD (1UL << 55) -#define STRTAB_STE_2_S2HA (1UL << 56) #define STRTAB_STE_2_S2R (1UL << 58)
#define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 2740a35fe714..3fc6ae00dc96 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -401,12 +401,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, pte = ARM_LPAE_PTE_HAP_FAULT; if (prot & IOMMU_READ) pte |= ARM_LPAE_PTE_HAP_READ; - if (prot & IOMMU_WRITE) { + if (prot & IOMMU_WRITE) pte |= ARM_LPAE_PTE_HAP_WRITE; - if (data->iop.fmt == ARM_64_LPAE_S2 && - cfg->quirks & IO_PGTABLE_QUIRK_ARM_HD) - pte |= ARM_LPAE_PTE_DBM; - } }
/*
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 9b4742a6dd67e4a9309c325376682bde5da60fdf.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 40 ----------------------------- drivers/vfio/pci/vfio_pci_private.h | 7 ----- drivers/vfio/pci/vfio_pci_rdwr.c | 1 - 3 files changed, 48 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2ff6f3ba9f39..36bdcc0a4fc9 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -607,32 +607,6 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) return ret; }
-static void dma_response_inject(struct work_struct *work) -{ - struct vfio_pci_dma_fault_response_work *rwork = - container_of(work, struct vfio_pci_dma_fault_response_work, inject); - struct vfio_region_dma_fault_response *header = rwork->header; - struct vfio_pci_device *vdev = rwork->vdev; - struct iommu_page_response *resp; - u32 tail, head, size; - - mutex_lock(&vdev->fault_response_queue_lock); - - tail = header->tail; - head = header->head; - size = header->nb_entries; - - while (CIRC_CNT(head, tail, size) >= 1) { - resp = (struct iommu_page_response *)(vdev->fault_response_pages + header->offset + - tail * header->entry_size); - - /* TODO: properly handle the return value */ - iommu_page_response(&vdev->pdev->dev, resp); - header->tail = tail = (tail + 1) % size; - } - mutex_unlock(&vdev->fault_response_queue_lock); -} - #define DMA_FAULT_RESPONSE_RING_LENGTH 512
static int vfio_pci_dma_fault_response_init(struct vfio_pci_device *vdev) @@ -678,22 +652,8 @@ static int vfio_pci_dma_fault_response_init(struct vfio_pci_device *vdev) header->nb_entries = DMA_FAULT_RESPONSE_RING_LENGTH; header->offset = PAGE_SIZE;
- vdev->response_work = kzalloc(sizeof(*vdev->response_work), GFP_KERNEL); - if (!vdev->response_work) - goto out; - vdev->response_work->header = header; - vdev->response_work->vdev = vdev; - - /* launch the thread that will extract the response */ - INIT_WORK(&vdev->response_work->inject, dma_response_inject); - vdev->dma_fault_response_wq = - create_singlethread_workqueue("vfio-dma-fault-response"); - if (!vdev->dma_fault_response_wq) - return -ENOMEM; - return 0; out: - kfree(vdev->fault_response_pages); vdev->fault_response_pages = NULL; return ret; } diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 318328602874..70abd68a2ed9 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -52,12 +52,6 @@ struct vfio_pci_irq_ctx { struct irq_bypass_producer producer; };
-struct vfio_pci_dma_fault_response_work { - struct work_struct inject; - struct vfio_region_dma_fault_response *header; - struct vfio_pci_device *vdev; -}; - struct vfio_pci_device; struct vfio_pci_region;
@@ -159,7 +153,6 @@ struct vfio_pci_device { u8 *fault_pages; u8 *fault_response_pages; struct workqueue_struct *dma_fault_response_wq; - struct vfio_pci_dma_fault_response_work *response_work; struct mutex fault_queue_lock; struct mutex fault_response_queue_lock; struct list_head dummy_resources_list; diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 43c11b5f5486..04828d0b752f 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -440,7 +440,6 @@ size_t vfio_pci_dma_fault_response_rw(struct vfio_pci_device *vdev, char __user mutex_lock(&vdev->fault_response_queue_lock); header->head = new_head; mutex_unlock(&vdev->fault_response_queue_lock); - queue_work(vdev->dma_fault_response_wq, &vdev->response_work->inject); } else { if (copy_to_user(buf, base + pos, count)) return -EFAULT;
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit cbbf4b3a64870f66d1c43b3900225adcf2d3fb48.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 125 ++-------------------------- drivers/vfio/pci/vfio_pci_private.h | 6 -- drivers/vfio/pci/vfio_pci_rdwr.c | 39 --------- include/uapi/linux/vfio.h | 32 ------- 4 files changed, 9 insertions(+), 193 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 36bdcc0a4fc9..352abea42649 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -371,20 +371,9 @@ static void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev, kfree(vdev->fault_pages); }
-static void -vfio_pci_dma_fault_response_release(struct vfio_pci_device *vdev, - struct vfio_pci_region *region) -{ - if (vdev->dma_fault_response_wq) - destroy_workqueue(vdev->dma_fault_response_wq); - kfree(vdev->fault_response_pages); - vdev->fault_response_pages = NULL; -} - -static int __vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, - struct vm_area_struct *vma, - u8 *pages) +static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma) { u64 phys_len, req_len, pgoff, req_start; unsigned long long addr; @@ -397,14 +386,14 @@ static int __vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev, ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); req_start = pgoff << PAGE_SHIFT;
- /* only the second page of the fault region is mmappable */ + /* only the second page of the producer fault region is mmappable */ if (req_start < PAGE_SIZE) return -EINVAL;
if (req_start + req_len > phys_len) return -EINVAL;
- addr = virt_to_phys(pages); + addr = virt_to_phys(vdev->fault_pages); vma->vm_private_data = vdev; vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff;
@@ -413,29 +402,13 @@ static int __vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev, return ret; }
-static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, - struct vm_area_struct *vma) -{ - return __vfio_pci_dma_fault_mmap(vdev, region, vma, vdev->fault_pages); -} - -static int -vfio_pci_dma_fault_response_mmap(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, - struct vm_area_struct *vma) -{ - return __vfio_pci_dma_fault_mmap(vdev, region, vma, vdev->fault_response_pages); -} - -static int __vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, - struct vfio_info_cap *caps, - u32 cap_id) +static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps) { struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct vfio_region_info_cap_fault cap = { - .header.id = cap_id, + .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT, .header.version = 1, .version = 1, }; @@ -463,23 +436,6 @@ static int __vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev, return ret; }
-static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, - struct vfio_info_cap *caps) -{ - return __vfio_pci_dma_fault_add_capability(vdev, region, caps, - VFIO_REGION_INFO_CAP_DMA_FAULT); -} - -static int -vfio_pci_dma_fault_response_add_capability(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, - struct vfio_info_cap *caps) -{ - return __vfio_pci_dma_fault_add_capability(vdev, region, caps, - VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE); -} - static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { .rw = vfio_pci_dma_fault_rw, .release = vfio_pci_dma_fault_release, @@ -487,13 +443,6 @@ static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { .add_capability = vfio_pci_dma_fault_add_capability, };
-static const struct vfio_pci_regops vfio_pci_dma_fault_response_regops = { - .rw = vfio_pci_dma_fault_response_rw, - .release = vfio_pci_dma_fault_response_release, - .mmap = vfio_pci_dma_fault_response_mmap, - .add_capability = vfio_pci_dma_fault_response_add_capability, -}; - static int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) { @@ -607,57 +556,6 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) return ret; }
-#define DMA_FAULT_RESPONSE_RING_LENGTH 512 - -static int vfio_pci_dma_fault_response_init(struct vfio_pci_device *vdev) -{ - struct vfio_region_dma_fault_response *header; - struct iommu_domain *domain; - int nested, ret; - size_t size; - - domain = iommu_get_domain_for_dev(&vdev->pdev->dev); - if (!domain) - return 0; - - ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nested); - if (ret || !nested) - return ret; - - mutex_init(&vdev->fault_response_queue_lock); - - /* - * We provision 1 page for the header and space for - * DMA_FAULT_RING_LENGTH fault records in the ring buffer. - */ - size = ALIGN(sizeof(struct iommu_page_response) * - DMA_FAULT_RESPONSE_RING_LENGTH, PAGE_SIZE) + PAGE_SIZE; - - vdev->fault_response_pages = kzalloc(size, GFP_KERNEL); - if (!vdev->fault_response_pages) - return -ENOMEM; - - ret = vfio_pci_register_dev_region(vdev, - VFIO_REGION_TYPE_NESTED, - VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE, - &vfio_pci_dma_fault_response_regops, size, - VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP, - vdev->fault_response_pages); - if (ret) - goto out; - - header = (struct vfio_region_dma_fault_response *)vdev->fault_response_pages; - header->entry_size = sizeof(struct iommu_page_response); - header->nb_entries = DMA_FAULT_RESPONSE_RING_LENGTH; - header->offset = PAGE_SIZE; - - return 0; -out: - vdev->fault_response_pages = NULL; - return ret; -} - static int vfio_pci_enable(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -760,10 +658,6 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) if (ret) goto disable_exit;
- ret = vfio_pci_dma_fault_response_init(vdev); - if (ret) - goto disable_exit; - vfio_pci_probe_mmaps(vdev);
return 0; @@ -2507,7 +2401,6 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&vdev->ioeventfds_list); mutex_init(&vdev->vma_lock); INIT_LIST_HEAD(&vdev->vma_list); - INIT_LIST_HEAD(&vdev->dummy_resources_list); init_rwsem(&vdev->memory_lock);
ret = vfio_pci_reflck_attach(vdev); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 70abd68a2ed9..a578723a34a5 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -151,10 +151,7 @@ struct vfio_pci_device { struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; u8 *fault_pages; - u8 *fault_response_pages; - struct workqueue_struct *dma_fault_response_wq; struct mutex fault_queue_lock; - struct mutex fault_response_queue_lock; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; @@ -201,9 +198,6 @@ extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, extern size_t vfio_pci_dma_fault_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); -extern size_t vfio_pci_dma_fault_response_rw(struct vfio_pci_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite);
extern int vfio_pci_init_perm_bits(void); extern void vfio_pci_uninit_perm_bits(void); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 04828d0b752f..7f4d377ac9be 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -410,45 +410,6 @@ size_t vfio_pci_dma_fault_rw(struct vfio_pci_device *vdev, char __user *buf, return ret; }
-size_t vfio_pci_dma_fault_response_rw(struct vfio_pci_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) -{ - unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - void *base = vdev->region[i].data; - int ret = -EFAULT; - - if (pos >= vdev->region[i].size) - return -EINVAL; - - count = min(count, (size_t)(vdev->region[i].size - pos)); - - if (iswrite) { - struct vfio_region_dma_fault_response *header = - (struct vfio_region_dma_fault_response *)base; - uint32_t new_head; - - if (pos != 0 || count != 4) - return -EINVAL; - - if (copy_from_user((void *)&new_head, buf, count)) - return -EFAULT; - - if (new_head >= header->nb_entries) - return -EINVAL; - - mutex_lock(&vdev->fault_response_queue_lock); - header->head = new_head; - mutex_unlock(&vdev->fault_response_queue_lock); - } else { - if (copy_to_user(buf, base + pos, count)) - return -EFAULT; - } - *ppos += count; - ret = count; - return ret; -} - static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, bool test_mem) { diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 9ae6c31796ed..6574032973a3 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -356,7 +356,6 @@ struct vfio_region_info_cap_type {
/* sub-types for VFIO_REGION_TYPE_NESTED */ #define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT (1) -#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE (2)
/** * struct vfio_region_gfx_edid - EDID region layout. @@ -1034,17 +1033,6 @@ struct vfio_region_info_cap_fault { __u32 version; };
-/* - * Capability exposed by the DMA fault response region - * @version: ABI version - */ -#define VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE 7 - -struct vfio_region_info_cap_fault_response { - struct vfio_info_cap_header header; - __u32 version; -}; - /* * DMA Fault Region Layout * @tail: index relative to the start of the ring buffer at which the @@ -1065,26 +1053,6 @@ struct vfio_region_dma_fault { __u32 head; };
-/* - * DMA Fault Response Region Layout - * @head: index relative to the start of the ring buffer at which the - * producer (userspace) insert responses into the buffer - * @entry_size: fault ring buffer entry size in bytes - * @nb_entries: max capacity of the fault ring buffer - * @offset: ring buffer offset relative to the start of the region - * @tail: index relative to the start of the ring buffer at which the - * consumer (kernel) finds the next item in the buffer - */ -struct vfio_region_dma_fault_response { - /* Write-Only */ - __u32 head; - /* Read-Only */ - __u32 entry_size; - __u32 nb_entries; - __u32 offset; - __u32 tail; -}; - /* -------- API for Type1 VFIO IOMMU -------- */
/**
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit aa2addedeae2756de0265c56c4e8d96aac737a23.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/driver-api/vfio.rst | 77 ------------------------------- 1 file changed, 77 deletions(-)
diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index b57a96d20d3b..d3a02300913a 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -239,83 +239,6 @@ group and can access them as follows:: /* Gratuitous device reset and go... */ ioctl(device, VFIO_DEVICE_RESET);
-IOMMU Dual Stage Control ------------------------- - -Some IOMMUs support 2 stages/levels of translation. "Stage" corresponds to -the ARM terminology while "level" corresponds to Intel's VTD terminology. In -the following text we use either without distinction. - -This is useful when the guest is exposed with a virtual IOMMU and some -devices are assigned to the guest through VFIO. Then the guest OS can use -stage 1 (IOVA -> GPA), while the hypervisor uses stage 2 for VM isolation -(GPA -> HPA). - -The guest gets ownership of the stage 1 page tables and also owns stage 1 -configuration structures. The hypervisor owns the root configuration structure -(for security reason), including stage 2 configuration. This works as long -configuration structures and page table format are compatible between the -virtual IOMMU and the physical IOMMU. - -Assuming the HW supports it, this nested mode is selected by choosing the -VFIO_TYPE1_NESTING_IOMMU type through: - -ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU); - -This forces the hypervisor to use the stage 2, leaving stage 1 available for -guest usage. - -Once groups are attached to the container, the guest stage 1 translation -configuration data can be passed to VFIO by using - -ioctl(container, VFIO_IOMMU_SET_PASID_TABLE, &pasid_table_info); - -This allows to combine the guest stage 1 configuration structure along with -the hypervisor stage 2 configuration structure. Stage 1 configuration -structures are dependent on the IOMMU type. - -As the stage 1 translation is fully delegated to the HW, translation faults -encountered during the translation process need to be propagated up to -the virtualizer and re-injected into the guest. - -The userspace must be prepared to receive faults. The VFIO-PCI device -exposes one dedicated DMA FAULT region: it contains a ring buffer and -its header that allows to manage the head/tail indices. The region is -identified by the following index/subindex: -- VFIO_REGION_TYPE_NESTED/VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT - -The DMA FAULT region exposes a VFIO_REGION_INFO_CAP_DMA_FAULT -region capability that allows the userspace to retrieve the ABI version -of the fault records filled by the host. - -On top of that region, the userspace can be notified whenever a fault -occurs at the physical level. It can use the VFIO_IRQ_TYPE_NESTED/ -VFIO_IRQ_SUBTYPE_DMA_FAULT specific IRQ to attach the eventfd to be -signalled. - -The ring buffer containing the fault records can be mmapped. When -the userspace consumes a fault in the queue, it should increment -the consumer index to allow new fault records to replace the used ones. - -The queue size and the entry size can be retrieved in the header. -The tail index should never overshoot the producer index as in any -other circular buffer scheme. Also it must be less than the queue size -otherwise the change fails. - -When the guest invalidates stage 1 related caches, invalidations must be -forwarded to the host through -ioctl(container, VFIO_IOMMU_CACHE_INVALIDATE, &inv_data); -Those invalidations can happen at various granularity levels, page, context, ... - -The ARM SMMU specification introduces another challenge: MSIs are translated by -both the virtual SMMU and the physical SMMU. To build a nested mapping for the -IOVA programmed into the assigned device, the guest needs to pass its IOVA/MSI -doorbell GPA binding to the host. Then the hypervisor can build a nested stage 2 -binding eventually translating into the physical MSI doorbell. 
- -This is achieved by calling -ioctl(container, VFIO_IOMMU_SET_MSI_BINDING, &guest_binding); - VFIO User API -------------------------------------------------------------------------------
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit b6f29e4d0dc417e7eec27d84a7913b80f1b760e1.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 352abea42649..514b004c2cc6 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -451,7 +451,6 @@ vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) (struct vfio_region_dma_fault *)vdev->fault_pages; struct iommu_fault *new; u32 head, tail, size; - int ext_irq_index; int ret = -EINVAL;
if (WARN_ON(!reg)) @@ -476,19 +475,7 @@ vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) ret = 0; unlock: mutex_unlock(&vdev->fault_queue_lock); - if (ret) - return ret; - - ext_irq_index = vfio_pci_get_ext_irq_index(vdev, VFIO_IRQ_TYPE_NESTED, - VFIO_IRQ_SUBTYPE_DMA_FAULT); - if (ext_irq_index < 0) - return -EINVAL; - - mutex_lock(&vdev->igate); - if (vdev->ext_irqs[ext_irq_index].trigger) - eventfd_signal(vdev->ext_irqs[ext_irq_index].trigger, 1); - mutex_unlock(&vdev->igate); - return 0; + return ret; }
#define DMA_FAULT_RING_LENGTH 512 @@ -543,12 +530,6 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) if (ret) /* the dma fault region is freed in vfio_pci_disable() */ goto out;
- ret = vfio_pci_register_irq(vdev, VFIO_IRQ_TYPE_NESTED, - VFIO_IRQ_SUBTYPE_DMA_FAULT, - VFIO_IRQ_INFO_EVENTFD); - if (ret) /* the fault handler is also freed in vfio_pci_disable() */ - goto out; - return 0; out: kfree(vdev->fault_pages);
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 31ed6dc2484b533447c26163dcdecdfd93063b25.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/uapi/linux/vfio.h | 3 --- 1 file changed, 3 deletions(-)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 6574032973a3..fa3ac73c47be 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -733,9 +733,6 @@ struct vfio_irq_info_cap_type { __u32 subtype; /* type specific */ };
-#define VFIO_IRQ_TYPE_NESTED (1) -#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1) - /** * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) *
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit e3489f77845cdf900002271db11bd7bdc10c7696.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 98 +++++------------------------ drivers/vfio/pci/vfio_pci_intrs.c | 62 ------------------ drivers/vfio/pci/vfio_pci_private.h | 14 ----- 3 files changed, 17 insertions(+), 157 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 514b004c2cc6..9a3d0a54ee08 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -665,14 +665,6 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) ret = iommu_unregister_device_fault_handler(&vdev->pdev->dev); WARN_ON(ret == -EBUSY);
- for (i = 0; i < vdev->num_ext_irqs; i++) - vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | - VFIO_IRQ_SET_ACTION_TRIGGER, - VFIO_PCI_NUM_IRQS + i, 0, 0, NULL); - vdev->num_ext_irqs = 0; - kfree(vdev->ext_irqs); - vdev->ext_irqs = NULL; - /* Device closed, don't need mutex here */ list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, &vdev->ioeventfds_list, next) { @@ -890,9 +882,6 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return 1; } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { return 1; - } else if (irq_type >= VFIO_PCI_NUM_IRQS && - irq_type < VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs) { - return 1; }
return 0; @@ -1077,10 +1066,9 @@ long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET;
- info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions + - vdev->num_vendor_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs + - vdev->num_vendor_irqs; + info.num_regions = VFIO_PCI_NUM_REGIONS + + vdev->num_vendor_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_vendor_irqs;
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) { int ret = vfio_pci_info_zdev_add_caps(vdev, &caps); @@ -1259,87 +1247,36 @@ long vfio_pci_ioctl(void *device_data,
} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz;
minsz = offsetofend(struct vfio_irq_info, count);
- /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_irq_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT;
- if (info.argsz < minsz || - info.index >= VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs) + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) return -EINVAL;
- if (info.argsz >= capsz) - minsz = capsz; - - info.flags = VFIO_IRQ_INFO_EVENTFD; - switch (info.index) { - case VFIO_PCI_INTX_IRQ_INDEX: - info.flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); - break; - case VFIO_PCI_MSI_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: case VFIO_PCI_REQ_IRQ_INDEX: - info.flags |= VFIO_IRQ_INFO_NORESIZE; break; case VFIO_PCI_ERR_IRQ_INDEX: - info.flags |= VFIO_IRQ_INFO_NORESIZE; - if (!pci_is_pcie(vdev->pdev)) - return -EINVAL; - break; + if (pci_is_pcie(vdev->pdev)) + break; + fallthrough; default: - { - struct vfio_irq_info_cap_type cap_type = { - .header.id = VFIO_IRQ_INFO_CAP_TYPE, - .header.version = 1 }; - int ret, i; - - if (info.index >= VFIO_PCI_NUM_IRQS + - vdev->num_ext_irqs) - return -EINVAL; - info.index = array_index_nospec(info.index, - VFIO_PCI_NUM_IRQS + - vdev->num_ext_irqs); - i = info.index - VFIO_PCI_NUM_IRQS; - - info.flags = vdev->ext_irqs[i].flags; - cap_type.type = vdev->ext_irqs[i].type; - cap_type.subtype = vdev->ext_irqs[i].subtype; - - ret = vfio_info_add_capability(&caps, - &cap_type.header, - sizeof(cap_type)); - if (ret) - return ret; - } + return -EINVAL; }
- info.count = vfio_pci_get_irq_count(vdev, info.index); + info.flags = VFIO_IRQ_INFO_EVENTFD;
- if (caps.size) { - info.flags |= VFIO_IRQ_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } + info.count = vfio_pci_get_irq_count(vdev, info.index);
- kfree(caps.buf); - } + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) + info.flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + else + info.flags |= VFIO_IRQ_INFO_NORESIZE;
return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; @@ -1358,8 +1295,7 @@ long vfio_pci_ioctl(void *device_data, max = vfio_pci_get_irq_count(vdev, hdr.index);
ret = vfio_set_irqs_validate_and_prepare(&hdr, max, - VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs, - &data_size); + VFIO_PCI_NUM_IRQS, &data_size); if (ret) return ret;
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index d67995fe872f..869dce5f134d 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -19,7 +19,6 @@ #include <linux/vfio.h> #include <linux/wait.h> #include <linux/slab.h> -#include <linux/nospec.h>
#include "vfio_pci_private.h"
@@ -636,24 +635,6 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, count, flags, data); }
-static int vfio_pci_set_ext_irq_trigger(struct vfio_pci_device *vdev, - unsigned int index, unsigned int start, - unsigned int count, uint32_t flags, - void *data) -{ - int i; - - if (start != 0 || count > 1 || !vdev->num_ext_irqs) - return -EINVAL; - - index = array_index_nospec(index, - VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs); - i = index - VFIO_PCI_NUM_IRQS; - - return vfio_pci_set_ctx_trigger_single(&vdev->ext_irqs[i].trigger, - count, flags, data); -} - int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) @@ -703,13 +684,6 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; - default: - switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { - case VFIO_IRQ_SET_ACTION_TRIGGER: - func = vfio_pci_set_ext_irq_trigger; - break; - } - break; }
if (!func) @@ -717,39 +691,3 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
return func(vdev, index, start, count, flags, data); } - -int vfio_pci_get_ext_irq_index(struct vfio_pci_device *vdev, - unsigned int type, unsigned int subtype) -{ - int i; - - for (i = 0; i < vdev->num_ext_irqs; i++) { - if (vdev->ext_irqs[i].type == type && - vdev->ext_irqs[i].subtype == subtype) { - return i; - } - } - return -EINVAL; -} - -int vfio_pci_register_irq(struct vfio_pci_device *vdev, - unsigned int type, unsigned int subtype, - u32 flags) -{ - struct vfio_ext_irq *ext_irqs; - - ext_irqs = krealloc(vdev->ext_irqs, - (vdev->num_ext_irqs + 1) * sizeof(*ext_irqs), - GFP_KERNEL); - if (!ext_irqs) - return -ENOMEM; - - vdev->ext_irqs = ext_irqs; - - vdev->ext_irqs[vdev->num_ext_irqs].type = type; - vdev->ext_irqs[vdev->num_ext_irqs].subtype = subtype; - vdev->ext_irqs[vdev->num_ext_irqs].flags = flags; - vdev->ext_irqs[vdev->num_ext_irqs].trigger = NULL; - vdev->num_ext_irqs++; - return 0; -} diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index a578723a34a5..ab488f11b2db 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -77,13 +77,6 @@ struct vfio_pci_region { u32 flags; };
-struct vfio_ext_irq { - u32 type; - u32 subtype; - u32 flags; - struct eventfd_ctx *trigger; -}; - struct vfio_pci_dummy_resource { struct resource resource; int index; @@ -123,8 +116,6 @@ struct vfio_pci_device { struct vfio_pci_irq_ctx *ctx; int num_ctx; int irq_type; - struct vfio_ext_irq *ext_irqs; - int num_ext_irqs; int num_regions; int num_vendor_regions; int num_vendor_irqs; @@ -172,11 +163,6 @@ struct vfio_pci_device {
extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev); -extern int vfio_pci_register_irq(struct vfio_pci_device *vdev, - unsigned int type, unsigned int subtype, - u32 flags); -extern int vfio_pci_get_ext_irq_index(struct vfio_pci_device *vdev, - unsigned int type, unsigned int subtype);
extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index,
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 41e3175a4cabca590ba3be605fb2ec63cc87f7c9.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/uapi/linux/vfio.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fa3ac73c47be..6fe74f7b362c 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -712,27 +712,11 @@ struct vfio_irq_info { #define VFIO_IRQ_INFO_MASKABLE (1 << 1) #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) #define VFIO_IRQ_INFO_NORESIZE (1 << 3) -#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */ __u32 index; /* IRQ index */ __u32 count; /* Number of IRQs within this index */ - __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9)
-/* - * The irq type capability allows IRQs unique to a specific device or - * class of devices to be exposed. - * - * The structures below define version 1 of this capability. - */ -#define VFIO_IRQ_INFO_CAP_TYPE 3 - -struct vfio_irq_info_cap_type { - struct vfio_info_cap_header header; - __u32 type; /* global per bus driver */ - __u32 subtype; /* type specific */ -}; - /** * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) * @@ -834,8 +818,7 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, - VFIO_PCI_NUM_IRQS = 5 /* Fixed user ABI, IRQ indexes >=5 use */ - /* device specific cap to define content */ + VFIO_PCI_NUM_IRQS };
/*
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit e57dd79bca166644d630103e3e96b9345368c753.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 61 ++----------------------------------- 1 file changed, 3 insertions(+), 58 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 9a3d0a54ee08..9493bfe98dd4 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -371,75 +371,21 @@ static void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev, kfree(vdev->fault_pages); }
-static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, - struct vm_area_struct *vma) -{ - u64 phys_len, req_len, pgoff, req_start; - unsigned long long addr; - unsigned int ret; - - phys_len = region->size; - - req_len = vma->vm_end - vma->vm_start; - pgoff = vma->vm_pgoff & - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); - req_start = pgoff << PAGE_SHIFT; - - /* only the second page of the producer fault region is mmappable */ - if (req_start < PAGE_SIZE) - return -EINVAL; - - if (req_start + req_len > phys_len) - return -EINVAL; - - addr = virt_to_phys(vdev->fault_pages); - vma->vm_private_data = vdev; - vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff; - - ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - req_len, vma->vm_page_prot); - return ret; -} - static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev, struct vfio_pci_region *region, struct vfio_info_cap *caps) { - struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct vfio_region_info_cap_fault cap = { .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT, .header.version = 1, .version = 1, }; - size_t size = sizeof(*sparse) + sizeof(*sparse->areas); - int ret; - - ret = vfio_info_add_capability(caps, &cap.header, sizeof(cap)); - if (ret) - return ret; - - sparse = kzalloc(size, GFP_KERNEL); - if (!sparse) - return -ENOMEM; - - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->nr_areas = 1; - sparse->areas[0].offset = PAGE_SIZE; - sparse->areas[0].size = region->size - PAGE_SIZE; - - ret = vfio_info_add_capability(caps, &sparse->header, size); - if (ret) - kfree(sparse); - - return ret; + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); }
static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { .rw = vfio_pci_dma_fault_rw, .release = vfio_pci_dma_fault_release, - .mmap = vfio_pci_dma_fault_mmap, .add_capability = vfio_pci_dma_fault_add_capability, };
@@ -513,8 +459,7 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) VFIO_REGION_TYPE_NESTED, VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, &vfio_pci_dma_fault_regops, size, - VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP, + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, vdev->fault_pages); if (ret) goto out; @@ -522,7 +467,7 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) header = (struct vfio_region_dma_fault *)vdev->fault_pages; header->entry_size = sizeof(struct iommu_fault); header->nb_entries = DMA_FAULT_RING_LENGTH; - header->offset = PAGE_SIZE; + header->offset = sizeof(struct vfio_region_dma_fault);
ret = iommu_register_device_fault_handler(&vdev->pdev->dev, vfio_pci_iommu_dev_fault_handler,
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit f7c0c57bf2addf067bc27a82389bd50c25334458.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 48 +------------------------------------ 1 file changed, 1 insertion(+), 47 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 9493bfe98dd4..b68832bcc3e4 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -27,7 +27,6 @@ #include <linux/vgaarb.h> #include <linux/nospec.h> #include <linux/sched/mm.h> -#include <linux/circ_buf.h>
#include "vfio_pci_private.h"
@@ -389,41 +388,6 @@ static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { .add_capability = vfio_pci_dma_fault_add_capability, };
-static int -vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) -{ - struct vfio_pci_device *vdev = (struct vfio_pci_device *)data; - struct vfio_region_dma_fault *reg = - (struct vfio_region_dma_fault *)vdev->fault_pages; - struct iommu_fault *new; - u32 head, tail, size; - int ret = -EINVAL; - - if (WARN_ON(!reg)) - return ret; - - mutex_lock(&vdev->fault_queue_lock); - - head = reg->head; - tail = reg->tail; - size = reg->nb_entries; - - new = (struct iommu_fault *)(vdev->fault_pages + reg->offset + - head * reg->entry_size); - - if (CIRC_SPACE(head, tail, size) < 1) { - ret = -ENOSPC; - goto unlock; - } - - *new = *fault; - reg->head = (head + 1) % size; - ret = 0; -unlock: - mutex_unlock(&vdev->fault_queue_lock); - return ret; -} - #define DMA_FAULT_RING_LENGTH 512
static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) @@ -468,13 +432,6 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) header->entry_size = sizeof(struct iommu_fault); header->nb_entries = DMA_FAULT_RING_LENGTH; header->offset = sizeof(struct vfio_region_dma_fault); - - ret = iommu_register_device_fault_handler(&vdev->pdev->dev, - vfio_pci_iommu_dev_fault_handler, - vdev); - if (ret) /* the dma fault region is freed in vfio_pci_disable() */ - goto out; - return 0; out: kfree(vdev->fault_pages); @@ -598,7 +555,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; - int i, bar, ret; + int i, bar;
/* Stop the device from further DMA */ pci_clear_master(pdev); @@ -607,9 +564,6 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, vdev->irq_type, 0, 0, NULL);
- ret = iommu_unregister_device_fault_handler(&vdev->pdev->dev); - WARN_ON(ret == -EBUSY); - /* Device closed, don't need mutex here */ list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, &vdev->ioeventfds_list, next) {
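The handler removed in this patch is a single-producer ring: the kernel advances head after writing a fault record, userspace advances tail after consuming one, and CIRC_SPACE()/CIRC_CNT()-style masking decides whether there is room. The self-contained C sketch below reproduces that head/tail arithmetic with a small power-of-two ring; names and sizes are illustrative, not the VFIO fault-region layout.

#include <stdio.h>

#define RING_LEN 8      /* power of two, so the masking arithmetic works */

/* Same arithmetic as the kernel's CIRC_CNT()/CIRC_SPACE() helpers. */
#define RING_CNT(head, tail)    (((head) - (tail)) & (RING_LEN - 1))
#define RING_SPACE(head, tail)  RING_CNT((tail), (head) + 1)

int main(void)
{
    int ring[RING_LEN];
    unsigned int head = 0, tail = 0;   /* producer owns head, consumer owns tail */

    /* Producer side (the role the fault handler played): enqueue if room. */
    for (int fault = 100; fault < 110; fault++) {
        if (RING_SPACE(head, tail) < 1) {
            printf("ring full, dropping %d\n", fault);
            continue;
        }
        ring[head] = fault;
        head = (head + 1) % RING_LEN;
    }

    /* Consumer side (the role userspace played): drain and advance tail. */
    while (RING_CNT(head, tail) >= 1) {
        printf("consumed %d\n", ring[tail]);
        tail = (tail + 1) % RING_LEN;
    }
    return 0;
}

Keeping the ring length a power of two is what lets free and used counts be computed with a mask instead of comparisons that would have to special-case wrap-around.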
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit 20b23b137402e2c4fd197feacf03b0bd30629b76.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 79 ----------------------------- drivers/vfio/pci/vfio_pci_private.h | 6 --- drivers/vfio/pci/vfio_pci_rdwr.c | 44 ---------------- include/uapi/linux/vfio.h | 35 ------------- 4 files changed, 164 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index b68832bcc3e4..af18415942ff 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -364,81 +364,6 @@ int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state) return ret; }
-static void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev, - struct vfio_pci_region *region) -{ - kfree(vdev->fault_pages); -} - -static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, - struct vfio_info_cap *caps) -{ - struct vfio_region_info_cap_fault cap = { - .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT, - .header.version = 1, - .version = 1, - }; - return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); -} - -static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { - .rw = vfio_pci_dma_fault_rw, - .release = vfio_pci_dma_fault_release, - .add_capability = vfio_pci_dma_fault_add_capability, -}; - -#define DMA_FAULT_RING_LENGTH 512 - -static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev) -{ - struct vfio_region_dma_fault *header; - struct iommu_domain *domain; - size_t size; - int nested; - int ret; - - domain = iommu_get_domain_for_dev(&vdev->pdev->dev); - if (!domain) - return 0; - - ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nested); - if (ret || !nested) - return ret; - - mutex_init(&vdev->fault_queue_lock); - - /* - * We provision 1 page for the header and space for - * DMA_FAULT_RING_LENGTH fault records in the ring buffer. - */ - size = ALIGN(sizeof(struct iommu_fault) * - DMA_FAULT_RING_LENGTH, PAGE_SIZE) + PAGE_SIZE; - - vdev->fault_pages = kzalloc(size, GFP_KERNEL); - if (!vdev->fault_pages) - return -ENOMEM; - - ret = vfio_pci_register_dev_region(vdev, - VFIO_REGION_TYPE_NESTED, - VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, - &vfio_pci_dma_fault_regops, size, - VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, - vdev->fault_pages); - if (ret) - goto out; - - header = (struct vfio_region_dma_fault *)vdev->fault_pages; - header->entry_size = sizeof(struct iommu_fault); - header->nb_entries = DMA_FAULT_RING_LENGTH; - header->offset = sizeof(struct vfio_region_dma_fault); - return 0; -out: - kfree(vdev->fault_pages); - vdev->fault_pages = NULL; - return ret; -} - static int vfio_pci_enable(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -537,10 +462,6 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) } }
- ret = vfio_pci_dma_fault_init(vdev); - if (ret) - goto disable_exit; - vfio_pci_probe_mmaps(vdev);
return 0; diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index ab488f11b2db..861068ec9cf7 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -141,8 +141,6 @@ struct vfio_pci_device { int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; - u8 *fault_pages; - struct mutex fault_queue_lock; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; @@ -181,10 +179,6 @@ extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, uint64_t data, int count, int fd);
-extern size_t vfio_pci_dma_fault_rw(struct vfio_pci_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite); - extern int vfio_pci_init_perm_bits(void); extern void vfio_pci_uninit_perm_bits(void);
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 7f4d377ac9be..4bced6e43afe 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -366,50 +366,6 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return done; }
-size_t vfio_pci_dma_fault_rw(struct vfio_pci_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) -{ - unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - void *base = vdev->region[i].data; - int ret = -EFAULT; - - if (pos >= vdev->region[i].size) - return -EINVAL; - - count = min(count, (size_t)(vdev->region[i].size - pos)); - - mutex_lock(&vdev->fault_queue_lock); - - if (iswrite) { - struct vfio_region_dma_fault *header = - (struct vfio_region_dma_fault *)base; - u32 new_tail; - - if (pos != 0 || count != 4) { - ret = -EINVAL; - goto unlock; - } - - if (copy_from_user((void *)&new_tail, buf, count)) - goto unlock; - - if (new_tail >= header->nb_entries) { - ret = -EINVAL; - goto unlock; - } - header->tail = new_tail; - } else { - if (copy_to_user(buf, base + pos, count)) - goto unlock; - } - *ppos += count; - ret = count; -unlock: - mutex_unlock(&vdev->fault_queue_lock); - return ret; -} - static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, bool test_mem) { diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 6fe74f7b362c..8d75f2f0aebc 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -329,7 +329,6 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_GFX (1) #define VFIO_REGION_TYPE_CCW (2) #define VFIO_REGION_TYPE_MIGRATION (3) -#define VFIO_REGION_TYPE_NESTED (4)
/* sub-types for VFIO_REGION_TYPE_PCI_* */
@@ -354,9 +353,6 @@ struct vfio_region_info_cap_type { /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1)
-/* sub-types for VFIO_REGION_TYPE_NESTED */ -#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT (1) - /** * struct vfio_region_gfx_edid - EDID region layout. * @@ -1002,37 +998,6 @@ struct vfio_device_feature { */ #define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0)
-/* - * Capability exposed by the DMA fault region - * @version: ABI version - */ -#define VFIO_REGION_INFO_CAP_DMA_FAULT 6 - -struct vfio_region_info_cap_fault { - struct vfio_info_cap_header header; - __u32 version; -}; - -/* - * DMA Fault Region Layout - * @tail: index relative to the start of the ring buffer at which the - * consumer finds the next item in the buffer - * @entry_size: fault ring buffer entry size in bytes - * @nb_entries: max capacity of the fault ring buffer - * @offset: ring buffer offset relative to the start of the region - * @head: index relative to the start of the ring buffer at which the - * producer (kernel) inserts items into the buffers - */ -struct vfio_region_dma_fault { - /* Write-Only */ - __u32 tail; - /* Read-Only */ - __u32 entry_size; - __u32 nb_entries; - __u32 offset; - __u32 head; -}; - /* -------- API for Type1 VFIO IOMMU -------- */
/**
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
This reverts commit ac16d334b1ac8664932e725a6a6255692f4e11f6.
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/vfio_iommu_type1.c | 62 --------------------------------- include/uapi/linux/vfio.h | 20 ----------- 2 files changed, 82 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 79d42606fad8..07c868d1adc3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -3088,41 +3088,6 @@ static int vfio_cache_inv_fn(struct device *dev, void *data) return iommu_uapi_cache_invalidate(dc->domain, dev, (void __user *)arg); }
-static int -vfio_bind_msi(struct vfio_iommu *iommu, - dma_addr_t giova, phys_addr_t gpa, size_t size) -{ - struct vfio_domain *d; - int ret = 0; - - mutex_lock(&iommu->lock); - - list_for_each_entry(d, &iommu->domain_list, next) { - ret = iommu_bind_guest_msi(d->domain, giova, gpa, size); - if (ret) - goto unwind; - } - goto unlock; -unwind: - list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) { - iommu_unbind_guest_msi(d->domain, giova); - } -unlock: - mutex_unlock(&iommu->lock); - return ret; -} - -static void -vfio_unbind_msi(struct vfio_iommu *iommu, dma_addr_t giova) -{ - struct vfio_domain *d; - - mutex_lock(&iommu->lock); - list_for_each_entry(d, &iommu->domain_list, next) - iommu_unbind_guest_msi(d->domain, giova); - mutex_unlock(&iommu->lock); -} - static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, struct vfio_info_cap *caps) { @@ -3320,31 +3285,6 @@ static int vfio_iommu_type1_cache_invalidate(struct vfio_iommu *iommu, return ret; }
-static int vfio_iommu_type1_set_msi_binding(struct vfio_iommu *iommu, - unsigned long arg) -{ - struct vfio_iommu_type1_set_msi_binding msi_binding; - unsigned long minsz; - - minsz = offsetofend(struct vfio_iommu_type1_set_msi_binding, - size); - - if (copy_from_user(&msi_binding, (void __user *)arg, minsz)) - return -EFAULT; - - if (msi_binding.argsz < minsz) - return -EINVAL; - - if (msi_binding.flags == VFIO_IOMMU_UNBIND_MSI) { - vfio_unbind_msi(iommu, msi_binding.iova); - return 0; - } else if (msi_binding.flags == VFIO_IOMMU_BIND_MSI) { - return vfio_bind_msi(iommu, msi_binding.iova, - msi_binding.gpa, msi_binding.size); - } - return -EINVAL; -} - static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, unsigned long arg) { @@ -3654,8 +3594,6 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return vfio_iommu_type1_set_pasid_table(iommu, arg); case VFIO_IOMMU_CACHE_INVALIDATE: return vfio_iommu_type1_cache_invalidate(iommu, arg); - case VFIO_IOMMU_SET_MSI_BINDING: - return vfio_iommu_type1_set_msi_binding(iommu, arg); default: return -ENOTTY; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 8d75f2f0aebc..7ea68500b508 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1323,26 +1323,6 @@ struct vfio_iommu_type1_cache_invalidate { }; #define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 19)
-/** - * VFIO_IOMMU_SET_MSI_BINDING - _IOWR(VFIO_TYPE, VFIO_BASE + 20, - * struct vfio_iommu_type1_set_msi_binding) - * - * Pass a stage 1 MSI doorbell mapping to the host so that this - * latter can build a nested stage2 mapping. Or conversely tear - * down a previously bound stage 1 MSI binding. - */ -struct vfio_iommu_type1_set_msi_binding { - __u32 argsz; - __u32 flags; -#define VFIO_IOMMU_BIND_MSI (1 << 0) -#define VFIO_IOMMU_UNBIND_MSI (1 << 1) - __u64 iova; /* MSI guest IOVA */ - /* Fields below are used on BIND */ - __u64 gpa; /* MSI guest physical address */ - __u64 size; /* size of stage1 mapping (bytes) */ -}; -#define VFIO_IOMMU_SET_MSI_BINDING _IO(VFIO_TYPE, VFIO_BASE + 20) - /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
/*
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 04ba12c4366f5369157419e73e37e444e2faa232, removing the VFIO_IOMMU_CACHE_INVALIDATE ioctl that propagated guest IOMMU cache invalidations to the host.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/vfio/vfio_iommu_type1.c | 60 ---------------------------------
 include/uapi/linux/vfio.h | 13 -------
 2 files changed, 73 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 07c868d1adc3..217279ddb7cc 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -159,36 +159,6 @@ struct vfio_regions { #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
-#define WAITED 1 - -struct domain_capsule { - struct iommu_domain *domain; - void *data; -}; - -/* iommu->lock must be held */ -static int -vfio_iommu_lookup_dev(struct vfio_iommu *iommu, - int (*fn)(struct device *dev, void *data), - unsigned long arg) -{ - struct domain_capsule dc = {.data = &arg}; - struct vfio_domain *d; - struct vfio_group *g; - int ret = 0; - - list_for_each_entry(d, &iommu->domain_list, next) { - dc.domain = d->domain; - list_for_each_entry(g, &d->group_list, next) { - ret = iommu_group_for_each_dev(g->iommu_group, - &dc, fn); - if (ret) - break; - } - } - return ret; -} - static int put_pfn(unsigned long pfn, int prot);
static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, @@ -3080,13 +3050,6 @@ vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg) mutex_unlock(&iommu->lock); return ret; } -static int vfio_cache_inv_fn(struct device *dev, void *data) -{ - struct domain_capsule *dc = (struct domain_capsule *)data; - unsigned long arg = *(unsigned long *)dc->data; - - return iommu_uapi_cache_invalidate(dc->domain, dev, (void __user *)arg); -}
static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, struct vfio_info_cap *caps) @@ -3264,27 +3227,6 @@ static void vfio_iommu_dirty_log_switch(struct vfio_iommu *iommu, bool enable) } }
-static int vfio_iommu_type1_cache_invalidate(struct vfio_iommu *iommu, - unsigned long arg) -{ - struct vfio_iommu_type1_cache_invalidate cache_inv; - unsigned long minsz; - int ret; - - minsz = offsetofend(struct vfio_iommu_type1_cache_invalidate, flags); - - if (copy_from_user(&cache_inv, (void __user *)arg, minsz)) - return -EFAULT; - - if (cache_inv.argsz < minsz || cache_inv.flags) - return -EINVAL; - - mutex_lock(&iommu->lock); - ret = vfio_iommu_lookup_dev(iommu, vfio_cache_inv_fn, arg + minsz); - mutex_unlock(&iommu->lock); - return ret; -} - static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, unsigned long arg) { @@ -3592,8 +3534,6 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return vfio_iommu_type1_unbind(iommu, arg); case VFIO_IOMMU_SET_PASID_TABLE: return vfio_iommu_type1_set_pasid_table(iommu, arg); - case VFIO_IOMMU_CACHE_INVALIDATE: - return vfio_iommu_type1_cache_invalidate(iommu, arg); default: return -ENOTTY; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 7ea68500b508..38ab2b0d35e0 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1310,19 +1310,6 @@ struct vfio_iommu_type1_set_pasid_table {
#define VFIO_IOMMU_SET_PASID_TABLE _IO(VFIO_TYPE, VFIO_BASE + 18)
-/** - * VFIO_IOMMU_CACHE_INVALIDATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, - * struct vfio_iommu_type1_cache_invalidate) - * - * Propagate guest IOMMU cache invalidation to the host. - */ -struct vfio_iommu_type1_cache_invalidate { - __u32 argsz; - __u32 flags; - struct iommu_cache_invalidate_info info; -}; -#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 19) - /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
/*
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 4b0423579002261f8ea84ec82ce1039ec174025a, removing the VFIO_IOMMU_SET_PASID_TABLE ioctl used to pass a guest PASID table to the host.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/vfio/vfio_iommu_type1.c | 58 ---------------------------------
 include/uapi/linux/vfio.h | 20 ------------
 2 files changed, 78 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 217279ddb7cc..77503d59d973 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -3018,39 +3018,6 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, return ret; }
-static void -vfio_detach_pasid_table(struct vfio_iommu *iommu) -{ - struct vfio_domain *d; - - mutex_lock(&iommu->lock); - list_for_each_entry(d, &iommu->domain_list, next) - iommu_detach_pasid_table(d->domain); - - mutex_unlock(&iommu->lock); -} - -static int -vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg) -{ - struct vfio_domain *d; - int ret = 0; - - mutex_lock(&iommu->lock); - - list_for_each_entry(d, &iommu->domain_list, next) { - ret = iommu_uapi_attach_pasid_table(d->domain, (void __user *)arg); - if (ret) { - list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) - iommu_detach_pasid_table(d->domain); - break; - } - } - - mutex_unlock(&iommu->lock); - return ret; -} - static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, struct vfio_info_cap *caps) { @@ -3489,29 +3456,6 @@ static long vfio_iommu_type1_unbind(struct vfio_iommu *iommu, unsigned long arg) return 0; }
-static int vfio_iommu_type1_set_pasid_table(struct vfio_iommu *iommu, - unsigned long arg) -{ - struct vfio_iommu_type1_set_pasid_table spt; - unsigned long minsz; - - minsz = offsetofend(struct vfio_iommu_type1_set_pasid_table, flags); - - if (copy_from_user(&spt, (void __user *)arg, minsz)) - return -EFAULT; - - if (spt.argsz < minsz) - return -EINVAL; - - if (spt.flags == VFIO_PASID_TABLE_FLAG_SET) { - return vfio_attach_pasid_table(iommu, arg + minsz); - } else if (spt.flags == VFIO_PASID_TABLE_FLAG_UNSET) { - vfio_detach_pasid_table(iommu); - return 0; - } - return -EINVAL; -} - static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -3532,8 +3476,6 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return vfio_iommu_type1_bind(iommu, arg); case VFIO_IOMMU_UNBIND: return vfio_iommu_type1_unbind(iommu, arg); - case VFIO_IOMMU_SET_PASID_TABLE: - return vfio_iommu_type1_set_pasid_table(iommu, arg); default: return -ENOTTY; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 38ab2b0d35e0..52658db9aaf7 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -14,7 +14,6 @@
#include <linux/types.h> #include <linux/ioctl.h> -#include <linux/iommu.h>
#define VFIO_API_VERSION 0
@@ -1291,25 +1290,6 @@ struct vfio_iommu_type1_bind { */ #define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23)
-/* - * VFIO_IOMMU_SET_PASID_TABLE - _IOWR(VFIO_TYPE, VFIO_BASE + 18, - * struct vfio_iommu_type1_set_pasid_table) - * - * The SET operation passes a PASID table to the host while the - * UNSET operation detaches the one currently programmed. It is - * allowed to "SET" the table several times without unsetting as - * long as the table config does not stay IOMMU_PASID_CONFIG_TRANSLATE. - */ -struct vfio_iommu_type1_set_pasid_table { - __u32 argsz; - __u32 flags; -#define VFIO_PASID_TABLE_FLAG_SET (1 << 0) -#define VFIO_PASID_TABLE_FLAG_UNSET (1 << 1) - struct iommu_pasid_table_config config; /* used on SET */ -}; - -#define VFIO_IOMMU_SET_PASID_TABLE _IO(VFIO_TYPE, VFIO_BASE + 18) - /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
/*
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 7733f2e7a689598588f6074acca8b9424a76ea4a, removing the extended event decoding in arm_smmu_handle_evt() that mapped additional event types to unrecoverable fault reasons.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 40 ++-------------------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 4 ---
 2 files changed, 2 insertions(+), 42 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 457ebc1b90d2..dd94796db4f9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1700,7 +1700,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) u32 perm = 0; struct arm_smmu_master *master; bool ssid_valid = evt[0] & EVTQ_0_SSV; - u8 type = FIELD_GET(EVTQ_0_ID, evt[0]); u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]); struct iommu_fault_event fault_evt = { }; struct iommu_fault *flt = &fault_evt.fault; @@ -1753,6 +1752,8 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) } else { flt->type = IOMMU_FAULT_DMA_UNRECOV; flt->event = (struct iommu_fault_unrecoverable) { + .reason = reason, + .flags = IOMMU_FAULT_UNRECOV_ADDR_VALID, .perm = perm, .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), }; @@ -1761,43 +1762,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) flt->event.flags |= IOMMU_FAULT_UNRECOV_PASID_VALID; flt->event.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); } - - switch (type) { - case EVT_ID_TRANSLATION_FAULT: - flt->event.reason = IOMMU_FAULT_REASON_PTE_FETCH; - flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID; - break; - case EVT_ID_ADDR_SIZE_FAULT: - flt->event.reason = IOMMU_FAULT_REASON_OOR_ADDRESS; - flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID; - break; - case EVT_ID_ACCESS_FAULT: - flt->event.reason = IOMMU_FAULT_REASON_ACCESS; - flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID; - break; - case EVT_ID_PERMISSION_FAULT: - flt->event.reason = IOMMU_FAULT_REASON_PERMISSION; - flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID; - break; - case EVT_ID_BAD_SUBSTREAMID: - flt->event.reason = IOMMU_FAULT_REASON_PASID_INVALID; - break; - case EVT_ID_CD_FETCH: - flt->event.reason = IOMMU_FAULT_REASON_PASID_FETCH; - flt->event.flags |= IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID; - break; - case EVT_ID_BAD_CD: - flt->event.reason = IOMMU_FAULT_REASON_BAD_PASID_ENTRY; - break; - case EVT_ID_WALK_EABT: - flt->event.reason = IOMMU_FAULT_REASON_WALK_EABT; - flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID | - IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID; - break; - default: - /* TODO: report other unrecoverable faults. */ - return -EFAULT; - } }
mutex_lock(&smmu->streams_mutex); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index d0f3181a22c5..c744d812fc8d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -433,10 +433,6 @@
#define EVTQ_0_ID GENMASK_ULL(7, 0)
-#define EVT_ID_BAD_SUBSTREAMID 0x08 -#define EVT_ID_CD_FETCH 0x09 -#define EVT_ID_BAD_CD 0x0a -#define EVT_ID_WALK_EABT 0x0b #define EVT_ID_TRANSLATION_FAULT 0x10 #define EVT_ID_ADDR_SIZE_FAULT 0x11 #define EVT_ID_ACCESS_FAULT 0x12
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 74eeb1a933fe92b75c7140063dd3ee2d7ec5872f, removing the bind_guest_msi/unbind_guest_msi callbacks from the SMMUv3 driver.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 43 ---------------------
 1 file changed, 43 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index dd94796db4f9..533daf87879b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3585,47 +3585,6 @@ static void arm_smmu_get_resv_regions(struct device *dev, iommu_dma_get_resv_regions(dev, head); }
-static int -arm_smmu_bind_guest_msi(struct iommu_domain *domain, - dma_addr_t giova, phys_addr_t gpa, size_t size) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_device *smmu; - int ret = -EINVAL; - - mutex_lock(&smmu_domain->init_mutex); - smmu = smmu_domain->smmu; - if (!smmu) - goto out; - - if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED) - goto out; - - ret = iommu_dma_bind_guest_msi(domain, giova, gpa, size); -out: - mutex_unlock(&smmu_domain->init_mutex); - return ret; -} - -static void -arm_smmu_unbind_guest_msi(struct iommu_domain *domain, dma_addr_t giova) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_device *smmu; - - mutex_lock(&smmu_domain->init_mutex); - smmu = smmu_domain->smmu; - if (!smmu) - goto unlock; - - if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED) - goto unlock; - - iommu_dma_unbind_guest_msi(domain, giova); -unlock: - mutex_unlock(&smmu_domain->init_mutex); -} - static int arm_smmu_attach_pasid_table(struct iommu_domain *domain, struct iommu_pasid_table_config *cfg) { @@ -4309,8 +4268,6 @@ static struct iommu_ops arm_smmu_ops = { .attach_pasid_table = arm_smmu_attach_pasid_table, .detach_pasid_table = arm_smmu_detach_pasid_table, .cache_invalidate = arm_smmu_cache_invalidate, - .bind_guest_msi = arm_smmu_bind_guest_msi, - .unbind_guest_msi = arm_smmu_unbind_guest_msi, .dev_has_feat = arm_smmu_dev_has_feature, .dev_feat_enabled = arm_smmu_dev_feature_enabled, .dev_enable_feat = arm_smmu_dev_enable_feature,
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 85412c048741ef1c9d3ae4f1f6218ed15ceac587, removing the check that rejected nested-mode attachment of devices with hardware MSI reserved regions and un-exporting iommu_get_resv_regions()/iommu_put_resv_regions().

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 23 ++-------------------
 drivers/iommu/iommu.c | 2 --
 2 files changed, 2 insertions(+), 23 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 533daf87879b..c739d296f5da 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2914,23 +2914,6 @@ static bool arm_smmu_share_msi_domain(struct iommu_domain *domain, return share; }
-static bool arm_smmu_has_hw_msi_resv_region(struct device *dev) -{ - struct iommu_resv_region *region; - bool has_msi_resv_region = false; - LIST_HEAD(resv_regions); - - iommu_get_resv_regions(dev, &resv_regions); - list_for_each_entry(region, &resv_regions, list) { - if (region->type == IOMMU_RESV_MSI) { - has_msi_resv_region = true; - break; - } - } - iommu_put_resv_regions(dev, &resv_regions); - return has_msi_resv_region; -} - static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; @@ -2995,12 +2978,10 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) /* * In nested mode we must check all devices belonging to the * domain share the same physical MSI doorbell. Otherwise nested - * stage MSI binding is not supported. Also nested mode is not - * compatible with MSI HW reserved regions. + * stage MSI binding is not supported. */ if (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED && - (!arm_smmu_share_msi_domain(domain, dev) || - arm_smmu_has_hw_msi_resv_region(dev))) { + !arm_smmu_share_msi_domain(domain, dev)) { ret = -EINVAL; goto out_unlock; } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 97953fa27630..d2fbebee719b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3127,7 +3127,6 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list) if (ops && ops->get_resv_regions) ops->get_resv_regions(dev, list); } -EXPORT_SYMBOL_GPL(iommu_get_resv_regions);
void iommu_put_resv_regions(struct device *dev, struct list_head *list) { @@ -3136,7 +3135,6 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list) if (ops && ops->put_resv_regions) ops->put_resv_regions(dev, list); } -EXPORT_SYMBOL_GPL(iommu_put_resv_regions);
/** * generic_iommu_put_resv_regions - Reserved region driver helper
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit b4ddfa737eca0043559055325e6d8c0483425065, removing the nested-mode check that all devices attached to a domain must share the same physical MSI doorbell.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 41 ---------------------
 1 file changed, 41 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c739d296f5da..8d839fe21297 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2883,37 +2883,6 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) arm_smmu_install_ste_for_dev(master); }
-static bool arm_smmu_share_msi_domain(struct iommu_domain *domain, - struct device *dev) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct irq_domain *irqd = dev_get_msi_domain(dev); - struct arm_smmu_master *master; - unsigned long flags; - bool share = false; - - if (!irqd) - return true; - - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { - struct irq_domain *d = dev_get_msi_domain(master->dev); - - if (!d) - continue; - if (irqd != d) { - dev_info(dev, "Nested mode forbids to attach devices " - "using different physical MSI doorbells " - "to the same iommu_domain"); - goto unlock; - } - } - share = true; -unlock: - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - return share; -} - static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; @@ -2975,16 +2944,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) ret = -EINVAL; goto out_unlock; } - /* - * In nested mode we must check all devices belonging to the - * domain share the same physical MSI doorbell. Otherwise nested - * stage MSI binding is not supported. - */ - if (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED && - !arm_smmu_share_msi_domain(domain, dev)) { - ret = -EINVAL; - goto out_unlock; - }
master->domain = smmu_domain;
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 15700dc0010f823a62c8d77f693ce9ad121f75c6, removing the nested MSI cookie support (iommu_dma_bind_guest_msi() and the stage 2 doorbell mapping) from the IOMMU DMA layer.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/dma-iommu.c | 180 +-------------------------------------
 include/linux/dma-iommu.h | 16 ----
 2 files changed, 4 insertions(+), 192 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 50b3e3a72a00..d1539b7399a9 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -27,15 +27,12 @@ struct iommu_dma_msi_page { struct list_head list; dma_addr_t iova; - dma_addr_t gpa; phys_addr_t phys; - size_t s1_granule; };
enum iommu_dma_cookie_type { IOMMU_DMA_IOVA_COOKIE, IOMMU_DMA_MSI_COOKIE, - IOMMU_DMA_NESTED_MSI_COOKIE, };
struct iommu_dma_cookie { @@ -47,8 +44,6 @@ struct iommu_dma_cookie { dma_addr_t msi_iova; }; struct list_head msi_page_list; - /* used in nested mode only */ - spinlock_t msi_lock;
/* Domain for flush queue callback; NULL if flush queue not in use */ struct iommu_domain *fq_domain; @@ -67,7 +62,6 @@ static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); if (cookie) { - spin_lock_init(&cookie->msi_lock); INIT_LIST_HEAD(&cookie->msi_page_list); cookie->type = type; } @@ -101,17 +95,14 @@ EXPORT_SYMBOL(iommu_get_dma_cookie); * * Users who manage their own IOVA allocation and do not want DMA API support, * but would still like to take advantage of automatic MSI remapping, can use - * this to initialise their own domain appropriately. Users may reserve a + * this to initialise their own domain appropriately. Users should reserve a * contiguous IOVA region, starting at @base, large enough to accommodate the * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address - * used by the devices attached to @domain. The other way round is to provide - * usable iova pages through the iommu_dma_bind_guest_msi API (nested stages - * use case) + * used by the devices attached to @domain. */ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { struct iommu_dma_cookie *cookie; - int nesting, ret;
if (domain->type != IOMMU_DOMAIN_UNMANAGED) return -EINVAL; @@ -119,17 +110,11 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) if (domain->iova_cookie) return -EEXIST;
- ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nesting); - if (!ret && nesting) - cookie = cookie_alloc(IOMMU_DMA_NESTED_MSI_COOKIE); - else - cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE); - + cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE); if (!cookie) return -ENOMEM;
- if (!nesting) - cookie->msi_iova = base; + cookie->msi_iova = base; domain->iova_cookie = cookie; return 0; } @@ -153,116 +138,15 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule) put_iova_domain(&cookie->iovad);
- spin_lock(&cookie->msi_lock); list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) { - if (cookie->type == IOMMU_DMA_NESTED_MSI_COOKIE && msi->phys) { - size_t size = cookie_msi_granule(cookie); - - WARN_ON(iommu_unmap(domain, msi->gpa, size) != size); - } list_del(&msi->list); kfree(msi); } - spin_unlock(&cookie->msi_lock); kfree(cookie); domain->iova_cookie = NULL; } EXPORT_SYMBOL(iommu_put_dma_cookie);
-/** - * iommu_dma_bind_guest_msi - Allows to pass the stage 1 - * binding of a virtual MSI doorbell used by @dev. - * - * @domain: domain handle - * @giova: guest iova - * @gpa: gpa of the virtual doorbell - * @size: size of the granule used for the stage1 mapping - * - * In nested stage use case, the user can provide IOVA/IPA bindings - * corresponding to a guest MSI stage 1 mapping. When the host needs - * to map its own MSI doorbells, it can use @gpa as stage 2 input - * and map it onto the physical MSI doorbell. - */ -int iommu_dma_bind_guest_msi(struct iommu_domain *domain, - dma_addr_t giova, phys_addr_t gpa, size_t size) -{ - struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iommu_dma_msi_page *msi; - int ret = 0; - - if (!cookie) - return -EINVAL; - - if (cookie->type != IOMMU_DMA_NESTED_MSI_COOKIE) - return -EINVAL; - - /* - * we currently do not support S1 granule larger than S2 one - * as this would oblige to have multiple S2 mappings for a - * single S1 one - */ - if (size > cookie_msi_granule(cookie)) - return -EINVAL; - - giova = giova & ~(dma_addr_t)(size - 1); - gpa = gpa & ~(phys_addr_t)(size - 1); - - spin_lock(&cookie->msi_lock); - - list_for_each_entry(msi, &cookie->msi_page_list, list) { - if (msi->iova == giova) - goto unlock; /* this page is already registered */ - } - - msi = kzalloc(sizeof(*msi), GFP_ATOMIC); - if (!msi) { - ret = -ENOMEM; - goto unlock; - } - - msi->iova = giova; - msi->gpa = gpa; - msi->s1_granule = size; - list_add(&msi->list, &cookie->msi_page_list); -unlock: - spin_unlock(&cookie->msi_lock); - return ret; -} -EXPORT_SYMBOL(iommu_dma_bind_guest_msi); - -void iommu_dma_unbind_guest_msi(struct iommu_domain *domain, dma_addr_t giova) -{ - struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iommu_dma_msi_page *msi; - - if (!cookie) - return; - - if (cookie->type != IOMMU_DMA_NESTED_MSI_COOKIE) - return; - - spin_lock(&cookie->msi_lock); - - list_for_each_entry(msi, &cookie->msi_page_list, list) { - dma_addr_t aligned_giova = - giova & ~(dma_addr_t)(msi->s1_granule - 1); - - if (msi->iova == aligned_giova) { - if (msi->phys) { - /* unmap the stage 2 */ - size_t size = cookie_msi_granule(cookie); - - WARN_ON(iommu_unmap(domain, msi->gpa, size) != size); - } - list_del(&msi->list); - kfree(msi); - break; - } - } - spin_unlock(&cookie->msi_lock); -} -EXPORT_SYMBOL(iommu_dma_unbind_guest_msi); - /** * iommu_dma_get_resv_regions - Reserved region driver helper * @dev: Device from iommu_get_resv_regions() @@ -1314,58 +1198,6 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size) dev_name(dev)); }
-/* - * iommu_dma_get_nested_msi_page - Returns a nested stage MSI page - * mapping translating into the physical doorbell address @msi_addr - * - * In nested mode, the userspace provides the guest - * gIOVA - gDB stage 1 mappings. When we need to build a stage 2 - * mapping for a physical doorbell (@msi_addr), we look up - * for an unused S1 mapping and map the gDB onto @msi_addr - */ -static struct iommu_dma_msi_page * -iommu_dma_get_nested_msi_page(struct iommu_domain *domain, - phys_addr_t msi_addr) -{ - struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iommu_dma_msi_page *iter, *msi_page = NULL; - size_t size = cookie_msi_granule(cookie); - int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; - - spin_lock(&cookie->msi_lock); - list_for_each_entry(iter, &cookie->msi_page_list, list) - if (iter->phys == msi_addr) { - msi_page = iter; - goto unlock; - } - - /* - * No nested mapping exists for the physical doorbell, - * look for an unused S1 mapping - */ - list_for_each_entry(iter, &cookie->msi_page_list, list) { - int ret; - - if (iter->phys) - continue; - - /* do the stage 2 mapping */ - ret = iommu_map_atomic(domain, iter->gpa, msi_addr, size, prot); - if (ret) { - pr_warn_once("MSI S2 mapping 0x%llx -> 0x%llx failed (%d)\n", - iter->gpa, msi_addr, ret); - goto unlock; - } - iter->phys = msi_addr; - msi_page = iter; - goto unlock; - } - pr_warn_once("No usable S1 MSI mapping found\n"); -unlock: - spin_unlock(&cookie->msi_lock); - return msi_page; -} - static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, phys_addr_t msi_addr, struct iommu_domain *domain) { @@ -1376,10 +1208,6 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, size_t size = cookie_msi_granule(cookie);
msi_addr &= ~(phys_addr_t)(size - 1); - - if (cookie->type == IOMMU_DMA_NESTED_MSI_COOKIE) - return iommu_dma_get_nested_msi_page(domain, msi_addr); - list_for_each_entry(msi_page, &cookie->msi_page_list, list) if (msi_page->phys == msi_addr) return msi_page; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index f112ecdb4af6..2112f21f73d8 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -12,7 +12,6 @@ #include <linux/dma-mapping.h> #include <linux/iommu.h> #include <linux/msi.h> -#include <uapi/linux/iommu.h>
/* Domain management interface for IOMMU drivers */ int iommu_get_dma_cookie(struct iommu_domain *domain); @@ -37,9 +36,6 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg);
void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); -int iommu_dma_bind_guest_msi(struct iommu_domain *domain, - dma_addr_t iova, phys_addr_t gpa, size_t size); -void iommu_dma_unbind_guest_msi(struct iommu_domain *domain, dma_addr_t giova);
#else /* CONFIG_IOMMU_DMA */
@@ -78,18 +74,6 @@ static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, { }
-static inline int -iommu_dma_bind_guest_msi(struct iommu_domain *domain, - dma_addr_t iova, phys_addr_t gpa, size_t size) -{ - return -ENODEV; -} - -static inline void -iommu_dma_unbind_guest_msi(struct iommu_domain *domain, dma_addr_t giova) -{ -} - static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { }
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 04039cc97a8839f000fb8cfaa71d84ea0bae7850, removing the SMMUv3 cache_invalidate callback used for nested stage TLB invalidation.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 88 ---------------------
 1 file changed, 88 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8d839fe21297..1446e4ae1337 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3612,93 +3612,6 @@ static void arm_smmu_detach_pasid_table(struct iommu_domain *domain) mutex_unlock(&smmu_domain->init_mutex); }
-static int -arm_smmu_cache_invalidate(struct iommu_domain *domain, struct device *dev, - struct iommu_cache_invalidate_info *inv_info) -{ - struct arm_smmu_cmdq_ent cmd = {.opcode = CMDQ_OP_TLBI_NSNH_ALL}; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_device *smmu = smmu_domain->smmu; - - if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED) - return -EINVAL; - - if (!smmu) - return -EINVAL; - - if (inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) - return -EINVAL; - - if (inv_info->cache & IOMMU_CACHE_INV_TYPE_PASID || - inv_info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB) { - return -ENOENT; - } - - if (!(inv_info->cache & IOMMU_CACHE_INV_TYPE_IOTLB)) - return -EINVAL; - - /* IOTLB invalidation */ - - switch (inv_info->granularity) { - case IOMMU_INV_GRANU_PASID: - { - struct iommu_inv_pasid_info *info = - &inv_info->granu.pasid_info; - - if (info->flags & IOMMU_INV_ADDR_FLAGS_PASID) - return -ENOENT; - if (!(info->flags & IOMMU_INV_PASID_FLAGS_ARCHID)) - return -EINVAL; - - __arm_smmu_tlb_inv_context(smmu_domain, info->archid); - return 0; - } - case IOMMU_INV_GRANU_ADDR: - { - struct iommu_inv_addr_info *info = &inv_info->granu.addr_info; - size_t granule_size = info->granule_size; - size_t size = info->nb_granules * info->granule_size; - bool leaf = info->flags & IOMMU_INV_ADDR_FLAGS_LEAF; - int tg; - - if (info->flags & IOMMU_INV_ADDR_FLAGS_PASID) - return -ENOENT; - - if (!(info->flags & IOMMU_INV_ADDR_FLAGS_ARCHID)) - break; - - tg = __ffs(granule_size); - if (granule_size & ~(1 << tg)) - return -EINVAL; - /* - * When RIL is not supported, make sure the granule size that is - * passed is supported. In RIL mode, this is enforced in - * __arm_smmu_tlb_inv_range() - */ - if (!(smmu->features & ARM_SMMU_FEAT_RANGE_INV) && - !(granule_size & smmu_domain->domain.pgsize_bitmap)) { - tg = __ffs(smmu_domain->domain.pgsize_bitmap); - granule_size = 1 << tg; - size = size >> tg; - } - - arm_smmu_tlb_inv_range_domain(info->addr, size, - granule_size, leaf, - info->archid, smmu_domain); - return 0; - } - case IOMMU_INV_GRANU_DOMAIN: - break; - default: - return -EINVAL; - } - - /* Global S1 invalidation */ - cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; - arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); - return 0; -} - static bool arm_smmu_dev_has_feature(struct device *dev, enum iommu_dev_features feat) { @@ -4207,7 +4120,6 @@ static struct iommu_ops arm_smmu_ops = { .put_resv_regions = generic_iommu_put_resv_regions, .attach_pasid_table = arm_smmu_attach_pasid_table, .detach_pasid_table = arm_smmu_detach_pasid_table, - .cache_invalidate = arm_smmu_cache_invalidate, .dev_has_feat = arm_smmu_dev_has_feature, .dev_feat_enabled = arm_smmu_dev_feature_enabled, .dev_enable_feat = arm_smmu_dev_enable_feature,
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit eeb79c56db25e457d0e0bf14db747cbd7d456a93, removing the external (guest) ASID handling from the SMMUv3 TLB invalidation helpers.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 44 +++++----------------
 1 file changed, 10 insertions(+), 34 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1446e4ae1337..1167c4087ad1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2191,9 +2191,9 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, }
/* IO_PGTABLE API */ -static void __arm_smmu_tlb_inv_context(struct arm_smmu_domain *smmu_domain, - int ext_asid) +static void arm_smmu_tlb_inv_context(void *cookie) { + struct arm_smmu_domain *smmu_domain = cookie; struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_cmdq_ent cmd;
@@ -2204,12 +2204,7 @@ static void __arm_smmu_tlb_inv_context(struct arm_smmu_domain *smmu_domain, * insertion to guarantee those are observed before the TLBI. Do be * careful, 007. */ - if (ext_asid >= 0) { /* guest stage 1 invalidation */ - cmd.opcode = CMDQ_OP_TLBI_NH_ASID; - cmd.tlbi.asid = ext_asid; - cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; - arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); - } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { arm_smmu_tlb_inv_asid(smmu, smmu_domain->s1_cfg.cd.asid); } else { cmd.opcode = CMDQ_OP_TLBI_S12_VMALL; @@ -2224,13 +2219,6 @@ static void __arm_smmu_tlb_inv_context(struct arm_smmu_domain *smmu_domain,
}
-static void arm_smmu_tlb_inv_context(void *cookie) -{ - struct arm_smmu_domain *smmu_domain = cookie; - - __arm_smmu_tlb_inv_context(smmu_domain, -1); -} - static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, unsigned long iova, size_t size, size_t granule, @@ -2292,10 +2280,9 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, arm_smmu_preempt_enable(smmu); }
-static void -arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, - size_t granule, bool leaf, int ext_asid, - struct arm_smmu_domain *smmu_domain) +static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, + size_t granule, bool leaf, + struct arm_smmu_domain *smmu_domain) { struct arm_smmu_cmdq_ent cmd = { .tlbi = { @@ -2303,16 +2290,7 @@ arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, }, };
- if (ext_asid >= 0) { /* guest stage 1 invalidation */ - /* - * At the moment the guest only uses NS-EL1, to be - * revisited when nested virt gets supported with E2H - * exposed. - */ - cmd.opcode = CMDQ_OP_TLBI_NH_VA; - cmd.tlbi.asid = ext_asid; - cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; - } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { cmd.opcode = smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ? CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA; cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid; @@ -2320,7 +2298,6 @@ arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, cmd.opcode = CMDQ_OP_TLBI_S2_IPA; cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; } - __arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
/* @@ -2363,7 +2340,7 @@ static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather, static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, size_t granule, void *cookie) { - arm_smmu_tlb_inv_range_domain(iova, size, granule, false, -1, cookie); + arm_smmu_tlb_inv_range_domain(iova, size, granule, false, cookie); }
static const struct iommu_flush_ops arm_smmu_flush_ops = { @@ -2999,9 +2976,8 @@ static void arm_smmu_iotlb_sync(struct iommu_domain *domain, { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
- arm_smmu_tlb_inv_range_domain(gather->start, - gather->end - gather->start + 1, - gather->pgsize, true, -1, smmu_domain); + arm_smmu_tlb_inv_range_domain(gather->start, gather->end - gather->start + 1, + gather->pgsize, true, smmu_domain); }
static phys_addr_t
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit c47df7b65d78d7589df949d74e6242a77c6bc2be, removing the SMMUv3 attach_pasid_table/detach_pasid_table callbacks.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 93 ---------------------
 1 file changed, 93 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1167c4087ad1..68e45444b6b6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1202,10 +1202,6 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst, WRITE_ONCE(*dst, cpu_to_le64(val)); }
-/* - * Must not be used in case of nested mode where the CD table is owned - * by the guest - */ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain *smmu_domain, u32 ssid) { @@ -3501,93 +3497,6 @@ static void arm_smmu_get_resv_regions(struct device *dev, iommu_dma_get_resv_regions(dev, head); }
-static int arm_smmu_attach_pasid_table(struct iommu_domain *domain, - struct iommu_pasid_table_config *cfg) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_master *master; - struct arm_smmu_device *smmu; - unsigned long flags; - int ret = -EINVAL; - - if (cfg->format != IOMMU_PASID_FORMAT_SMMUV3) - return -EINVAL; - - if (cfg->version != PASID_TABLE_CFG_VERSION_1 || - cfg->vendor_data.smmuv3.version != PASID_TABLE_SMMUV3_CFG_VERSION_1) - return -EINVAL; - - mutex_lock(&smmu_domain->init_mutex); - - smmu = smmu_domain->smmu; - - if (!smmu) - goto out; - - if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED) - goto out; - - switch (cfg->config) { - case IOMMU_PASID_CONFIG_ABORT: - smmu_domain->s1_cfg.set = false; - smmu_domain->abort = true; - break; - case IOMMU_PASID_CONFIG_BYPASS: - smmu_domain->s1_cfg.set = false; - smmu_domain->abort = false; - break; - case IOMMU_PASID_CONFIG_TRANSLATE: - /* we do not support S1 <-> S1 transitions */ - if (smmu_domain->s1_cfg.set) - goto out; - - /* - * we currently support a single CD so s1fmt and s1dss - * fields are also ignored - */ - if (cfg->pasid_bits) - goto out; - - smmu_domain->s1_cfg.cdcfg.cdtab_dma = cfg->base_ptr; - smmu_domain->s1_cfg.set = true; - smmu_domain->abort = false; - break; - default: - goto out; - } - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) - arm_smmu_install_ste_for_dev(master); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - ret = 0; -out: - mutex_unlock(&smmu_domain->init_mutex); - return ret; -} - -static void arm_smmu_detach_pasid_table(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_master *master; - unsigned long flags; - - mutex_lock(&smmu_domain->init_mutex); - - if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED) - goto unlock; - - smmu_domain->s1_cfg.set = false; - smmu_domain->abort = false; - - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) - arm_smmu_install_ste_for_dev(master); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - -unlock: - mutex_unlock(&smmu_domain->init_mutex); -} - static bool arm_smmu_dev_has_feature(struct device *dev, enum iommu_dev_features feat) { @@ -4094,8 +4003,6 @@ static struct iommu_ops arm_smmu_ops = { .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, - .attach_pasid_table = arm_smmu_attach_pasid_table, - .detach_pasid_table = arm_smmu_detach_pasid_table, .dev_has_feat = arm_smmu_dev_has_feature, .dev_feat_enabled = arm_smmu_dev_feature_enabled, .dev_enable_feat = arm_smmu_dev_enable_feature,
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 54cb4594c8396fb3dcb846d13c64ebcd72f0aabc, dropping the nested and live STE handling from arm_smmu_write_strtab_ent().

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 55 +++------------------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 -
 2 files changed, 8 insertions(+), 49 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 68e45444b6b6..02116ae13c01 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1461,8 +1461,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, * 3. Update Config, sync */ u64 val = le64_to_cpu(dst[0]); - bool s1_live = false, s2_live = false, ste_live; - bool abort, translate = false; + bool ste_live = false; struct arm_smmu_device *smmu = NULL; struct arm_smmu_s1_cfg *s1_cfg; struct arm_smmu_s2_cfg *s2_cfg; @@ -1502,7 +1501,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, default: break; } - translate = s1_cfg->set || s2_cfg->set; }
if (val & STRTAB_STE_0_V) { @@ -1510,36 +1508,23 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, case STRTAB_STE_0_CFG_BYPASS: break; case STRTAB_STE_0_CFG_S1_TRANS: - s1_live = true; - break; case STRTAB_STE_0_CFG_S2_TRANS: - s2_live = true; - break; - case STRTAB_STE_0_CFG_NESTED: - s1_live = true; - s2_live = true; + ste_live = true; break; case STRTAB_STE_0_CFG_ABORT: + BUG_ON(!disable_bypass); break; default: BUG(); /* STE corruption */ } }
- ste_live = s1_live || s2_live; - /* Nuke the existing STE_0 value, as we're going to rewrite it */ val = STRTAB_STE_0_V;
/* Bypass/fault */ - - if (!smmu_domain) - abort = disable_bypass; - else - abort = smmu_domain->abort; - - if (abort || !translate) { - if (abort) + if (!smmu_domain || !(s1_cfg->set || s2_cfg->set)) { + if (!smmu_domain && disable_bypass) val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); else val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); @@ -1557,17 +1542,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, return; }
- if (ste_live) { - /* First invalidate the live STE */ - dst[0] = cpu_to_le64(STRTAB_STE_0_CFG_ABORT); - arm_smmu_sync_ste_for_sid(smmu, sid); - } - if (s1_cfg->set) { u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
- BUG_ON(s1_live); + BUG_ON(ste_live); dst[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | @@ -1589,14 +1568,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, }
if (s2_cfg->set) { - u64 vttbr = s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK; - - if (s2_live) { - u64 s2ttb = le64_to_cpu(dst[3]) & STRTAB_STE_3_S2TTB_MASK; - - BUG_ON(s2ttb != vttbr); - } - + BUG_ON(ste_live); dst[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | @@ -1606,12 +1578,9 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R);
- dst[3] = cpu_to_le64(vttbr); + dst[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); - } else { - dst[2] = 0; - dst[3] = 0; }
if (master->ats_enabled) @@ -2555,14 +2524,6 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain, return 0; }
- if (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED && - (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1) || - !(smmu->features & ARM_SMMU_FEAT_TRANS_S2))) { - dev_info(smmu_domain->smmu->dev, - "does not implement two stages\n"); - return -EINVAL; - } - /* Restrict the stage to what we can actually support */ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) smmu_domain->stage = ARM_SMMU_DOMAIN_S2; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c744d812fc8d..fdf80cf1184c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -245,7 +245,6 @@ #define STRTAB_STE_0_CFG_BYPASS 4 #define STRTAB_STE_0_CFG_S1_TRANS 5 #define STRTAB_STE_0_CFG_S2_TRANS 6 -#define STRTAB_STE_0_CFG_NESTED 7
#define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) #define STRTAB_STE_0_S1FMT_LINEAR 0 @@ -803,7 +802,6 @@ struct arm_smmu_domain { enum arm_smmu_domain_stage stage; struct arm_smmu_s1_cfg s1_cfg; struct arm_smmu_s2_cfg s2_cfg; - bool abort;
struct iommu_domain domain;
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit a07fcc1fc081da9990da18818bafa276ddc227c0, restoring the union of s1_cfg and s2_cfg in struct arm_smmu_domain and dropping their 'set' flags.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 47 ++++++++-------------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 8 ++--
 2 files changed, 22 insertions(+), 33 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 02116ae13c01..01542e9f6b05 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1463,8 +1463,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, u64 val = le64_to_cpu(dst[0]); bool ste_live = false; struct arm_smmu_device *smmu = NULL; - struct arm_smmu_s1_cfg *s1_cfg; - struct arm_smmu_s2_cfg *s2_cfg; + struct arm_smmu_s1_cfg *s1_cfg = NULL; + struct arm_smmu_s2_cfg *s2_cfg = NULL; struct arm_smmu_domain *smmu_domain = NULL; struct arm_smmu_cmdq_ent prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, @@ -1479,24 +1479,13 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, }
if (smmu_domain) { - s1_cfg = &smmu_domain->s1_cfg; - s2_cfg = &smmu_domain->s2_cfg; - switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: - s1_cfg->set = true; - s2_cfg->set = false; + s1_cfg = &smmu_domain->s1_cfg; break; case ARM_SMMU_DOMAIN_S2: - s1_cfg->set = false; - s2_cfg->set = true; - break; case ARM_SMMU_DOMAIN_NESTED: - /* - * Actual usage of stage 1 depends on nested mode: - * legacy (2d stage only) or true nested mode - */ - s2_cfg->set = true; + s2_cfg = &smmu_domain->s2_cfg; break; default: break; @@ -1523,7 +1512,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, val = STRTAB_STE_0_V;
/* Bypass/fault */ - if (!smmu_domain || !(s1_cfg->set || s2_cfg->set)) { + if (!smmu_domain || !(s1_cfg || s2_cfg)) { if (!smmu_domain && disable_bypass) val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); else @@ -1542,7 +1531,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, return; }
- if (s1_cfg->set) { + if (s1_cfg) { u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
@@ -1567,7 +1556,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, FIELD_PREP(STRTAB_STE_0_S1FMT, s1_cfg->s1fmt); }
- if (s2_cfg->set) { + if (s2_cfg) { BUG_ON(ste_live); dst[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | @@ -2381,26 +2370,26 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - struct arm_smmu_s1_cfg *s1_cfg = &smmu_domain->s1_cfg; - struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg;
iommu_put_dma_cookie(domain); free_io_pgtable_ops(smmu_domain->pgtbl_ops);
/* Free the CD and ASID, if we allocated them */ - if (s1_cfg->set) { + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { + struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; + /* Prevent SVA from touching the CD while we're freeing it */ mutex_lock(&arm_smmu_asid_lock); - if (s1_cfg->cdcfg.cdtab) + if (cfg->cdcfg.cdtab) arm_smmu_free_cd_tables(smmu_domain); - arm_smmu_free_asid(&s1_cfg->cd); + arm_smmu_free_asid(&cfg->cd); mutex_unlock(&arm_smmu_asid_lock); if (smmu_domain->ssid) ioasid_free(smmu_domain->ssid); - } - if (s2_cfg->set) { - if (s2_cfg->vmid) - arm_smmu_bitmap_free(smmu->vmid_map, s2_cfg->vmid); + } else { + struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; + if (cfg->vmid) + arm_smmu_bitmap_free(smmu->vmid_map, cfg->vmid); }
kfree(smmu_domain); @@ -3699,7 +3688,7 @@ static int arm_smmu_set_mpam(struct arm_smmu_device *smmu,
if (WARN_ON(!domain)) return -EINVAL; - if (WARN_ON(!domain->s1_cfg.set)) + if (WARN_ON(domain->stage != ARM_SMMU_DOMAIN_S1)) return -EINVAL; if (WARN_ON(ssid >= (1 << domain->s1_cfg.s1cdmax))) return -E2BIG; @@ -3822,7 +3811,7 @@ static int arm_smmu_get_mpam(struct arm_smmu_device *smmu,
if (WARN_ON(!domain)) return -EINVAL; - if (WARN_ON(!domain->s1_cfg.set)) + if (WARN_ON(domain->stage != ARM_SMMU_DOMAIN_S1)) return -EINVAL; if (WARN_ON(ssid >= (1 << domain->s1_cfg.s1cdmax))) return -E2BIG; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index fdf80cf1184c..f680cd6dd3bd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -658,14 +658,12 @@ struct arm_smmu_s1_cfg { struct arm_smmu_ctx_desc cd; u8 s1fmt; u8 s1cdmax; - bool set; };
struct arm_smmu_s2_cfg { u16 vmid; u64 vttbr; u64 vtcr; - bool set; };
struct arm_smmu_strtab_cfg { @@ -800,8 +798,10 @@ struct arm_smmu_domain { atomic_t nr_ats_masters;
enum arm_smmu_domain_stage stage; - struct arm_smmu_s1_cfg s1_cfg; - struct arm_smmu_s2_cfg s2_cfg; + union { + struct arm_smmu_s1_cfg s1_cfg; + struct arm_smmu_s2_cfg s2_cfg; + };
struct iommu_domain domain;
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit 9db83ab7c297b4f0d4d31a22fe389ce31c1ee662, removing the iommu_bind_guest_msi()/iommu_unbind_guest_msi() core IOMMU API.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/iommu.c | 37 -------------------------------------
 include/linux/iommu.h | 18 ------------------
 2 files changed, 55 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d2fbebee719b..d53c88c647ae 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2383,43 +2383,6 @@ static void __iommu_detach_device(struct iommu_domain *domain, trace_detach_device_from_domain(dev); }
-/** - * iommu_bind_guest_msi - Passes the stage1 GIOVA/GPA mapping of a - * virtual doorbell - * - * @domain: iommu domain the stage 1 mapping will be attached to - * @iova: iova allocated by the guest - * @gpa: guest physical address of the virtual doorbell - * @size: granule size used for the mapping - * - * The associated IOVA can be reused by the host to create a nested - * stage2 binding mapping translating into the physical doorbell used - * by the devices attached to the domain. - * - * All devices within the domain must share the same physical doorbell. - * A single MSI GIOVA/GPA mapping can be attached to an iommu_domain. - */ - -int iommu_bind_guest_msi(struct iommu_domain *domain, - dma_addr_t giova, phys_addr_t gpa, size_t size) -{ - if (unlikely(!domain->ops->bind_guest_msi)) - return -ENODEV; - - return domain->ops->bind_guest_msi(domain, giova, gpa, size); -} -EXPORT_SYMBOL_GPL(iommu_bind_guest_msi); - -void iommu_unbind_guest_msi(struct iommu_domain *domain, - dma_addr_t giova) -{ - if (unlikely(!domain->ops->unbind_guest_msi)) - return; - - domain->ops->unbind_guest_msi(domain, giova); -} -EXPORT_SYMBOL_GPL(iommu_unbind_guest_msi); - void iommu_detach_device(struct iommu_domain *domain, struct device *dev) { struct iommu_group *group; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 95320164dcf3..0e696aec98a5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -248,8 +248,6 @@ struct iommu_iotlb_gather { * @sva_unbind_gpasid: unbind guest pasid and mm * @attach_pasid_table: attach a pasid table * @detach_pasid_table: detach the pasid table - * @bind_guest_msi: provides a stage1 giova/gpa MSI doorbell mapping - * @unbind_guest_msi: withdraw a stage1 giova/gpa MSI doorbell mapping * @def_domain_type: device default domain type, return value: * - IOMMU_DOMAIN_IDENTITY: must use an identity domain * - IOMMU_DOMAIN_DMA: must use a dma domain @@ -347,10 +345,6 @@ struct iommu_ops {
int (*def_domain_type)(struct device *dev);
- int (*bind_guest_msi)(struct iommu_domain *domain, - dma_addr_t giova, phys_addr_t gpa, size_t size); - void (*unbind_guest_msi)(struct iommu_domain *domain, dma_addr_t giova); - int (*dev_get_config)(struct device *dev, int type, void *data); int (*dev_set_config)(struct device *dev, int type, void *data);
@@ -507,10 +501,6 @@ extern int iommu_attach_pasid_table(struct iommu_domain *domain, extern int iommu_uapi_attach_pasid_table(struct iommu_domain *domain, void __user *udata); extern void iommu_detach_pasid_table(struct iommu_domain *domain); -extern int iommu_bind_guest_msi(struct iommu_domain *domain, - dma_addr_t giova, phys_addr_t gpa, size_t size); -extern void iommu_unbind_guest_msi(struct iommu_domain *domain, - dma_addr_t giova); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern size_t iommu_pgsize(struct iommu_domain *domain, @@ -1221,14 +1211,6 @@ iommu_sva_bind_group(struct iommu_group *group, struct mm_struct *mm, return NULL; }
-int iommu_bind_guest_msi(struct iommu_domain *domain, - dma_addr_t giova, phys_addr_t gpa, size_t size) -{ - return -ENODEV; -} -static inline -void iommu_unbind_guest_msi(struct iommu_domain *domain, dma_addr_t giova) {} - static inline int iommu_dev_set_config(struct device *dev, int type, void *data) {
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO
CVE: NA

--------------------------------

This reverts commit dbb4844d2af73302bbcf96669a59d031ba69ca85, removing iommu_uapi_attach_pasid_table(), which validated and copied a user-provided PASID table configuration before attaching it.

Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com
Reviewed-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/iommu.c | 50 --------------------------------------
 include/linux/iommu.h | 13 ++--------
 include/uapi/linux/iommu.h | 13 ++++------
 3 files changed, 7 insertions(+), 69 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d53c88c647ae..b888efd65e92 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2311,56 +2311,6 @@ int iommu_attach_pasid_table(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_attach_pasid_table);
-int iommu_uapi_attach_pasid_table(struct iommu_domain *domain, - void __user *uinfo) -{ - struct iommu_pasid_table_config pasid_table_data = { 0 }; - u32 minsz; - - if (unlikely(!domain->ops->attach_pasid_table)) - return -ENODEV; - - /* - * No new spaces can be added before the variable sized union, the - * minimum size is the offset to the union. - */ - minsz = offsetof(struct iommu_pasid_table_config, vendor_data); - - /* Copy minsz from user to get flags and argsz */ - if (copy_from_user(&pasid_table_data, uinfo, minsz)) - return -EFAULT; - - /* Fields before the variable size union are mandatory */ - if (pasid_table_data.argsz < minsz) - return -EINVAL; - - /* PASID and address granu require additional info beyond minsz */ - if (pasid_table_data.version != PASID_TABLE_CFG_VERSION_1) - return -EINVAL; - if (pasid_table_data.format == IOMMU_PASID_FORMAT_SMMUV3 && - pasid_table_data.argsz < - offsetofend(struct iommu_pasid_table_config, vendor_data.smmuv3)) - return -EINVAL; - - /* - * User might be using a newer UAPI header which has a larger data - * size, we shall support the existing flags within the current - * size. Copy the remaining user data _after_ minsz but not more - * than the current kernel supported size. - */ - if (copy_from_user((void *)&pasid_table_data + minsz, uinfo + minsz, - min_t(u32, pasid_table_data.argsz, sizeof(pasid_table_data)) - minsz)) - return -EFAULT; - - /* Now the argsz is validated, check the content */ - if (pasid_table_data.config < IOMMU_PASID_CONFIG_TRANSLATE || - pasid_table_data.config > IOMMU_PASID_CONFIG_ABORT) - return -EINVAL; - - return domain->ops->attach_pasid_table(domain, &pasid_table_data); -} -EXPORT_SYMBOL_GPL(iommu_uapi_attach_pasid_table); - void iommu_detach_pasid_table(struct iommu_domain *domain) { if (unlikely(!domain->ops->detach_pasid_table)) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0e696aec98a5..6671e45d3c3b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -246,12 +246,12 @@ struct iommu_iotlb_gather { * @cache_invalidate: invalidate translation caches * @sva_bind_gpasid: bind guest pasid and mm * @sva_unbind_gpasid: unbind guest pasid and mm - * @attach_pasid_table: attach a pasid table - * @detach_pasid_table: detach the pasid table * @def_domain_type: device default domain type, return value: * - IOMMU_DOMAIN_IDENTITY: must use an identity domain * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting + * @attach_pasid_table: attach a pasid table + * @detach_pasid_table: detach the pasid table * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops */ @@ -498,8 +498,6 @@ extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); extern int iommu_attach_pasid_table(struct iommu_domain *domain, struct iommu_pasid_table_config *cfg); -extern int iommu_uapi_attach_pasid_table(struct iommu_domain *domain, - void __user *udata); extern void iommu_detach_pasid_table(struct iommu_domain *domain); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); @@ -1194,13 +1192,6 @@ int iommu_attach_pasid_table(struct iommu_domain *domain, return -ENODEV; }
-static inline -int iommu_uapi_attach_pasid_table(struct iommu_domain *domain, - void __user *uinfo) -{ - return -ENODEV; -} - static inline void iommu_detach_pasid_table(struct iommu_domain *domain) {}
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 40c28bb0e1bf..bed34a8c9430 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -363,33 +363,30 @@ struct iommu_pasid_smmuv3 { /** * struct iommu_pasid_table_config - PASID table data used to bind guest PASID * table to the host IOMMU - * @argsz: User filled size of this data * @version: API version to prepare for future extensions - * @base_ptr: guest physical address of the PASID table * @format: format of the PASID table + * @base_ptr: guest physical address of the PASID table * @pasid_bits: number of PASID bits used in the PASID table * @config: indicates whether the guest translation stage must * be translated, bypassed or aborted. * @padding: reserved for future use (should be zero) - * @vendor_data.smmuv3: table information when @format is - * %IOMMU_PASID_FORMAT_SMMUV3 + * @smmuv3: table information when @format is %IOMMU_PASID_FORMAT_SMMUV3 */ struct iommu_pasid_table_config { - __u32 argsz; #define PASID_TABLE_CFG_VERSION_1 1 __u32 version; - __u64 base_ptr; #define IOMMU_PASID_FORMAT_SMMUV3 1 __u32 format; + __u64 base_ptr; __u8 pasid_bits; #define IOMMU_PASID_CONFIG_TRANSLATE 1 #define IOMMU_PASID_CONFIG_BYPASS 2 #define IOMMU_PASID_CONFIG_ABORT 3 __u8 config; - __u8 padding[2]; + __u8 padding[6]; union { struct iommu_pasid_smmuv3 smmuv3; - } vendor_data; + }; };
#endif /* _UAPI_IOMMU_H */
From: Kunkun Jiang jiangkunkun@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61SPO CVE: NA
--------------------------------
To stay consistent with the vSVA approach taken by the upstream community, the related patches and bugfixes need to be reverted. At the same time, steps are needed to avoid a KABI change: the removed iommu_ops callbacks are replaced with KABI_DEPRECATE_FN placeholders and the original layout of struct iommu_pasid_table_config is restored, as the sketch below illustrates.
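For readers unfamiliar with the KABI helpers, here is a minimal sketch of the idea behind KABI_DEPRECATE_FN as used in the hunk that follows. The macro body and the demo_ops structure below are invented for illustration only (the real definition lives in the openEuler kabi header and may differ in detail); the point is that each removed callback is replaced by a placeholder of identical size and position, so the offsets of later members and the structure size, and therefore the kernel ABI, stay unchanged.

/*
 * Illustrative sketch only -- not the real openEuler macro. A deprecated
 * function pointer keeps its slot in the structure so that later members
 * do not move and the struct size does not change.
 */
#include <stddef.h>
#include <stdio.h>

#define KABI_DEPRECATE_FN(ret_type, name, args...) \
	ret_type (*kabi_deprecated_##name)(args);

struct demo_ops {
	int (*keep_me)(int arg);
	/* was: int (*bind_guest_msi)(void *dom, unsigned long giova,
	 *                            unsigned long gpa, size_t size); */
	KABI_DEPRECATE_FN(int, bind_guest_msi, void *dom,
			  unsigned long giova, unsigned long gpa, size_t size)
	int (*also_keep_me)(void);
};

int main(void)
{
	/* The member after the deprecated slot keeps its old offset. */
	printf("offsetof(also_keep_me) = %zu (two pointers = %zu)\n",
	       offsetof(struct demo_ops, also_keep_me), 2 * sizeof(void *));
	return 0;
}

The same reasoning applies to the UAPI structure: restoring argsz, the original member order and padding[2] keeps iommu_pasid_table_config binary-compatible with existing users.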
Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/iommu.h | 5 +++++ include/uapi/linux/iommu.h | 7 ++++--- 2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6671e45d3c3b..47294a3a398e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -345,6 +345,11 @@ struct iommu_ops {
int (*def_domain_type)(struct device *dev);
+ KABI_DEPRECATE_FN(int, bind_guest_msi, struct iommu_domain *domain, + dma_addr_t giova, phys_addr_t gpa, size_t size) + KABI_DEPRECATE_FN(void, unbind_guest_msi, struct iommu_domain *domain, + dma_addr_t giova) + int (*dev_get_config)(struct device *dev, int type, void *data); int (*dev_set_config)(struct device *dev, int type, void *data);
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index bed34a8c9430..9ddaf2d22d9a 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -373,20 +373,21 @@ struct iommu_pasid_smmuv3 { * @smmuv3: table information when @format is %IOMMU_PASID_FORMAT_SMMUV3 */ struct iommu_pasid_table_config { + __u32 argsz; #define PASID_TABLE_CFG_VERSION_1 1 __u32 version; + __u64 base_ptr; #define IOMMU_PASID_FORMAT_SMMUV3 1 __u32 format; - __u64 base_ptr; __u8 pasid_bits; #define IOMMU_PASID_CONFIG_TRANSLATE 1 #define IOMMU_PASID_CONFIG_BYPASS 2 #define IOMMU_PASID_CONFIG_ABORT 3 __u8 config; - __u8 padding[6]; + __u8 padding[2]; union { struct iommu_pasid_smmuv3 smmuv3; - }; + } vendor_data; };
#endif /* _UAPI_IOMMU_H */
From: Dan Carpenter dan.carpenter@oracle.com
stable inclusion from stable-v5.10.142 commit 19e3f69d19801940abc2ac37c169882769ed9770 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I63OIO CVE: CVE-2022-4095
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
The _Read/Write_MACREG callbacks are NULL, so the read_macreg_hdl() and write_macreg_hdl() functions do nothing except free the "pcmd" pointer. cmd_hdl_filter() then still returns that freed command via pcmd_r, which results in a use after free. Delete the two handlers and their switch cases.
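To make the use-after-free concrete, here is a heavily simplified, self-contained user-space sketch of the flow being removed. The structure and function names mirror the driver, but the types, the cmdcode value and the surrounding logic are invented for illustration; the real code is in drivers/staging/rtl8712/rtl8712_cmd.c.

/* Sketch of the buggy flow fixed by this patch -- illustration only. */
#include <stdio.h>
#include <stdlib.h>

struct cmd_obj {
	int cmdcode;
	void (*callback)(struct cmd_obj *cmd);
};

/* In the driver the _Read/Write_MACREG callback entries are NULL,
 * so this handler always takes the free path. */
static void read_macreg_hdl(struct cmd_obj *pcmd)
{
	if (!pcmd->callback)
		free(pcmd);		/* the object is gone here */
	else
		pcmd->callback(pcmd);
}

/* The dispatcher still hands the (now freed) command back to the caller. */
static struct cmd_obj *cmd_hdl_filter(struct cmd_obj *pcmd)
{
	struct cmd_obj *pcmd_r = NULL;

	switch (pcmd->cmdcode) {
	case 0: /* stand-in for GEN_CMD_CODE(_Read_MACREG) */
		read_macreg_hdl(pcmd);
		pcmd_r = pcmd;		/* dangling pointer */
		break;
	default:
		break;
	}
	return pcmd_r;
}

int main(void)
{
	struct cmd_obj *cmd = malloc(sizeof(*cmd));

	if (!cmd)
		return 1;
	cmd->cmdcode = 0;
	cmd->callback = NULL;

	struct cmd_obj *ret = cmd_hdl_filter(cmd);

	if (ret)
		printf("cmdcode = %d\n", ret->cmdcode);	/* use after free */
	return 0;
}

Since the MACREG callbacks can never be non-NULL, the handlers always free pcmd; the fix therefore drops the handlers and their switch cases so the freed command is never returned.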
Fixes: 2865d42c78a9 ("staging: r8712u: Add the new driver to the mainline kernel") Cc: stable stable@kernel.org Reported-by: Zheng Wang hackerzheng666@gmail.com Signed-off-by: Dan Carpenter dan.carpenter@oracle.com Link: https://lore.kernel.org/r/Yw4ASqkYcUhUfoY2@kili Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Guan Jing guanjing6@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Reviewed-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/staging/rtl8712/rtl8712_cmd.c | 36 --------------------------- 1 file changed, 36 deletions(-)
diff --git a/drivers/staging/rtl8712/rtl8712_cmd.c b/drivers/staging/rtl8712/rtl8712_cmd.c index ff3cb09c57a6..30e965c410ff 100644 --- a/drivers/staging/rtl8712/rtl8712_cmd.c +++ b/drivers/staging/rtl8712/rtl8712_cmd.c @@ -117,34 +117,6 @@ static void r871x_internal_cmd_hdl(struct _adapter *padapter, u8 *pbuf) kfree(pdrvcmd->pbuf); }
-static u8 read_macreg_hdl(struct _adapter *padapter, u8 *pbuf) -{ - void (*pcmd_callback)(struct _adapter *dev, struct cmd_obj *pcmd); - struct cmd_obj *pcmd = (struct cmd_obj *)pbuf; - - /* invoke cmd->callback function */ - pcmd_callback = cmd_callback[pcmd->cmdcode].callback; - if (!pcmd_callback) - r8712_free_cmd_obj(pcmd); - else - pcmd_callback(padapter, pcmd); - return H2C_SUCCESS; -} - -static u8 write_macreg_hdl(struct _adapter *padapter, u8 *pbuf) -{ - void (*pcmd_callback)(struct _adapter *dev, struct cmd_obj *pcmd); - struct cmd_obj *pcmd = (struct cmd_obj *)pbuf; - - /* invoke cmd->callback function */ - pcmd_callback = cmd_callback[pcmd->cmdcode].callback; - if (!pcmd_callback) - r8712_free_cmd_obj(pcmd); - else - pcmd_callback(padapter, pcmd); - return H2C_SUCCESS; -} - static u8 read_bbreg_hdl(struct _adapter *padapter, u8 *pbuf) { struct cmd_obj *pcmd = (struct cmd_obj *)pbuf; @@ -213,14 +185,6 @@ static struct cmd_obj *cmd_hdl_filter(struct _adapter *padapter, pcmd_r = NULL;
switch (pcmd->cmdcode) { - case GEN_CMD_CODE(_Read_MACREG): - read_macreg_hdl(padapter, (u8 *)pcmd); - pcmd_r = pcmd; - break; - case GEN_CMD_CODE(_Write_MACREG): - write_macreg_hdl(padapter, (u8 *)pcmd); - pcmd_r = pcmd; - break; case GEN_CMD_CODE(_Read_BBREG): read_bbreg_hdl(padapter, (u8 *)pcmd); break;