From: Kai Ye <yekai13@huawei.com>

mainline inclusion
from mainline-master
commit 66192b2e3fd8ab97ed518d6c0240e26655a20b4b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I475XT?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/dr...
----------------------------------------------------------------------
The open interface of the SVA prefetching function checks the chip version, but the close interface does not. As a result, the SVA prefetch close sequence was also executed on Kunpeng920, where the registers it touches are critical, eventually leading to abnormal hardware behaviour. Add the missing chip version check to the close interface.
Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index d120ce3e34ed..490db7bccf61 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -364,6 +364,9 @@ static void sec_close_sva_prefetch(struct hisi_qm *qm)
 	u32 val;
 	int ret;
 
+	if (qm->ver < QM_HW_V3)
+		return;
+
 	val = readl_relaxed(qm->io_base + SEC_PREFETCH_CFG);
 	val |= SEC_PREFETCH_DISABLE;
 	writel(val, qm->io_base + SEC_PREFETCH_CFG);
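For context: before this fix the SVA prefetch pair was asymmetric. A condensed sketch of the pattern (illustrative only, register accesses elided):

	/* open path: already gated on chip version */
	static void sec_open_sva_prefetch(struct hisi_qm *qm)
	{
		if (qm->ver < QM_HW_V3)
			return;		/* Kunpeng920 has no SVA prefetch */
		/* ... enable prefetch ... */
	}

	/* close path before the fix: ran unconditionally, so on
	 * Kunpeng920 it rewrote registers that mean something else
	 * there; the hunk above adds the same early return. */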
From: Weili Qian <qianweili@huawei.com>

mainline inclusion
from mainline-master
commit ed5fa39fa8a62fc55c1c4d53b71f3f4f08a90d22
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I475PF?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
The Kunpeng930 zip device supports dynamic clock gating: the algorithm cores are clocked while tasks are executing and gated off when idle. Enable zip dynamic clock gating by writing the corresponding hardware registers.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/zip/zip_main.c | 26 +++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index f8482ceebf2a..d1ca474ea8e3 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -107,6 +107,14 @@
 #define HZIP_DELAY_1_US		1
 #define HZIP_POLL_TIMEOUT_US	1000
 
+/* clock gating */
+#define HZIP_PEH_CFG_AUTO_GATE		0x3011A8
+#define HZIP_PEH_CFG_AUTO_GATE_EN	BIT(0)
+#define HZIP_CORE_GATED_EN		GENMASK(15, 8)
+#define HZIP_CORE_GATED_OOO_EN		BIT(29)
+#define HZIP_CLOCK_GATED_EN		(HZIP_CORE_GATED_EN | \
+					 HZIP_CORE_GATED_OOO_EN)
+
 static const char hisi_zip_name[] = "hisi_zip";
 static struct dentry *hzip_debugfs_root;
@@ -312,6 +320,22 @@ static void hisi_zip_close_sva_prefetch(struct hisi_qm *qm)
 		pci_err(qm->pdev, "failed to close sva prefetch\n");
 }
 
+static void hisi_zip_enable_clock_gate(struct hisi_qm *qm)
+{
+	u32 val;
+
+	if (qm->ver < QM_HW_V3)
+		return;
+
+	val = readl(qm->io_base + HZIP_CLOCK_GATE_CTRL);
+	val |= HZIP_CLOCK_GATED_EN;
+	writel(val, qm->io_base + HZIP_CLOCK_GATE_CTRL);
+
+	val = readl(qm->io_base + HZIP_PEH_CFG_AUTO_GATE);
+	val |= HZIP_PEH_CFG_AUTO_GATE_EN;
+	writel(val, qm->io_base + HZIP_PEH_CFG_AUTO_GATE);
+}
+
 static int hisi_zip_set_user_domain_and_cache(struct hisi_qm *qm)
 {
 	void __iomem *base = qm->io_base;
@@ -359,6 +383,8 @@ static int hisi_zip_set_user_domain_and_cache(struct hisi_qm *qm)
 	       CQC_CACHE_WB_ENABLE | FIELD_PREP(SQC_CACHE_WB_THRD, 1) |
 	       FIELD_PREP(CQC_CACHE_WB_THRD, 1), base + QM_CACHE_CTL);
 
+	hisi_zip_enable_clock_gate(qm);
+
 	return 0;
 }
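All three clock-gating patches in this series use the same read-modify-write idiom on 32-bit control registers; a generic sketch (qm_set_bits is a hypothetical helper, not part of the patch):

	/* Hypothetical helper: OR bits into a control register. */
	static void qm_set_bits(struct hisi_qm *qm, u32 offset, u32 bits)
	{
		u32 val = readl(qm->io_base + offset);

		writel(val | bits, qm->io_base + offset);
	}

	/* hisi_zip_enable_clock_gate() is then equivalent to: */
	qm_set_bits(qm, HZIP_CLOCK_GATE_CTRL, HZIP_CLOCK_GATED_EN);
	qm_set_bits(qm, HZIP_PEH_CFG_AUTO_GATE, HZIP_PEH_CFG_AUTO_GATE_EN);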
From: Weili Qian <qianweili@huawei.com>

mainline inclusion
from mainline-master
commit 3d845d497b23547150fe7f9b3261ead9f4295686
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I475PF?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
The Kunpeng930 sec device supports dynamic clock gating: the algorithm cores are clocked while tasks are executing and gated off when idle. Enable sec dynamic clock gating by writing the corresponding hardware registers.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_main.c | 46 +++++++++++++++++++++---
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 490db7bccf61..db4dbcf0492a 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -57,10 +57,16 @@
 #define SEC_MEM_START_INIT_REG	0x301100
 #define SEC_MEM_INIT_DONE_REG	0x301104
 
+/* clock gating */
 #define SEC_CONTROL_REG		0x301200
-#define SEC_TRNG_EN_SHIFT	8
+#define SEC_DYNAMIC_GATE_REG	0x30121c
+#define SEC_CORE_AUTO_GATE	0x30212c
+#define SEC_DYNAMIC_GATE_EN	0x7bff
+#define SEC_CORE_AUTO_GATE_EN	GENMASK(3, 0)
 #define SEC_CLK_GATE_ENABLE	BIT(3)
 #define SEC_CLK_GATE_DISABLE	(~BIT(3))
+
+#define SEC_TRNG_EN_SHIFT	8
 #define SEC_AXI_SHUTDOWN_ENABLE	BIT(12)
 #define SEC_AXI_SHUTDOWN_DISABLE	0xFFFFEFFF
@@ -378,15 +384,43 @@ static void sec_close_sva_prefetch(struct hisi_qm *qm)
 		pci_err(qm->pdev, "failed to close sva prefetch\n");
 }
 
+static void sec_enable_clock_gate(struct hisi_qm *qm)
+{
+	u32 val;
+
+	if (qm->ver < QM_HW_V3)
+		return;
+
+	val = readl_relaxed(qm->io_base + SEC_CONTROL_REG);
+	val |= SEC_CLK_GATE_ENABLE;
+	writel_relaxed(val, qm->io_base + SEC_CONTROL_REG);
+
+	val = readl(qm->io_base + SEC_DYNAMIC_GATE_REG);
+	val |= SEC_DYNAMIC_GATE_EN;
+	writel(val, qm->io_base + SEC_DYNAMIC_GATE_REG);
+
+	val = readl(qm->io_base + SEC_CORE_AUTO_GATE);
+	val |= SEC_CORE_AUTO_GATE_EN;
+	writel(val, qm->io_base + SEC_CORE_AUTO_GATE);
+}
+
+static void sec_disable_clock_gate(struct hisi_qm *qm)
+{
+	u32 val;
+
+	/* Kunpeng920 needs to close clock gating */
+	val = readl_relaxed(qm->io_base + SEC_CONTROL_REG);
+	val &= SEC_CLK_GATE_DISABLE;
+	writel_relaxed(val, qm->io_base + SEC_CONTROL_REG);
+}
+
 static int sec_engine_init(struct hisi_qm *qm)
 {
 	int ret;
 	u32 reg;
 
-	/* disable clock gate control */
-	reg = readl_relaxed(qm->io_base + SEC_CONTROL_REG);
-	reg &= SEC_CLK_GATE_DISABLE;
-	writel_relaxed(reg, qm->io_base + SEC_CONTROL_REG);
+	/* disable clock gate control before mem init */
+	sec_disable_clock_gate(qm);
 
 	writel_relaxed(0x1, qm->io_base + SEC_MEM_START_INIT_REG);
@@ -433,6 +467,8 @@ static int sec_engine_init(struct hisi_qm *qm)
 	reg |= sec_get_endian(qm);
 	writel_relaxed(reg, qm->io_base + SEC_CONTROL_REG);
 
+	sec_enable_clock_gate(qm);
+
 	return 0;
 }
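Note the ordering in sec_engine_init(): gating is switched off before memory initialization and back on only once the engine is fully configured. In outline (condensed from the hunks above, not a complete function):

	static int sec_engine_init(struct hisi_qm *qm)
	{
		sec_disable_clock_gate(qm);	/* keep cores clocked for mem init */

		writel_relaxed(0x1, qm->io_base + SEC_MEM_START_INIT_REG);
		/* ... poll init done, configure BD checks and endianness ... */

		sec_enable_clock_gate(qm);	/* last step: allow auto gating */
		return 0;
	}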
From: Weili Qian <qianweili@huawei.com>

mainline inclusion
from mainline-master
commit ea5202dff79ce23e1a9fee3e1b2f09e28b77ba3a
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I475PF?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
The Kunpeng930 hpre device supports dynamic clock gating: the algorithm cores are clocked while tasks are executing and gated off when idle. Enable hpre dynamic clock gating by writing the corresponding hardware registers.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c | 63 +++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 8b0640fb04be..6a5de3073de9 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -81,6 +81,16 @@
 #define HPRE_PREFETCH_DISABLE		BIT(30)
 #define HPRE_SVA_DISABLE_READY		(BIT(4) | BIT(8))
 
+/* clock gate */
+#define HPRE_CLKGATE_CTL		0x301a10
+#define HPRE_PEH_CFG_AUTO_GATE		0x301a2c
+#define HPRE_CLUSTER_DYN_CTL		0x302010
+#define HPRE_CORE_SHB_CFG		0x302088
+#define HPRE_CLKGATE_CTL_EN		BIT(0)
+#define HPRE_PEH_CFG_AUTO_GATE_EN	BIT(0)
+#define HPRE_CLUSTER_DYN_CTL_EN		BIT(0)
+#define HPRE_CORE_GATE_EN		(BIT(30) | BIT(31))
+
 #define HPRE_AM_OOO_SHUTDOWN_ENB	0x301044
 #define HPRE_AM_OOO_SHUTDOWN_ENABLE	BIT(0)
 #define HPRE_WR_MSI_PORT		BIT(2)
@@ -417,12 +427,63 @@ static void hpre_close_sva_prefetch(struct hisi_qm *qm)
 		pci_err(qm->pdev, "failed to close sva prefetch\n");
 }
 
+static void hpre_enable_clock_gate(struct hisi_qm *qm)
+{
+	u32 val;
+
+	if (qm->ver < QM_HW_V3)
+		return;
+
+	val = readl(qm->io_base + HPRE_CLKGATE_CTL);
+	val |= HPRE_CLKGATE_CTL_EN;
+	writel(val, qm->io_base + HPRE_CLKGATE_CTL);
+
+	val = readl(qm->io_base + HPRE_PEH_CFG_AUTO_GATE);
+	val |= HPRE_PEH_CFG_AUTO_GATE_EN;
+	writel(val, qm->io_base + HPRE_PEH_CFG_AUTO_GATE);
+
+	val = readl(qm->io_base + HPRE_CLUSTER_DYN_CTL);
+	val |= HPRE_CLUSTER_DYN_CTL_EN;
+	writel(val, qm->io_base + HPRE_CLUSTER_DYN_CTL);
+
+	val = readl_relaxed(qm->io_base + HPRE_CORE_SHB_CFG);
+	val |= HPRE_CORE_GATE_EN;
+	writel(val, qm->io_base + HPRE_CORE_SHB_CFG);
+}
+
+static void hpre_disable_clock_gate(struct hisi_qm *qm)
+{
+	u32 val;
+
+	if (qm->ver < QM_HW_V3)
+		return;
+
+	val = readl(qm->io_base + HPRE_CLKGATE_CTL);
+	val &= ~HPRE_CLKGATE_CTL_EN;
+	writel(val, qm->io_base + HPRE_CLKGATE_CTL);
+
+	val = readl(qm->io_base + HPRE_PEH_CFG_AUTO_GATE);
+	val &= ~HPRE_PEH_CFG_AUTO_GATE_EN;
+	writel(val, qm->io_base + HPRE_PEH_CFG_AUTO_GATE);
+
+	val = readl(qm->io_base + HPRE_CLUSTER_DYN_CTL);
+	val &= ~HPRE_CLUSTER_DYN_CTL_EN;
+	writel(val, qm->io_base + HPRE_CLUSTER_DYN_CTL);
+
+	val = readl_relaxed(qm->io_base + HPRE_CORE_SHB_CFG);
+	val &= ~HPRE_CORE_GATE_EN;
+	writel(val, qm->io_base + HPRE_CORE_SHB_CFG);
+}
+
 static int hpre_set_user_domain_and_cache(struct hisi_qm *qm)
 {
 	struct device *dev = &qm->pdev->dev;
 	u32 val;
 	int ret;
 
+	/* disable dynamic clock gate before sram init */
+	hpre_disable_clock_gate(qm);
+
 	writel(HPRE_QM_USR_CFG_MASK, qm->io_base + QM_ARUSER_M_CFG_ENABLE);
 	writel(HPRE_QM_USR_CFG_MASK, qm->io_base + QM_AWUSER_M_CFG_ENABLE);
 	writel_relaxed(HPRE_QM_AXI_CFG_MASK, qm->io_base + QM_AXI_M_CFG);
@@ -473,6 +534,8 @@ static int hpre_set_user_domain_and_cache(struct hisi_qm *qm)
 	/* Config data buffer pasid needed by Kunpeng 920 */
 	hpre_config_pasid(qm);
 
+	hpre_enable_clock_gate(qm);
+
 	return ret;
 }
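hpre_disable_clock_gate() is the exact mirror of the enable helper: it clears the same bits the enable path sets, in the same register order. Schematically, each pair reduces to:

	val = readl(qm->io_base + reg);	/* same register as the enable path */
	val &= ~enable_bits;		/* enable did: val |= enable_bits */
	writel(val, qm->io_base + reg);

where (reg, enable_bits) ranges over HPRE_CLKGATE_CTL/HPRE_CLKGATE_CTL_EN, HPRE_PEH_CFG_AUTO_GATE/HPRE_PEH_CFG_AUTO_GATE_EN, HPRE_CLUSTER_DYN_CTL/HPRE_CLUSTER_DYN_CTL_EN and HPRE_CORE_SHB_CFG/HPRE_CORE_GATE_EN.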
From: Kai Ye <yekai13@huawei.com>

mainline inclusion
from mainline-master
commit 90367a027a22c3a9ca8b8bac15df34d9e859fc11
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I475XT?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
The algs registration process now performs a check before registering (algorithms are only registered when the device has enough queue pairs, i.e. qm->qp_num >= ctx_q_num). Add the same check to the abnormal exit path of probe, so that unregistration is only attempted when registration actually happened. See the sketch after the diff.
Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index db4dbcf0492a..45a1ddd31dc3 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -1020,7 +1020,8 @@ static int sec_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_alg_unregister:
-	hisi_qm_alg_unregister(qm, &sec_devices);
+	if (qm->qp_num >= ctx_q_num)
+		hisi_qm_alg_unregister(qm, &sec_devices);
 err_qm_stop:
 	sec_debugfs_exit(qm);
 	hisi_qm_stop(qm, QM_NORMAL);
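The fix keeps the error path symmetric with the registration site; a sketch of the intended pairing in sec_probe() (assuming, as the fix implies, that registration is skipped when qm->qp_num < ctx_q_num):

	if (qm->qp_num >= ctx_q_num) {
		ret = hisi_qm_alg_register(qm, &sec_devices);
		if (ret)
			goto err_qm_stop;
	}
	/* ... a later step fails ... */
	err_alg_unregister:
		if (qm->qp_num >= ctx_q_num)	/* only undo what was done */
			hisi_qm_alg_unregister(qm, &sec_devices);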
From: Kai Ye <yekai13@huawei.com>

mainline inclusion
from mainline-master
commit a52626106d6f7edf3d106c065e13a0313cfeb82f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I475XT?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
When the hardware endian configuration is wrong, the SEC engine becomes faulty and reports empty messages, which breaks its normal function, and the existing software-side query cannot restore the faulty device. The endianness needs to be configured according to the system properties rather than read back from the engine, so fix it.
Signed-off-by: Kai Ye <yekai13@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/sec2/sec.h      |  5 ----
 drivers/crypto/hisilicon/sec2/sec_main.c | 31 +++++++-----------------
 2 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/drivers/crypto/hisilicon/sec2/sec.h b/drivers/crypto/hisilicon/sec2/sec.h
index 018415b9840a..d97cf02b1df7 100644
--- a/drivers/crypto/hisilicon/sec2/sec.h
+++ b/drivers/crypto/hisilicon/sec2/sec.h
@@ -157,11 +157,6 @@ struct sec_ctx {
 	struct device *dev;
 };
 
-enum sec_endian {
-	SEC_LE = 0,
-	SEC_32BE,
-	SEC_64BE
-};
-
 enum sec_debug_file_index {
 	SEC_CLEAR_ENABLE,
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 45a1ddd31dc3..2250c81d6158 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -318,31 +318,20 @@ static const struct pci_device_id sec_dev_ids[] = {
 };
 MODULE_DEVICE_TABLE(pci, sec_dev_ids);
 
-static u8 sec_get_endian(struct hisi_qm *qm)
+static void sec_set_endian(struct hisi_qm *qm)
 {
 	u32 reg;
 
-	/*
-	 * As for VF, it is a wrong way to get endian setting by
-	 * reading a register of the engine
-	 */
-	if (qm->pdev->is_virtfn) {
-		dev_err_ratelimited(&qm->pdev->dev,
-				    "cannot access a register in VF!\n");
-		return SEC_LE;
-	}
 	reg = readl_relaxed(qm->io_base + SEC_CONTROL_REG);
-	/* BD little endian mode */
-	if (!(reg & BIT(0)))
-		return SEC_LE;
+	reg &= ~(BIT(1) | BIT(0));
+	if (!IS_ENABLED(CONFIG_64BIT))
+		reg |= BIT(1);
 
-	/* BD 32-bits big endian mode */
-	else if (!(reg & BIT(1)))
-		return SEC_32BE;
 
-	/* BD 64-bits big endian mode */
-	else
-		return SEC_64BE;
+	if (!IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
+		reg |= BIT(0);
+
+	writel_relaxed(reg, qm->io_base + SEC_CONTROL_REG);
 }
 
 static void sec_open_sva_prefetch(struct hisi_qm *qm)
@@ -463,9 +452,7 @@ static int sec_engine_init(struct hisi_qm *qm)
 		       qm->io_base + SEC_BD_ERR_CHK_EN_REG3);
 
 	/* config endian */
-	reg = readl_relaxed(qm->io_base + SEC_CONTROL_REG);
-	reg |= sec_get_endian(qm);
-	writel_relaxed(reg, qm->io_base + SEC_CONTROL_REG);
+	sec_set_endian(qm);
 
 	sec_enable_clock_gate(qm);
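The two control bits are now derived from the kernel configuration instead of being read back from hardware. From the new code, the value written to SEC_CONTROL_REG[1:0] is:

	CONFIG_64BIT   CONFIG_CPU_LITTLE_ENDIAN   ->   BIT(1)  BIT(0)
	yes            yes                        ->     0       0
	no             yes                       ->     1       0
	yes            no                        ->     0       1
	no             no                        ->     1       1

i.e. BIT(1) flags a 32-bit kernel and BIT(0) a big-endian kernel.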
From: Weili Qian <qianweili@huawei.com>

mainline inclusion
from mainline-master
commit 1295292d65b729fc8b234fcdf884d79ff5a63ca1
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I475PF?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
The accelerator devices support runtime PM; if a register is read while the device is suspended, an exception occurs. Therefore, use 'debugfs_create_file' instead of 'debugfs_create_regset32' to create the debugfs files, so that the driver can check the device status before reading the registers.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c | 25 +++++++++-
 drivers/crypto/hisilicon/qm.c             | 58 ++++++++++++++---------
 drivers/crypto/hisilicon/qm.h             |  2 +
 drivers/crypto/hisilicon/sec2/sec_main.c  | 11 ++++-
 drivers/crypto/hisilicon/zip/zip_main.c   | 12 ++++-
 5 files changed, 81 insertions(+), 27 deletions(-)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 6a5de3073de9..b216238c1bb3 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -763,6 +763,24 @@ static int hpre_debugfs_atomic64_set(void *data, u64 val)
 DEFINE_DEBUGFS_ATTRIBUTE(hpre_atomic64_ops, hpre_debugfs_atomic64_get,
 			 hpre_debugfs_atomic64_set, "%llu\n");
 
+static int hpre_com_regs_show(struct seq_file *s, void *unused)
+{
+	hisi_qm_regs_dump(s, s->private);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hpre_com_regs);
+
+static int hpre_cluster_regs_show(struct seq_file *s, void *unused)
+{
+	hisi_qm_regs_dump(s, s->private);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hpre_cluster_regs);
+
 static int hpre_create_debugfs_file(struct hisi_qm *qm, struct dentry *dir,
 				    enum hpre_ctrl_dbgfs_file type, int indx)
 {
@@ -801,7 +819,9 @@ static int hpre_pf_comm_regs_debugfs_init(struct hisi_qm *qm)
 	regset->nregs = ARRAY_SIZE(hpre_com_dfx_regs);
 	regset->base = qm->io_base;
 
-	debugfs_create_regset32("regs", 0444, qm->debug.debug_root, regset);
+	debugfs_create_file("regs", 0444, qm->debug.debug_root,
+			    regset, &hpre_com_regs_fops);
+
 	return 0;
 }
@@ -828,7 +848,8 @@ static int hpre_cluster_debugfs_init(struct hisi_qm *qm)
 		regset->nregs = ARRAY_SIZE(hpre_cluster_dfx_regs);
 		regset->base = qm->io_base + hpre_cluster_offsets[i];
 
-		debugfs_create_regset32("regs", 0444, tmp_d, regset);
+		debugfs_create_file("regs", 0444, tmp_d, regset,
+				    &hpre_cluster_regs_fops);
 		ret = hpre_create_debugfs_file(qm, tmp_d, HPRE_CLUSTER_CTRL,
 					       i + HPRE_CLUSTER_CTRL);
 		if (ret)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 1d67f94a1d56..e417cd05f612 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -4,7 +4,6 @@
 #include <linux/acpi.h>
 #include <linux/aer.h>
 #include <linux/bitmap.h>
-#include <linux/debugfs.h>
 #include <linux/dma-mapping.h>
 #include <linux/idr.h>
 #include <linux/io.h>
@@ -1337,13 +1336,8 @@ static const struct file_operations qm_debug_fops = {
 	.write = qm_debug_write,
 };
 
-struct qm_dfx_registers {
-	char *reg_name;
-	u64 reg_offset;
-};
-
 #define CNT_CYC_REGS_NUM		10
-static struct qm_dfx_registers qm_dfx_regs[] = {
+static const struct debugfs_reg32 qm_dfx_regs[] = {
 	/* XXX_CNT are reading clear register */
 	{"QM_ECC_1BIT_CNT               ", 0x104000ull},
 	{"QM_ECC_MBIT_CNT               ", 0x104008ull},
@@ -1369,31 +1363,49 @@ static struct qm_dfx_registers qm_dfx_regs[] = {
 	{"QM_DFX_FF_ST5                 ", 0x1040dcull},
 	{"QM_DFX_FF_ST6                 ", 0x1040e0ull},
 	{"QM_IN_IDLE_ST                 ", 0x1040e4ull},
-	{ NULL, 0}
 };
 
-static struct qm_dfx_registers qm_vf_dfx_regs[] = {
+static const struct debugfs_reg32 qm_vf_dfx_regs[] = {
 	{"QM_DFX_FUNS_ACTIVE_ST         ", 0x200ull},
-	{ NULL, 0}
 };
 
-static int qm_regs_show(struct seq_file *s, void *unused)
+/**
+ * hisi_qm_regs_dump() - Dump registers' value.
+ * @s: debugfs file handle.
+ * @regset: accelerator registers information.
+ *
+ * Dump accelerator registers.
+ */
+void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset)
 {
-	struct hisi_qm *qm = s->private;
-	struct qm_dfx_registers *regs;
+	const struct debugfs_reg32 *regs = regset->regs;
+	int regs_len = regset->nregs;
 	u32 val;
+	int i;
 
-	if (qm->fun_type == QM_HW_PF)
-		regs = qm_dfx_regs;
-	else
-		regs = qm_vf_dfx_regs;
+	for (i = 0; i < regs_len; i++) {
+		val = readl(regset->base + regs[i].offset);
+		seq_printf(s, "%s= 0x%08x\n", regs[i].name, val);
+	}
+}
+EXPORT_SYMBOL_GPL(hisi_qm_regs_dump);
 
-	while (regs->reg_name) {
-		val = readl(qm->io_base + regs->reg_offset);
-		seq_printf(s, "%s= 0x%08x\n", regs->reg_name, val);
-		regs++;
+static int qm_regs_show(struct seq_file *s, void *unused)
+{
+	struct hisi_qm *qm = s->private;
+	struct debugfs_regset32 regset;
+
+	if (qm->fun_type == QM_HW_PF) {
+		regset.regs = qm_dfx_regs;
+		regset.nregs = ARRAY_SIZE(qm_dfx_regs);
+	} else {
+		regset.regs = qm_vf_dfx_regs;
+		regset.nregs = ARRAY_SIZE(qm_vf_dfx_regs);
 	}
 
+	regset.base = qm->io_base;
+	hisi_qm_regs_dump(s, &regset);
+
 	return 0;
 }
@@ -4245,7 +4257,7 @@ EXPORT_SYMBOL_GPL(hisi_qm_debug_init);
  */
 void hisi_qm_debug_regs_clear(struct hisi_qm *qm)
 {
-	struct qm_dfx_registers *regs;
+	const struct debugfs_reg32 *regs;
 	int i;
 
 	/* clear current_qm */
@@ -4264,7 +4276,7 @@ void hisi_qm_debug_regs_clear(struct hisi_qm *qm)
 
 	regs = qm_dfx_regs;
 	for (i = 0; i < CNT_CYC_REGS_NUM; i++) {
-		readl(qm->io_base + regs->reg_offset);
+		readl(qm->io_base + regs->offset);
 		regs++;
 	}
diff --git a/drivers/crypto/hisilicon/qm.h b/drivers/crypto/hisilicon/qm.h
index 035eaf8c442d..0e5df1c8b3f5 100644
--- a/drivers/crypto/hisilicon/qm.h
+++ b/drivers/crypto/hisilicon/qm.h
@@ -4,6 +4,7 @@
 #define HISI_ACC_QM_H
 
 #include <linux/bitfield.h>
+#include <linux/debugfs.h>
 #include <linux/iopoll.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -430,4 +431,5 @@ void hisi_qm_dev_shutdown(struct pci_dev *pdev);
 void hisi_qm_wait_task_finish(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
 int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
 void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
+void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
 #endif
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 2250c81d6158..55ef3ae70fdb 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -676,6 +676,15 @@ static int sec_debugfs_atomic64_set(void *data, u64 val)
 DEFINE_DEBUGFS_ATTRIBUTE(sec_atomic64_ops, sec_debugfs_atomic64_get,
 			 sec_debugfs_atomic64_set, "%lld\n");
 
+static int sec_regs_show(struct seq_file *s, void *unused)
+{
+	hisi_qm_regs_dump(s, s->private);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(sec_regs);
+
 static int sec_core_debug_init(struct hisi_qm *qm)
 {
 	struct sec_dev *sec = container_of(qm, struct sec_dev, qm);
@@ -696,7 +705,7 @@ static int sec_core_debug_init(struct hisi_qm *qm)
 	regset->base = qm->io_base;
 
 	if (qm->pdev->device == SEC_PF_PCI_DEVICE_ID)
-		debugfs_create_regset32("regs", 0444, tmp_d, regset);
+		debugfs_create_file("regs", 0444, tmp_d, regset, &sec_regs_fops);
 
 	for (i = 0; i < ARRAY_SIZE(sec_dfx_labels); i++) {
 		atomic64_t *data = (atomic64_t *)((uintptr_t)dfx +
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index d1ca474ea8e3..4438188adbb7 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -564,6 +564,15 @@ static int zip_debugfs_atomic64_get(void *data, u64 *val)
 DEFINE_DEBUGFS_ATTRIBUTE(zip_atomic64_ops, zip_debugfs_atomic64_get,
 			 zip_debugfs_atomic64_set, "%llu\n");
 
+static int hisi_zip_regs_show(struct seq_file *s, void *unused)
+{
+	hisi_qm_regs_dump(s, s->private);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hisi_zip_regs);
+
 static int hisi_zip_core_debug_init(struct hisi_qm *qm)
 {
 	struct device *dev = &qm->pdev->dev;
@@ -588,7 +597,8 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm)
 		regset->base = qm->io_base + core_offsets[i];
 
 		tmp_d = debugfs_create_dir(buf, qm->debug.debug_root);
-		debugfs_create_regset32("regs", 0444, tmp_d, regset);
+		debugfs_create_file("regs", 0444, tmp_d, regset,
+				    &hisi_zip_regs_fops);
 	}
 
 	return 0;
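Each driver wires the shared dump helper up through DEFINE_SHOW_ATTRIBUTE; a minimal sketch of the plumbing (names per the sec hunk above):

	static int sec_regs_show(struct seq_file *s, void *unused)
	{
		/* s->private is the data pointer given to debugfs_create_file() */
		hisi_qm_regs_dump(s, s->private);

		return 0;
	}
	DEFINE_SHOW_ATTRIBUTE(sec_regs);	/* generates sec_regs_fops */

	debugfs_create_file("regs", 0444, tmp_d, regset, &sec_regs_fops);

This keeps struct debugfs_regset32 as the data layout while routing every read through a driver-owned show function, which later patches in this series hook for runtime-PM checks.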
From: Weili Qian <qianweili@huawei.com>

mainline inclusion
from mainline-master
commit d7ea53395b723b1a87b9c0afb3301cc33fbe35e6
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I475PF?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
Accelerator devices support runtime PM to reduce power consumption. This patch adds the runtime PM suspend/resume callbacks to the accelerator devices.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 118 ++++++++++++++++++++++++++++++++++
 drivers/crypto/hisilicon/qm.h |   2 +
 2 files changed, 120 insertions(+)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index e417cd05f612..dbe162a0bd02 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -5692,6 +5692,124 @@ int hisi_qm_init(struct hisi_qm *qm)
 }
 EXPORT_SYMBOL_GPL(hisi_qm_init);
 
+static int qm_prepare_for_suspend(struct hisi_qm *qm)
+{
+	struct pci_dev *pdev = qm->pdev;
+	int ret;
+	u32 val;
+
+	ret = qm->ops->set_msi(qm, false);
+	if (ret) {
+		pci_err(pdev, "failed to disable MSI before suspending!\n");
+		return ret;
+	}
+
+	/* shutdown OOO register */
+	writel(ACC_MASTER_GLOBAL_CTRL_SHUTDOWN,
+	       qm->io_base + ACC_MASTER_GLOBAL_CTRL);
+
+	ret = readl_relaxed_poll_timeout(qm->io_base + ACC_MASTER_TRANS_RETURN,
+					 val,
+					 (val == ACC_MASTER_TRANS_RETURN_RW),
+					 POLL_PERIOD, POLL_TIMEOUT);
+	if (ret) {
+		pci_emerg(pdev, "Bus lock! Please reset system.\n");
+		return ret;
+	}
+
+	ret = qm_set_pf_mse(qm, false);
+	if (ret)
+		pci_err(pdev, "failed to disable MSE before suspending!\n");
+
+	return ret;
+}
+
+static int qm_rebuild_for_resume(struct hisi_qm *qm)
+{
+	struct pci_dev *pdev = qm->pdev;
+	int ret;
+
+	ret = qm_set_pf_mse(qm, true);
+	if (ret) {
+		pci_err(pdev, "failed to enable MSE after resuming!\n");
+		return ret;
+	}
+
+	ret = qm->ops->set_msi(qm, true);
+	if (ret) {
+		pci_err(pdev, "failed to enable MSI after resuming!\n");
+		return ret;
+	}
+
+	ret = qm_dev_hw_init(qm);
+	if (ret) {
+		pci_err(pdev, "failed to init device after resuming\n");
+		return ret;
+	}
+
+	qm_cmd_init(qm);
+	hisi_qm_dev_err_init(qm);
+
+	return 0;
+}
+
+/**
+ * hisi_qm_suspend() - Runtime suspend of given device.
+ * @dev: device to suspend.
+ *
+ * Function that suspends the device.
+ */
+int hisi_qm_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct hisi_qm *qm = pci_get_drvdata(pdev);
+	int ret;
+
+	pci_info(pdev, "entering suspended state\n");
+
+	ret = hisi_qm_stop(qm, QM_NORMAL);
+	if (ret) {
+		pci_err(pdev, "failed to stop qm(%d)\n", ret);
+		return ret;
+	}
+
+	ret = qm_prepare_for_suspend(qm);
+	if (ret)
+		pci_err(pdev, "failed to prepare suspended(%d)\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hisi_qm_suspend);
+
+/**
+ * hisi_qm_resume() - Runtime resume of given device.
+ * @dev: device to resume.
+ *
+ * Function that resumes the device.
+ */
+int hisi_qm_resume(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct hisi_qm *qm = pci_get_drvdata(pdev);
+	int ret;
+
+	pci_info(pdev, "resuming from suspend state\n");
+
+	ret = qm_rebuild_for_resume(qm);
+	if (ret) {
+		pci_err(pdev, "failed to rebuild resume(%d)\n", ret);
+		return ret;
+	}
+
+	ret = hisi_qm_start(qm);
+	if (ret)
+		pci_err(pdev, "failed to start qm(%d)\n", ret);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hisi_qm_resume);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Zhou Wang <wangzhou1@hisilicon.com>");
 MODULE_DESCRIPTION("HiSilicon Accelerator queue manager driver");
diff --git a/drivers/crypto/hisilicon/qm.h b/drivers/crypto/hisilicon/qm.h
index 0e5df1c8b3f5..59e16464d03d 100644
--- a/drivers/crypto/hisilicon/qm.h
+++ b/drivers/crypto/hisilicon/qm.h
@@ -431,5 +431,7 @@ void hisi_qm_dev_shutdown(struct pci_dev *pdev);
 void hisi_qm_wait_task_finish(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
 int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
 void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
+int hisi_qm_resume(struct device *dev);
+int hisi_qm_suspend(struct device *dev);
 void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
 #endif
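The two callbacks bracket the hardware state symmetrically: suspend stops the QM, disables MSI, shuts down the out-of-order engine and clears MSE; resume re-enables MSE and MSI, re-initializes the device and restarts the QM. A driver opts in by pointing its PCI driver at these callbacks, which the next patch does for hpre/sec/zip:

	static const struct dev_pm_ops xxx_pm_ops = {
		SET_RUNTIME_PM_OPS(hisi_qm_suspend, hisi_qm_resume, NULL)
	};

	static struct pci_driver xxx_pci_driver = {
		/* ... */
		.driver.pm = &xxx_pm_ops,
	};

(xxx_ is a placeholder; the real instances below are hpre_pm_ops, sec_pm_ops and hisi_zip_pm_ops.)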
From: Weili Qian <qianweili@huawei.com>

mainline inclusion
from mainline-master
commit 607c191b371d72952c11dc209e583303a4515f14
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I475PF?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
Add runtime PM support for Kunpeng930 accelerator device.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c |  35 +++++++++++++++++++-
 drivers/crypto/hisilicon/qm.c             | 218 ++++++++++++++++++++--
 drivers/crypto/hisilicon/qm.h             |   4 +
 drivers/crypto/hisilicon/sec2/sec_main.c  |  36 +++-
 drivers/crypto/hisilicon/zip/zip_main.c   |  34 +++-
 5 files changed, 297 insertions(+), 30 deletions(-)

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index b216238c1bb3..65a641396c07 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/pm_runtime.h>
 #include <linux/topology.h>
 #include <linux/uacce.h>
 #include "hpre.h"
@@ -658,10 +659,15 @@ static ssize_t hpre_ctrl_debug_read(struct file *filp, char __user *buf,
 				    size_t count, loff_t *pos)
 {
 	struct hpre_debugfs_file *file = filp->private_data;
+	struct hisi_qm *qm = hpre_file_to_qm(file);
 	char tbuf[HPRE_DBGFS_VAL_MAX_LEN];
 	u32 val;
 	int ret;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	spin_lock_irq(&file->lock);
 	switch (file->type) {
 	case HPRE_CLEAR_ENABLE:
@@ -671,18 +677,25 @@ static ssize_t hpre_ctrl_debug_read(struct file *filp, char __user *buf,
 		val = hpre_cluster_inqry_read(file);
 		break;
 	default:
-		spin_unlock_irq(&file->lock);
-		return -EINVAL;
+		goto err_input;
 	}
 	spin_unlock_irq(&file->lock);
+
+	hisi_qm_put_dfx_access(qm);
 	ret = snprintf(tbuf, HPRE_DBGFS_VAL_MAX_LEN, "%u\n", val);
 	return simple_read_from_buffer(buf, count, pos, tbuf, ret);
+
+err_input:
+	spin_unlock_irq(&file->lock);
+	hisi_qm_put_dfx_access(qm);
+	return -EINVAL;
 }
 
 static ssize_t hpre_ctrl_debug_write(struct file *filp, const char __user *buf,
 				     size_t count, loff_t *pos)
 {
 	struct hpre_debugfs_file *file = filp->private_data;
+	struct hisi_qm *qm = hpre_file_to_qm(file);
 	char tbuf[HPRE_DBGFS_VAL_MAX_LEN];
 	unsigned long val;
 	int len, ret;
@@ -702,6 +715,10 @@ static ssize_t hpre_ctrl_debug_write(struct file *filp, const char __user *buf,
 	if (kstrtoul(tbuf, 0, &val))
 		return -EFAULT;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	spin_lock_irq(&file->lock);
 	switch (file->type) {
 	case HPRE_CLEAR_ENABLE:
@@ -718,12 +735,12 @@ static ssize_t hpre_ctrl_debug_write(struct file *filp, const char __user *buf,
 		ret = -EINVAL;
 		goto err_input;
 	}
-	spin_unlock_irq(&file->lock);
 
-	return count;
+	ret = count;
 
 err_input:
 	spin_unlock_irq(&file->lock);
+	hisi_qm_put_dfx_access(qm);
 	return ret;
 }
@@ -818,6 +835,7 @@ static int hpre_pf_comm_regs_debugfs_init(struct hisi_qm *qm)
 	regset->regs = hpre_com_dfx_regs;
 	regset->nregs = ARRAY_SIZE(hpre_com_dfx_regs);
 	regset->base = qm->io_base;
+	regset->dev = dev;
 
 	debugfs_create_file("regs", 0444, qm->debug.debug_root,
 			    regset, &hpre_com_regs_fops);
@@ -847,6 +865,7 @@ static int hpre_cluster_debugfs_init(struct hisi_qm *qm)
 		regset->regs = hpre_cluster_dfx_regs;
 		regset->nregs = ARRAY_SIZE(hpre_cluster_dfx_regs);
 		regset->base = qm->io_base + hpre_cluster_offsets[i];
+		regset->dev = dev;
 
 		debugfs_create_file("regs", 0444, tmp_d, regset,
 				    &hpre_cluster_regs_fops);
@@ -1101,6 +1120,8 @@ static int hpre_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_with_alg_register;
 	}
 
+	hisi_qm_pm_init(qm);
+
 	return 0;
 
 err_with_alg_register:
@@ -1124,6 +1145,7 @@ static void hpre_remove(struct pci_dev *pdev)
 	struct hisi_qm *qm = pci_get_drvdata(pdev);
 	int ret;
 
+	hisi_qm_pm_uninit(qm);
 	hisi_qm_wait_task_finish(qm, &hpre_devices);
 	hisi_qm_alg_unregister(qm, &hpre_devices);
 	if (qm->fun_type == QM_HW_PF && qm->vfs_num) {
@@ -1146,6 +1168,10 @@ static void hpre_remove(struct pci_dev *pdev)
 	hisi_qm_uninit(qm);
 }
 
+static const struct dev_pm_ops hpre_pm_ops = {
+	SET_RUNTIME_PM_OPS(hisi_qm_suspend, hisi_qm_resume, NULL)
+};
+
 static const struct pci_error_handlers hpre_err_handler = {
 	.error_detected		= hisi_qm_dev_err_detected,
 	.slot_reset		= hisi_qm_dev_slot_reset,
@@ -1162,6 +1188,7 @@ static struct pci_driver hpre_pci_driver = {
 				  hisi_qm_sriov_configure : NULL,
 	.err_handler		= &hpre_err_handler,
 	.shutdown		= hisi_qm_dev_shutdown,
+	.driver.pm		= &hpre_pm_ops,
 };
 
 static void hpre_register_debugfs(void)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index dbe162a0bd02..74740035380d 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -9,6 +9,7 @@
 #include <linux/io.h>
 #include <linux/irqreturn.h>
 #include <linux/log2.h>
+#include <linux/pm_runtime.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/uacce.h>
@@ -269,6 +270,8 @@
 #define QM_QOS_MAX_CIR_S		11
 #define QM_QOS_VAL_MAX_LEN		32
 
+#define QM_AUTOSUSPEND_DELAY		3000
+
 #define QM_MK_CQC_DW3_V1(hop_num, pg_sz, buf_sz, cqe_sz) \
 	(((hop_num) << QM_CQ_HOP_NUM_SHIFT)	| \
 	((pg_sz) << QM_CQ_PAGE_SIZE_SHIFT)	| \
@@ -733,6 +736,34 @@ static u32 qm_get_irq_num_v3(struct hisi_qm *qm)
 	return QM_IRQ_NUM_VF_V3;
 }
 
+static int qm_pm_get_sync(struct hisi_qm *qm)
+{
+	struct device *dev = &qm->pdev->dev;
+	int ret;
+
+	if (qm->fun_type == QM_HW_VF || qm->ver < QM_HW_V3)
+		return 0;
+
+	ret = pm_runtime_resume_and_get(dev);
+	if (ret < 0) {
+		dev_err(dev, "failed to get_sync(%d).\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void qm_pm_put_sync(struct hisi_qm *qm)
+{
+	struct device *dev = &qm->pdev->dev;
+
+	if (qm->fun_type == QM_HW_VF || qm->ver < QM_HW_V3)
+		return;
+
+	pm_runtime_mark_last_busy(dev);
+	pm_runtime_put_autosuspend(dev);
+}
+
 static struct hisi_qp *qm_to_hisi_qp(struct hisi_qm *qm, struct qm_eqe *eqe)
 {
 	u16 cqn = le32_to_cpu(eqe->dw0) & QM_EQE_CQN_MASK;
@@ -1258,10 +1289,15 @@ static ssize_t qm_debug_read(struct file *filp, char __user *buf,
 {
 	struct debugfs_file *file = filp->private_data;
 	enum qm_debug_file index = file->index;
+	struct hisi_qm *qm = file_to_qm(file);
 	char tbuf[QM_DBG_TMP_BUF_LEN];
 	u32 val;
 	int ret;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	mutex_lock(&file->lock);
 	switch (index) {
 	case CURRENT_QM:
@@ -1274,13 +1310,18 @@ static ssize_t qm_debug_read(struct file *filp, char __user *buf,
 		val = clear_enable_read(file);
 		break;
 	default:
-		mutex_unlock(&file->lock);
-		return -EINVAL;
+		goto err_input;
 	}
 	mutex_unlock(&file->lock);
 
+	hisi_qm_put_dfx_access(qm);
 	ret = scnprintf(tbuf, QM_DBG_TMP_BUF_LEN, "%u\n", val);
 	return simple_read_from_buffer(buf, count, pos, tbuf, ret);
+
+err_input:
+	mutex_unlock(&file->lock);
+	hisi_qm_put_dfx_access(qm);
+	return -EINVAL;
 }
 
 static ssize_t qm_debug_write(struct file *filp, const char __user *buf,
@@ -1288,6 +1329,7 @@ static ssize_t qm_debug_write(struct file *filp, const char __user *buf,
 {
 	struct debugfs_file *file = filp->private_data;
 	enum qm_debug_file index = file->index;
+	struct hisi_qm *qm = file_to_qm(file);
 	unsigned long val;
 	char tbuf[QM_DBG_TMP_BUF_LEN];
 	int len, ret;
@@ -1307,6 +1349,10 @@ static ssize_t qm_debug_write(struct file *filp, const char __user *buf,
 	if (kstrtoul(tbuf, 0, &val))
 		return -EFAULT;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	mutex_lock(&file->lock);
 	switch (index) {
 	case CURRENT_QM:
@@ -1323,6 +1369,8 @@ static ssize_t qm_debug_write(struct file *filp, const char __user *buf,
 	}
 	mutex_unlock(&file->lock);
 
+	hisi_qm_put_dfx_access(qm);
+
 	if (ret)
 		return ret;
@@ -1378,15 +1426,23 @@ static const struct debugfs_reg32 qm_vf_dfx_regs[] = {
  */
 void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset)
 {
+	struct pci_dev *pdev = to_pci_dev(regset->dev);
+	struct hisi_qm *qm = pci_get_drvdata(pdev);
 	const struct debugfs_reg32 *regs = regset->regs;
 	int regs_len = regset->nregs;
+	int i, ret;
 	u32 val;
-	int i;
+
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return;
 
 	for (i = 0; i < regs_len; i++) {
 		val = readl(regset->base + regs[i].offset);
 		seq_printf(s, "%s= 0x%08x\n", regs[i].name, val);
 	}
+
+	hisi_qm_put_dfx_access(qm);
 }
 EXPORT_SYMBOL_GPL(hisi_qm_regs_dump);
@@ -1404,6 +1460,8 @@ static int qm_regs_show(struct seq_file *s, void *unused)
 	}
 
 	regset.base = qm->io_base;
+	regset.dev = &qm->pdev->dev;
+
 	hisi_qm_regs_dump(s, &regset);
 
 	return 0;
@@ -1835,16 +1893,24 @@ static ssize_t qm_cmd_write(struct file *filp, const char __user *buffer,
 	if (*pos)
 		return 0;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	/* Judge if the instance is being reset. */
 	if (unlikely(atomic_read(&qm->status.flags) == QM_STOP))
 		return 0;
 
-	if (count > QM_DBG_WRITE_LEN)
-		return -ENOSPC;
+	if (count > QM_DBG_WRITE_LEN) {
+		ret = -ENOSPC;
+		goto put_dfx_access;
+	}
 
 	cmd_buf = memdup_user_nul(buffer, count);
-	if (IS_ERR(cmd_buf))
-		return PTR_ERR(cmd_buf);
+	if (IS_ERR(cmd_buf)) {
+		ret = PTR_ERR(cmd_buf);
+		goto put_dfx_access;
+	}
 
 	cmd_buf_tmp = strchr(cmd_buf, '\n');
 	if (cmd_buf_tmp) {
@@ -1855,12 +1921,16 @@ static ssize_t qm_cmd_write(struct file *filp, const char __user *buffer,
 	ret = qm_cmd_write_dump(qm, cmd_buf);
 	if (ret) {
 		kfree(cmd_buf);
-		return ret;
+		goto put_dfx_access;
 	}
kfree(cmd_buf);
 
-	return count;
+	ret = count;
+
+put_dfx_access:
+	hisi_qm_put_dfx_access(qm);
+	return ret;
 }
 
 static const struct file_operations qm_cmd_fops = {
@@ -2457,11 +2527,19 @@ static struct hisi_qp *qm_create_qp_nolock(struct hisi_qm *qm, u8 alg_type)
 struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type)
 {
 	struct hisi_qp *qp;
+	int ret;
+
+	ret = qm_pm_get_sync(qm);
+	if (ret)
+		return ERR_PTR(ret);
 
 	down_write(&qm->qps_lock);
 	qp = qm_create_qp_nolock(qm, alg_type);
 	up_write(&qm->qps_lock);
 
+	if (IS_ERR(qp))
+		qm_pm_put_sync(qm);
+
 	return qp;
 }
 EXPORT_SYMBOL_GPL(hisi_qm_create_qp);
@@ -2487,6 +2565,8 @@ void hisi_qm_release_qp(struct hisi_qp *qp)
 	idr_remove(&qm->qp_idr, qp->qp_id);
 
 	up_write(&qm->qps_lock);
+
+	qm_pm_put_sync(qm);
 }
 EXPORT_SYMBOL_GPL(hisi_qm_release_qp);
@@ -4069,10 +4149,15 @@ static ssize_t qm_algqos_read(struct file *filp, char __user *buf,
 	u32 qos_val, ir;
 	int ret;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	/* Mailbox and reset cannot be operated at the same time */
 	if (test_and_set_bit(QM_RESETTING, &qm->misc_ctl)) {
 		pci_err(qm->pdev, "dev resetting, read alg qos failed!\n");
-		return -EAGAIN;
+		ret = -EAGAIN;
+		goto err_put_dfx_access;
 	}
 
 	if (qm->fun_type == QM_HW_PF) {
@@ -4091,6 +4176,8 @@ static ssize_t qm_algqos_read(struct file *filp, char __user *buf,
 
 err_get_status:
 	clear_bit(QM_RESETTING, &qm->misc_ctl);
+err_put_dfx_access:
+	hisi_qm_put_dfx_access(qm);
 	return ret;
 }
@@ -4171,15 +4258,23 @@ static ssize_t qm_algqos_write(struct file *filp, const char __user *buf,
fun_index = device * 8 + function;
 
+	ret = qm_pm_get_sync(qm);
+	if (ret) {
+		ret = -EINVAL;
+		goto err_get_status;
+	}
+
 	ret = qm_func_shaper_enable(qm, fun_index, val);
 	if (ret) {
 		pci_err(qm->pdev, "failed to enable function shaper!\n");
 		ret = -EINVAL;
-		goto err_get_status;
+		goto err_put_sync;
 	}
 
-	ret = count;
+	ret = count;
 
+err_put_sync:
+	qm_pm_put_sync(qm);
 err_get_status:
 	clear_bit(QM_RESETTING, &qm->misc_ctl);
 	return ret;
@@ -4299,19 +4394,23 @@ int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs)
 	struct hisi_qm *qm = pci_get_drvdata(pdev);
 	int pre_existing_vfs, num_vfs, total_vfs, ret;
 
+	ret = qm_pm_get_sync(qm);
+	if (ret)
+		return ret;
+
 	total_vfs = pci_sriov_get_totalvfs(pdev);
 	pre_existing_vfs = pci_num_vf(pdev);
 	if (pre_existing_vfs) {
 		pci_err(pdev, "%d VFs already enabled. Please disable pre-enabled VFs!\n",
 			pre_existing_vfs);
-		return 0;
+		goto err_put_sync;
 	}
 
 	num_vfs = min_t(int, max_vfs, total_vfs);
 	ret = qm_vf_q_assign(qm, num_vfs);
 	if (ret) {
 		pci_err(pdev, "Can't assign queues for VF!\n");
-		return ret;
+		goto err_put_sync;
 	}
 
 	qm->vfs_num = num_vfs;
@@ -4320,12 +4419,16 @@ int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs)
 	if (ret) {
 		pci_err(pdev, "Can't enable VF!\n");
 		qm_clear_vft_config(qm);
-		return ret;
+		goto err_put_sync;
 	}
pci_info(pdev, "VF enabled, vfs_num(=%d)!\n", num_vfs);
 
 	return num_vfs;
+
+err_put_sync:
+	qm_pm_put_sync(qm);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(hisi_qm_sriov_enable);
@@ -4340,6 +4443,7 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen)
 {
 	struct hisi_qm *qm = pci_get_drvdata(pdev);
 	int total_vfs = pci_sriov_get_totalvfs(qm->pdev);
+	int ret;
 
 	if (pci_vfs_assigned(pdev)) {
 		pci_err(pdev, "Failed to disable VFs as VFs are assigned!\n");
@@ -4355,8 +4459,13 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen)
 	pci_disable_sriov(pdev);
 	/* clear vf function shaper configure array */
 	memset(qm->factor + 1, 0, sizeof(struct qm_shaper_factor) * total_vfs);
+	ret = qm_clear_vft_config(qm);
+	if (ret)
+		return ret;
+
+	qm_pm_put_sync(qm);
 
-	return qm_clear_vft_config(qm);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(hisi_qm_sriov_disable);
@@ -5176,11 +5285,18 @@ static void hisi_qm_controller_reset(struct work_struct *rst_work)
 	struct hisi_qm *qm = container_of(rst_work, struct hisi_qm, rst_work);
 	int ret;
 
+	ret = qm_pm_get_sync(qm);
+	if (ret) {
+		clear_bit(QM_RST_SCHED, &qm->misc_ctl);
+		return;
+	}
+
 	/* reset pcie device controller */
 	ret = qm_controller_reset(qm);
 	if (ret)
 		dev_err(&qm->pdev->dev, "controller reset failed (%d)\n", ret);
 
+	qm_pm_put_sync(qm);
 }
 
 static void qm_pf_reset_vf_prepare(struct hisi_qm *qm,
@@ -5692,6 +5808,76 @@ int hisi_qm_init(struct hisi_qm *qm)
 }
 EXPORT_SYMBOL_GPL(hisi_qm_init);
 
+/**
+ * hisi_qm_get_dfx_access() - Try to get dfx access.
+ * @qm: pointer to accelerator device.
+ *
+ * Try to get dfx access, then the user can read device messages.
+ *
+ * If the device is suspended, return failure; otherwise
+ * bump up the runtime PM usage counter.
+ */
+int hisi_qm_get_dfx_access(struct hisi_qm *qm)
+{
+	struct device *dev = &qm->pdev->dev;
+
+	if (pm_runtime_suspended(dev)) {
+		dev_info(dev, "can not read/write - device in suspended.\n");
+		return -EAGAIN;
+	}
+
+	return qm_pm_get_sync(qm);
+}
+EXPORT_SYMBOL_GPL(hisi_qm_get_dfx_access);
+
+/**
+ * hisi_qm_put_dfx_access() - Put dfx access.
+ * @qm: pointer to accelerator device.
+ *
+ * Put dfx access, drop the runtime PM usage counter.
+ */
+void hisi_qm_put_dfx_access(struct hisi_qm *qm)
+{
+	qm_pm_put_sync(qm);
+}
+EXPORT_SYMBOL_GPL(hisi_qm_put_dfx_access);
+
+/**
+ * hisi_qm_pm_init() - Initialize qm runtime PM.
+ * @qm: pointer to accelerator device.
+ *
+ * Function that initializes qm runtime PM.
+ */
+void hisi_qm_pm_init(struct hisi_qm *qm)
+{
+	struct device *dev = &qm->pdev->dev;
+
+	if (qm->fun_type == QM_HW_VF || qm->ver < QM_HW_V3)
+		return;
+
+	pm_runtime_set_autosuspend_delay(dev, QM_AUTOSUSPEND_DELAY);
+	pm_runtime_use_autosuspend(dev);
+	pm_runtime_put_noidle(dev);
+}
+EXPORT_SYMBOL_GPL(hisi_qm_pm_init);
+
+/**
+ * hisi_qm_pm_uninit() - Uninitialize qm runtime PM.
+ * @qm: pointer to accelerator device.
+ *
+ * Function that uninitializes qm runtime PM.
+ */
+void hisi_qm_pm_uninit(struct hisi_qm *qm)
+{
+	struct device *dev = &qm->pdev->dev;
+
+	if (qm->fun_type == QM_HW_VF || qm->ver < QM_HW_V3)
+		return;
+
+	pm_runtime_get_noresume(dev);
+	pm_runtime_dont_use_autosuspend(dev);
+}
+EXPORT_SYMBOL_GPL(hisi_qm_pm_uninit);
+
 static int qm_prepare_for_suspend(struct hisi_qm *qm)
 {
diff --git a/drivers/crypto/hisilicon/qm.h b/drivers/crypto/hisilicon/qm.h
index 59e16464d03d..3068093229a5 100644
--- a/drivers/crypto/hisilicon/qm.h
+++ b/drivers/crypto/hisilicon/qm.h
@@ -433,5 +433,9 @@ int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
 void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
 int hisi_qm_resume(struct device *dev);
 int hisi_qm_suspend(struct device *dev);
+void hisi_qm_pm_uninit(struct hisi_qm *qm);
+void hisi_qm_pm_init(struct hisi_qm *qm);
+int hisi_qm_get_dfx_access(struct hisi_qm *qm);
+void hisi_qm_put_dfx_access(struct hisi_qm *qm);
 void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
 #endif
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 55ef3ae70fdb..0ad1ebe4c7ef 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/pm_runtime.h>
 #include <linux/seq_file.h>
 #include <linux/topology.h>
 #include <linux/uacce.h>
@@ -584,9 +585,14 @@ static ssize_t sec_debug_read(struct file *filp, char __user *buf,
 {
 	struct sec_debug_file *file = filp->private_data;
 	char tbuf[SEC_DBGFS_VAL_MAX_LEN];
+	struct hisi_qm *qm = file->qm;
 	u32 val;
 	int ret;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	spin_lock_irq(&file->lock);
 
 	switch (file->index) {
@@ -594,14 +600,19 @@ static ssize_t sec_debug_read(struct file *filp, char __user *buf,
 		val = sec_clear_enable_read(file);
 		break;
 	default:
-		spin_unlock_irq(&file->lock);
-		return -EINVAL;
+		goto err_input;
 	}
 
 	spin_unlock_irq(&file->lock);
-	ret = snprintf(tbuf, SEC_DBGFS_VAL_MAX_LEN, "%u\n", val);
 
+	hisi_qm_put_dfx_access(qm);
+	ret = snprintf(tbuf, SEC_DBGFS_VAL_MAX_LEN, "%u\n", val);
 	return simple_read_from_buffer(buf, count, pos, tbuf, ret);
+
+err_input:
+	spin_unlock_irq(&file->lock);
+	hisi_qm_put_dfx_access(qm);
+	return -EINVAL;
 }
 
 static ssize_t sec_debug_write(struct file *filp, const char __user *buf,
@@ -609,6 +620,7 @@ static ssize_t sec_debug_write(struct file *filp, const char __user *buf,
 {
 	struct sec_debug_file *file = filp->private_data;
 	char tbuf[SEC_DBGFS_VAL_MAX_LEN];
+	struct hisi_qm *qm = file->qm;
 	unsigned long val;
 	int len, ret;
@@ -627,6 +639,10 @@ static ssize_t sec_debug_write(struct file *filp, const char __user *buf,
 	if (kstrtoul(tbuf, 0, &val))
 		return -EFAULT;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	spin_lock_irq(&file->lock);
 
 	switch (file->index) {
@@ -640,12 +656,11 @@ static ssize_t sec_debug_write(struct file *filp, const char __user *buf,
 		goto err_input;
 	}
 
-	spin_unlock_irq(&file->lock);
-
-	return count;
+	ret = count;
 
 err_input:
 	spin_unlock_irq(&file->lock);
+	hisi_qm_put_dfx_access(qm);
 	return ret;
 }
@@ -703,6 +718,7 @@ static int sec_core_debug_init(struct hisi_qm *qm)
 	regset->regs = sec_dfx_regs;
 	regset->nregs = ARRAY_SIZE(sec_dfx_regs);
 	regset->base = qm->io_base;
+	regset->dev = dev;
 
 	if (qm->pdev->device == SEC_PF_PCI_DEVICE_ID)
 		debugfs_create_file("regs", 0444, tmp_d, regset, &sec_regs_fops);
@@ -1013,6 +1029,8 @@ static int sec_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_alg_unregister;
 	}
 
+	hisi_qm_pm_init(qm);
+
 	return 0;
 
 err_alg_unregister:
@@ -1032,6 +1050,7 @@ static void sec_remove(struct pci_dev *pdev)
 {
 	struct hisi_qm *qm = pci_get_drvdata(pdev);
 
+	hisi_qm_pm_uninit(qm);
 	hisi_qm_wait_task_finish(qm, &sec_devices);
 	if (qm->qp_num >= ctx_q_num)
 		hisi_qm_alg_unregister(qm, &sec_devices);
@@ -1051,6 +1070,10 @@ static void sec_remove(struct pci_dev *pdev)
 	sec_qm_uninit(qm);
 }
 
+static const struct dev_pm_ops sec_pm_ops = {
+	SET_RUNTIME_PM_OPS(hisi_qm_suspend, hisi_qm_resume, NULL)
+};
+
 static const struct pci_error_handlers sec_err_handler = {
 	.error_detected = hisi_qm_dev_err_detected,
 	.slot_reset = hisi_qm_dev_slot_reset,
@@ -1066,6 +1089,7 @@ static struct pci_driver sec_pci_driver = {
 	.err_handler = &sec_err_handler,
 	.sriov_configure = hisi_qm_sriov_configure,
 	.shutdown = hisi_qm_dev_shutdown,
+	.driver.pm = &sec_pm_ops,
 };
 
 static void sec_register_debugfs(void)
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index 4438188adbb7..b0b424c9fba1 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/pm_runtime.h>
 #include <linux/seq_file.h>
 #include <linux/topology.h>
 #include <linux/uacce.h>
@@ -476,22 +477,33 @@ static ssize_t hisi_zip_ctrl_debug_read(struct file *filp, char __user *buf,
 			       size_t count, loff_t *pos)
 {
 	struct ctrl_debug_file *file = filp->private_data;
+	struct hisi_qm *qm = file_to_qm(file);
 	char tbuf[HZIP_BUF_SIZE];
 	u32 val;
 	int ret;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	spin_lock_irq(&file->lock);
 	switch (file->index) {
 	case HZIP_CLEAR_ENABLE:
 		val = clear_enable_read(file);
 		break;
 	default:
-		spin_unlock_irq(&file->lock);
-		return -EINVAL;
+		goto err_input;
 	}
 	spin_unlock_irq(&file->lock);
 
+	hisi_qm_put_dfx_access(qm);
 	ret = scnprintf(tbuf, sizeof(tbuf), "%u\n", val);
 	return simple_read_from_buffer(buf, count, pos, tbuf, ret);
+
+err_input:
+	spin_unlock_irq(&file->lock);
+	hisi_qm_put_dfx_access(qm);
+	return -EINVAL;
 }
 
 static ssize_t hisi_zip_ctrl_debug_write(struct file *filp,
@@ -499,6 +511,7 @@ static ssize_t hisi_zip_ctrl_debug_write(struct file *filp,
 					 size_t count, loff_t *pos)
 {
 	struct ctrl_debug_file *file = filp->private_data;
+	struct hisi_qm *qm = file_to_qm(file);
 	char tbuf[HZIP_BUF_SIZE];
 	unsigned long val;
 	int len, ret;
@@ -517,6 +530,10 @@ static ssize_t hisi_zip_ctrl_debug_write(struct file *filp,
 	if (kstrtoul(tbuf, 0, &val))
 		return -EFAULT;
 
+	ret = hisi_qm_get_dfx_access(qm);
+	if (ret)
+		return ret;
+
 	spin_lock_irq(&file->lock);
 	switch (file->index) {
 	case HZIP_CLEAR_ENABLE:
@@ -528,12 +545,12 @@ static ssize_t hisi_zip_ctrl_debug_write(struct file *filp,
 		ret = -EINVAL;
 		goto err_input;
 	}
-	spin_unlock_irq(&file->lock);
 
-	return count;
+	ret = count;
 
 err_input:
 	spin_unlock_irq(&file->lock);
+	hisi_qm_put_dfx_access(qm);
 	return ret;
 }
@@ -595,6 +612,7 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm)
 		regset->regs = hzip_dfx_regs;
 		regset->nregs = ARRAY_SIZE(hzip_dfx_regs);
 		regset->base = qm->io_base + core_offsets[i];
+		regset->dev = dev;
 
 		tmp_d = debugfs_create_dir(buf, qm->debug.debug_root);
 		debugfs_create_file("regs", 0444, tmp_d, regset,
				    &hisi_zip_regs_fops);
@@ -934,6 +952,8 @@ static int hisi_zip_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_qm_alg_unregister;
 	}
 
+	hisi_qm_pm_init(qm);
+
 	return 0;
 
 err_qm_alg_unregister:
@@ -956,6 +976,7 @@ static void hisi_zip_remove(struct pci_dev *pdev)
 {
 	struct hisi_qm *qm = pci_get_drvdata(pdev);
 
+	hisi_qm_pm_uninit(qm);
 	hisi_qm_wait_task_finish(qm, &zip_devices);
 	hisi_qm_alg_unregister(qm, &zip_devices);
 
@@ -968,6 +989,10 @@ static void hisi_zip_remove(struct pci_dev *pdev)
 	hisi_zip_qm_uninit(qm);
 }
 
+static const struct dev_pm_ops hisi_zip_pm_ops = {
+	SET_RUNTIME_PM_OPS(hisi_qm_suspend, hisi_qm_resume, NULL)
+};
+
 static const struct pci_error_handlers hisi_zip_err_handler = {
 	.error_detected	= hisi_qm_dev_err_detected,
 	.slot_reset	= hisi_qm_dev_slot_reset,
@@ -984,6 +1009,7 @@ static struct pci_driver hisi_zip_pci_driver = {
 			hisi_qm_sriov_configure : NULL,
 	.err_handler		= &hisi_zip_err_handler,
 	.shutdown		= hisi_qm_dev_shutdown,
+	.driver.pm		= &hisi_zip_pm_ops,
 };
 
 static void hisi_zip_register_debugfs(void)
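The underlying usage-counter idiom, as implemented by qm_pm_get_sync()/qm_pm_put_sync() above, is the standard runtime-PM pattern:

	ret = pm_runtime_resume_and_get(dev);	/* wake device, ++usage count */
	if (ret < 0)
		return ret;

	/* ... access registers or submit work ... */

	pm_runtime_mark_last_busy(dev);		/* restart the autosuspend timer */
	pm_runtime_put_autosuspend(dev);	/* --usage; suspend after 3000 ms idle */

Both helpers are no-ops for VFs and pre-V3 hardware, so only the Kunpeng930 PF ever changes power state.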
From: Weili Qian <qianweili@huawei.com>

mainline inclusion
from mainline-master
commit 74f5edbffcd37162084b6883e059bb6bb686151d
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I475PF?from=project-issue
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
To avoid repeatedly deriving 'qm' from 'filp', pass 'qm' directly to the debugfs helper functions instead of 'filp'.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Hao Fang <fanghao11@huawei.com>
Reviewed-by: Mingqiang Ling <lingmingqiang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/crypto/hisilicon/qm.c            | 34 +++++++++---------------
 drivers/crypto/hisilicon/sec2/sec_main.c | 11 +++-----
 drivers/crypto/hisilicon/zip/zip_main.c  | 11 +++-----
 3 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 74740035380d..e29ff971ad79 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -1203,16 +1203,13 @@ static struct hisi_qm *file_to_qm(struct debugfs_file *file)
 	return container_of(debug, struct hisi_qm, debug);
 }
 
-static u32 current_q_read(struct debugfs_file *file)
+static u32 current_q_read(struct hisi_qm *qm)
 {
-	struct hisi_qm *qm = file_to_qm(file);
-
 	return readl(qm->io_base + QM_DFX_SQE_CNT_VF_SQN) >> QM_DFX_QN_SHIFT;
 }
 
-static int current_q_write(struct debugfs_file *file, u32 val)
+static int current_q_write(struct hisi_qm *qm, u32 val)
 {
-	struct hisi_qm *qm = file_to_qm(file);
 	u32 tmp;
 
 	if (val >= qm->debug.curr_qm_qp_num)
@@ -1229,18 +1226,14 @@ static int current_q_write(struct debugfs_file *file, u32 val)
 	return 0;
 }
 
-static u32 clear_enable_read(struct debugfs_file *file)
+static u32 clear_enable_read(struct hisi_qm *qm)
 {
-	struct hisi_qm *qm = file_to_qm(file);
-
 	return readl(qm->io_base + QM_DFX_CNT_CLR_CE);
 }
 
 /* rd_clr_ctrl 1 enable read clear, otherwise 0 disable it */
-static int clear_enable_write(struct debugfs_file *file, u32 rd_clr_ctrl)
+static int clear_enable_write(struct hisi_qm *qm, u32 rd_clr_ctrl)
 {
-	struct hisi_qm *qm = file_to_qm(file);
-
 	if (rd_clr_ctrl > 1)
 		return -EINVAL;
 
@@ -1249,16 +1242,13 @@ static int clear_enable_write(struct debugfs_file *file, u32 rd_clr_ctrl)
 	return 0;
 }
 
-static u32 current_qm_read(struct debugfs_file *file)
+static u32 current_qm_read(struct hisi_qm *qm)
 {
-	struct hisi_qm *qm = file_to_qm(file);
-
 	return readl(qm->io_base + QM_DFX_MB_CNT_VF);
 }
 
-static int current_qm_write(struct debugfs_file *file, u32 val)
+static int current_qm_write(struct hisi_qm *qm, u32 val)
 {
-	struct hisi_qm *qm = file_to_qm(file);
 	u32 tmp;
 
 	if (val > qm->vfs_num)
@@ -1301,13 +1291,13 @@ static ssize_t qm_debug_read(struct file *filp, char __user *buf,
 	mutex_lock(&file->lock);
 	switch (index) {
 	case CURRENT_QM:
-		val = current_qm_read(file);
+		val = current_qm_read(qm);
 		break;
 	case CURRENT_Q:
-		val = current_q_read(file);
+		val = current_q_read(qm);
 		break;
 	case CLEAR_ENABLE:
-		val = clear_enable_read(file);
+		val = clear_enable_read(qm);
 		break;
 	default:
 		goto err_input;
@@ -1356,13 +1346,13 @@ static ssize_t qm_debug_write(struct file *filp, const char __user *buf,
 	mutex_lock(&file->lock);
 	switch (index) {
 	case CURRENT_QM:
-		ret = current_qm_write(file, val);
+		ret = current_qm_write(qm, val);
 		break;
 	case CURRENT_Q:
-		ret = current_q_write(file, val);
+		ret = current_q_write(qm, val);
 		break;
 	case CLEAR_ENABLE:
-		ret = clear_enable_write(file, val);
+		ret = clear_enable_write(qm, val);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 0ad1ebe4c7ef..90551bf38b52 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -557,17 +557,14 @@ static void sec_hw_error_disable(struct hisi_qm *qm)
 	writel(SEC_RAS_DISABLE, qm->io_base + SEC_RAS_NFE_REG);
 }
 
-static u32 sec_clear_enable_read(struct sec_debug_file *file)
+static u32 sec_clear_enable_read(struct hisi_qm *qm)
 {
-	struct hisi_qm *qm = file->qm;
-
 	return readl(qm->io_base + SEC_CTRL_CNT_CLR_CE) &
 	       SEC_CTRL_CNT_CLR_CE_BIT;
 }
 
-static int sec_clear_enable_write(struct sec_debug_file *file, u32 val)
+static int sec_clear_enable_write(struct hisi_qm *qm, u32 val)
 {
-	struct hisi_qm *qm = file->qm;
 	u32 tmp;
 
 	if (val != 1 && val)
@@ -597,7 +594,7 @@ static ssize_t sec_debug_read(struct file *filp, char __user *buf,
 
 	switch (file->index) {
 	case SEC_CLEAR_ENABLE:
-		val = sec_clear_enable_read(file);
+		val = sec_clear_enable_read(qm);
 		break;
 	default:
 		goto err_input;
@@ -647,7 +644,7 @@ static ssize_t sec_debug_write(struct file *filp, const char __user *buf,
 
 	switch (file->index) {
 	case SEC_CLEAR_ENABLE:
-		ret = sec_clear_enable_write(file, val);
+		ret = sec_clear_enable_write(qm, val);
 		if (ret)
 			goto err_input;
 		break;
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index b0b424c9fba1..7148201ce76e 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -450,17 +450,14 @@ static inline struct hisi_qm *file_to_qm(struct ctrl_debug_file *file)
 	return &hisi_zip->qm;
 }
 
-static u32 clear_enable_read(struct ctrl_debug_file *file)
+static u32 clear_enable_read(struct hisi_qm *qm)
 {
-	struct hisi_qm *qm = file_to_qm(file);
-
 	return readl(qm->io_base + HZIP_SOFT_CTRL_CNT_CLR_CE) &
 	       HZIP_SOFT_CTRL_CNT_CLR_CE_BIT;
 }
 
-static int clear_enable_write(struct ctrl_debug_file *file, u32 val)
+static int clear_enable_write(struct hisi_qm *qm, u32 val)
 {
-	struct hisi_qm *qm = file_to_qm(file);
 	u32 tmp;
 
 	if (val != 1 && val != 0)
@@ -489,7 +486,7 @@ static ssize_t hisi_zip_ctrl_debug_read(struct file *filp, char __user *buf,
 	spin_lock_irq(&file->lock);
 	switch (file->index) {
 	case HZIP_CLEAR_ENABLE:
-		val = clear_enable_read(file);
+		val = clear_enable_read(qm);
 		break;
 	default:
 		goto err_input;
@@ -537,7 +534,7 @@ static ssize_t hisi_zip_ctrl_debug_write(struct file *filp,
 	spin_lock_irq(&file->lock);
 	switch (file->index) {
 	case HZIP_CLEAR_ENABLE:
-		ret = clear_enable_write(file, val);
+		ret = clear_enable_write(qm, val);
 		if (ret)
 			goto err_input;
 		break;
From: Weili Qian qianweili@huawei.com
mainline inclusion from mainline-master commit 3e1d2c52b2045ba7f90966b02daeb6c438432570 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I475PF?from=project-issue CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git/co...
----------------------------------------------------------------------
To support runtime PM, pci_set_power_state() is used to change the device's power state. That requires the platform to supply the ACPI method _PS0 or _PR0, so check whether the method is supported and, if not, print an informational message.
Signed-off-by: Weili Qian qianweili@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index e29ff971ad79..369562d34d66 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3282,6 +3282,10 @@ static void hisi_qm_pre_init(struct hisi_qm *qm) init_rwsem(&qm->qps_lock); qm->qp_in_used = 0; qm->misc_ctl = false; + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V2) { + if (!acpi_device_power_manageable(ACPI_COMPANION(&pdev->dev))) + dev_info(&pdev->dev, "_PS0 and _PR0 are not defined"); + } }
static void qm_cmd_uninit(struct hisi_qm *qm)
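For illustration only (not part of the patch): a minimal, self-contained sketch of the check the hunk above adds, assuming a PCI device with an ACPI companion. acpi_device_power_manageable() is true only when the firmware node is power-manageable (i.e. supplies _PS0 or power resources), which is what pci_set_power_state() depends on here:

#include <linux/acpi.h>
#include <linux/pci.h>

static void check_acpi_pm_methods(struct pci_dev *pdev)
{
        struct acpi_device *adev = ACPI_COMPANION(&pdev->dev);

        /* No ACPI companion, or no _PS0/_PR0: runtime PM cannot work. */
        if (!adev || !acpi_device_power_manageable(adev))
                dev_info(&pdev->dev, "_PS0 and _PR0 are not defined\n");
}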
From: Kai Ye yekai13@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4751W?from=project-issue
----------------------------------------------------------------------
Fix the ICV (integrity check value) check being enabled incorrectly on Kunpeng 930: the second branch wrote SEC_AUTH_TYPE1 where SEC_AUTH_TYPE2 was intended.
Signed-off-by: Kai Ye yekai13@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/sec2/sec_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index 6a45bd23b363..bf93c9847365 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -1614,7 +1614,7 @@ static void sec_auth_bd_fill_ex_v3(struct sec_auth_ctx *ctx, int dir, sqe3->auth_mac_key |= cpu_to_le32((u32)SEC_AUTH_TYPE1); sqe3->huk_iv_seq &= SEC_CIPHER_AUTH_V3; } else { - sqe3->auth_mac_key |= cpu_to_le32((u32)SEC_AUTH_TYPE1); + sqe3->auth_mac_key |= cpu_to_le32((u32)SEC_AUTH_TYPE2); sqe3->huk_iv_seq |= SEC_AUTH_CIPHER_V3; } sqe3->a_len_key = cpu_to_le32(c_req->c_len + aq->assoclen);
From: Kai Ye yekai13@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4750I?from=project-issue
----------------------------------------------------------------------
Fix the maximum AAD length for CCM mode, which is limited by the hardware.
Signed-off-by: Kai Ye yekai13@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/sec2/sec_crypto.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index bf93c9847365..2f65fe92b039 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -63,6 +63,7 @@ #define SEC_AUTH_CIPHER 0x1 #define SEC_MAX_MAC_LEN 64 #define SEC_MAX_AAD_LEN 65535 +#define SEC_MAX_CCM_AAD_LEN 65279 #define SEC_TOTAL_MAC_SZ (SEC_MAX_MAC_LEN * QM_Q_DEPTH)
#define SEC_PBUF_SZ 512 @@ -2219,6 +2220,10 @@ static int sec_aead_spec_check(struct sec_ctx *ctx, struct sec_req *sreq) }
if (c_mode == SEC_CMODE_CCM) { + if (unlikely(req->assoclen > SEC_MAX_CCM_AAD_LEN)) { + dev_err_ratelimited(dev, "CCM input aad parameter is too long!\n"); + return -EINVAL; + } ret = aead_iv_demension_check(req); if (ret) { dev_err(dev, "aead input iv param error!\n");
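For context, 65279 is 2^16 - 2^8 - 1, the largest associated-data length expressible in CCM's two-octet length encoding (RFC 3610 uses the two-octet form only when l(a) < 2^16 - 2^8), which is likely where the hardware limit comes from. A minimal sketch of the added validation, reusing the patch's constant:

#include <linux/device.h>

#define SEC_MAX_CCM_AAD_LEN     65279

/* Reject CCM requests whose AAD exceeds the hardware limit. */
static int sec_ccm_aad_len_check(struct device *dev, u32 assoclen)
{
        if (unlikely(assoclen > SEC_MAX_CCM_AAD_LEN)) {
                dev_err_ratelimited(dev, "CCM input aad parameter is too long!\n");
                return -EINVAL;
        }

        return 0;
}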
From: Kai Ye yekai13@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4750I?from=project-issue
----------------------------------------------------------------------
The CTR counter in the BD defaults to 32-bit rollover, but the NIST standard requires 128-bit rollover. This caused test failures, so fix the BD configuration.
Signed-off-by: Kai Ye yekai13@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/sec2/sec_crypto.c | 6 ++++++ drivers/crypto/hisilicon/sec2/sec_crypto.h | 6 ++++-- 2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index 2f65fe92b039..f77be0e6cf65 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -42,6 +42,8 @@ #define SEC_DE_OFFSET_V3 9 #define SEC_SCENE_OFFSET_V3 5 #define SEC_CKEY_OFFSET_V3 13 +#define SEC_CTR_CNT_OFFSET 25 +#define SEC_CTR_CNT_ROLLOVER 2 #define SEC_SRC_SGL_OFFSET_V3 11 #define SEC_DST_SGL_OFFSET_V3 14 #define SEC_CALG_OFFSET_V3 4 @@ -1301,6 +1303,10 @@ static int sec_skcipher_bd_fill_v3(struct sec_ctx *ctx, struct sec_req *req) cipher = SEC_CIPHER_DEC; sec_sqe3->c_icv_key |= cpu_to_le16(cipher);
+ /* Set the CTR counter mode is 128bit rollover */ + sec_sqe3->auth_mac_key = cpu_to_le32((u32)SEC_CTR_CNT_ROLLOVER << + SEC_CTR_CNT_OFFSET); + if (req->use_pbuf) { bd_param |= SEC_PBUF << SEC_SRC_SGL_OFFSET_V3; bd_param |= SEC_PBUF << SEC_DST_SGL_OFFSET_V3; diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.h b/drivers/crypto/hisilicon/sec2/sec_crypto.h index 9f71c358a6d3..5e039b50e9d4 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.h +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.h @@ -354,8 +354,10 @@ struct sec_sqe3 { * akey_len: 9~14 bits * a_alg: 15~20 bits * key_sel: 21~24 bits - * updata_key: 25 bits - * reserved: 26~31 bits + * ctr_count_mode/sm4_xts: 25~26 bits + * sva_prefetch: 27 bits + * key_wrap_num: 28~30 bits + * update_key: 31 bits */ __le32 auth_mac_key; __le32 salt;
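Read against the new field layout, the hunk above sets ctr_count_mode (bits 25-26 of auth_mac_key) to 2, selecting 128-bit rollover. A sketch of that step in isolation; the patch assigns the word outright, which the cipher-only BD-fill path can do because no auth fields are in use there:

#include "sec_crypto.h"   /* struct sec_sqe3, per the diff above */

#define SEC_CTR_CNT_OFFSET      25
#define SEC_CTR_CNT_ROLLOVER    2

/* Select 128-bit CTR rollover in a v3 BD. */
static void sec_set_ctr_rollover(struct sec_sqe3 *sec_sqe3)
{
        sec_sqe3->auth_mac_key = cpu_to_le32((u32)SEC_CTR_CNT_ROLLOVER <<
                                             SEC_CTR_CNT_OFFSET);
}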
From: Zhangfei Gao zhangfei.gao@linaro.org
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I472UK?from=project-issue
----------------------------------------------------------------------
A PASID-like feature is implemented on AMBA without using TLP prefixes, so these devices have the PASID capability even though they do not support TLP. Add a pasid_no_tlp bit meaning "PASID works without TLP prefixes" and make pci_enable_pasid() check pasid_no_tlp as well as eetlp_prefix_path.
Suggested-by: Bjorn Helgaas helgaas@kernel.org Signed-off-by: Zhangfei Gao zhangfei.gao@linaro.org Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pci/ats.c | 2 +- include/linux/pci.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 793d3810eda7..88f981bad52d 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -380,7 +380,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) if (WARN_ON(pdev->pasid_enabled)) return -EBUSY;
- if (!pdev->eetlp_prefix_path) + if (!pdev->eetlp_prefix_path && !pdev->pasid_no_tlp) return -EINVAL;
if (!pasid) diff --git a/include/linux/pci.h b/include/linux/pci.h index 936da8925d68..3db9d92a55a2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -382,6 +382,7 @@ struct pci_dev { supported from root to here */ int l1ss; /* L1SS Capability pointer */ #endif + unsigned int pasid_no_tlp:1; /* PASID works without TLP Prefix */ unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */
pci_channel_state_t error_state; /* Current connectivity state */
From: Zhangfei Gao zhangfei.gao@linaro.org
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I472UK?from=project-issue
----------------------------------------------------------------------
HiSilicon KunPeng920 and KunPeng930 have devices that appear as PCI devices but are actually on the AMBA bus. These fake PCI devices have the PASID capability even though they do not support TLP.
Add a quirk to set pasid_no_tlp for these devices.
Signed-off-by: Zhangfei Gao zhangfei.gao@linaro.org Signed-off-by: Jean-Philippe Brucker jean-philippe@linaro.org Signed-off-by: Zhou Wang wangzhou1@hisilicon.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pci/quirks.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 16fb3d7714d5..b2e59f6657a9 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1825,6 +1825,20 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quir
DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_HUAWEI, 0x1610, PCI_CLASS_BRIDGE_PCI, 8, quirk_pcie_mch);
+static void quirk_huawei_pcie_sva(struct pci_dev *pdev) +{ + if (pdev->revision != 0x21 && pdev->revision != 0x30) + return; + + pdev->pasid_no_tlp = 1; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa250, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa251, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa255, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa256, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa258, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa259, quirk_huawei_pcie_sva); + /* * It's possible for the MSI to get corrupted if SHPC and ACPI are used * together on certain PXH-based systems.
From: Zhangfei Gao zhangfei.gao@linaro.org
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I472UK?from=project-issue
----------------------------------------------------------------------
HiSilicon KunPeng920 and KunPeng930 have devices that appear as PCI devices but are actually on the AMBA bus. These fake PCI devices can support SVA via the SMMU stall feature; set the dma-can-stall property for them on ACPI platforms (device tree can set the property directly).
Signed-off-by: Zhangfei Gao zhangfei.gao@linaro.org Signed-off-by: Jean-Philippe Brucker jean-philippe@linaro.org Signed-off-by: Zhou Wang wangzhou1@hisilicon.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pci/quirks.c | 13 +++++++++++++ 1 file changed, 13 insertions(+)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index b2e59f6657a9..af7d575c684c 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1827,10 +1827,23 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_HUAWEI, 0x1610, PCI_CLASS_BRIDGE_PCI
static void quirk_huawei_pcie_sva(struct pci_dev *pdev) { + struct property_entry properties[] = { + PROPERTY_ENTRY_BOOL("dma-can-stall"), + {}, + }; + if (pdev->revision != 0x21 && pdev->revision != 0x30) return;
pdev->pasid_no_tlp = 1; + + /* + * Set the dma-can-stall property on ACPI platforms. Device tree + * can set it directly. + */ + if (!pdev->dev.of_node && + device_add_properties(&pdev->dev, properties)) + pci_warn(pdev, "could not add stall property"); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa250, quirk_huawei_pcie_sva); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa251, quirk_huawei_pcie_sva);
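The property added by the quirk is visible through the unified device-property API, so a consumer (here, the SMMU driver) can test it the same way whether it came from device tree or from device_add_properties(). A minimal sketch of such a check; the helper name is illustrative:

#include <linux/property.h>

/* True if firmware or the quirk above marked the device stall-capable. */
static bool dev_dma_can_stall(struct device *dev)
{
        return device_property_read_bool(dev, "dma-can-stall");
}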
From: Longfang Liu liulongfang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I473Q4?from=project-issue
----------------------------------------------------------------------
vfio_pci_vendor_driver_ops includes these parts:
(1) .probe() and .remove() interfaces called by vfio_pci_probe() and vfio_pci_remove().
(2) a pointer to struct vfio_device_ops, registered as the ops of the vfio device if .probe() succeeds.
(3) vendor modules call the macro module_vfio_pci_register_vendor_handler to generate module_init and module_exit.
(4) export the functions vfio_pci_vendor_data(), vfio_pci_irq_type(), vfio_pci_num_regions(), vfio_pci_pdev(), and the functions in vfio_pci_ops, so they can be called from other modules and effectively inherited by the vfio_device_ops that vendor modules provide.
(5) allow a simpler VFIO_DEVICE_GET_INFO ioctl in the vendor driver by letting vfio-pci know the number of vendor regions and vendor irqs.
(6) allow the vendor driver to read/write BARs directly, which is useful for security checks.
(7) allow the vendor driver to trigger VFIO_IRQ_TYPE_REMAP_BAR_REGION when it wants to notify userspace to remap PCI BARs.
A usage sketch follows this patch's diff.
Signed-off-by: Longfang Liu liulongfang@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/pci/vfio_pci.c | 183 ++++++++++++++++++++++++++-- drivers/vfio/pci/vfio_pci_private.h | 9 ++ drivers/vfio/pci/vfio_pci_rdwr.c | 10 ++ include/linux/vfio.h | 52 ++++++++ 4 files changed, 243 insertions(+), 11 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 42612e193a00..df018a4554b6 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -74,6 +74,61 @@ static inline bool vfio_vga_disabled(void) #endif }
+static struct vfio_pci { + struct mutex vendor_drivers_lock; + struct list_head vendor_drivers_list; +} vfio_pci; + +struct pci_dev *vfio_pci_pdev(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + return vdev->pdev; +} +EXPORT_SYMBOL_GPL(vfio_pci_pdev); + +int vfio_pci_num_regions(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + return vdev->num_regions; +} +EXPORT_SYMBOL_GPL(vfio_pci_num_regions); + +int vfio_pci_irq_type(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + return vdev->irq_type; +} +EXPORT_SYMBOL_GPL(vfio_pci_irq_type); + +void *vfio_pci_vendor_data(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + return vdev->vendor_data; +} +EXPORT_SYMBOL_GPL(vfio_pci_vendor_data); + +int vfio_pci_set_vendor_regions(void *device_data, int num_vendor_regions) +{ + struct vfio_pci_device *vdev = device_data; + + vdev->num_vendor_regions = num_vendor_regions; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_set_vendor_regions); + +int vfio_pci_set_vendor_irqs(void *device_data, int num_vendor_irqs) +{ + struct vfio_pci_device *vdev = device_data; + + vdev->num_vendor_irqs = num_vendor_irqs; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_set_vendor_irqs); + static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) { switch (pdev->vendor) { @@ -914,7 +969,7 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) vfio_device_put(pf_dev); }
-static void vfio_pci_release(void *device_data) +void vfio_pci_release(void *device_data) { struct vfio_pci_device *vdev = device_data;
@@ -941,8 +996,9 @@ static void vfio_pci_release(void *device_data)
module_put(THIS_MODULE); } +EXPORT_SYMBOL_GPL(vfio_pci_release);
-static int vfio_pci_open(void *device_data) +int vfio_pci_open(void *device_data) { struct vfio_pci_device *vdev = device_data; int ret = 0; @@ -967,6 +1023,7 @@ static int vfio_pci_open(void *device_data) module_put(THIS_MODULE); return ret; } +EXPORT_SYMBOL_GPL(vfio_pci_open);
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) { @@ -1161,7 +1218,7 @@ struct vfio_devices { int max_index; };
-static long vfio_pci_ioctl(void *device_data, +long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { struct vfio_pci_device *vdev = device_data; @@ -1193,8 +1250,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET;
- info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions + + vdev->num_vendor_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs + + vdev->num_vendor_irqs;
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) { int ret = vfio_pci_info_zdev_add_caps(vdev, &caps); @@ -1819,6 +1878,7 @@ static long vfio_pci_ioctl(void *device_data,
return -ENOTTY; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl);
static ssize_t vfio_pci_rw(void *device_data, char __user *buf, size_t count, loff_t *ppos, bool iswrite) @@ -1852,7 +1912,7 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, return -EINVAL; }
-static ssize_t vfio_pci_read(void *device_data, char __user *buf, +ssize_t vfio_pci_read(void *device_data, char __user *buf, size_t count, loff_t *ppos) { if (!count) @@ -1860,8 +1920,9 @@ static ssize_t vfio_pci_read(void *device_data, char __user *buf,
return vfio_pci_rw(device_data, buf, count, ppos, false); } +EXPORT_SYMBOL_GPL(vfio_pci_read);
-static ssize_t vfio_pci_write(void *device_data, const char __user *buf, +ssize_t vfio_pci_write(void *device_data, const char __user *buf, size_t count, loff_t *ppos) { if (!count) @@ -1869,6 +1930,7 @@ static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); } +EXPORT_SYMBOL_GPL(vfio_pci_write);
/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) @@ -2064,7 +2126,7 @@ static const struct vm_operations_struct vfio_pci_mmap_ops = { .fault = vfio_pci_mmap_fault, };
-static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) +int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) { struct vfio_pci_device *vdev = device_data; struct pci_dev *pdev = vdev->pdev; @@ -2133,8 +2195,9 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_mmap);
-static void vfio_pci_request(void *device_data, unsigned int count) +void vfio_pci_request(void *device_data, unsigned int count) { struct vfio_pci_device *vdev = device_data; struct pci_dev *pdev = vdev->pdev; @@ -2154,6 +2217,7 @@ static void vfio_pci_request(void *device_data, unsigned int count)
mutex_unlock(&vdev->igate); } +EXPORT_SYMBOL_GPL(vfio_pci_request);
static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, bool vf_token, uuid_t *uuid) @@ -2250,7 +2314,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
#define VF_TOKEN_ARG "vf_token="
-static int vfio_pci_match(void *device_data, char *buf) +int vfio_pci_match(void *device_data, char *buf) { struct vfio_pci_device *vdev = device_data; bool vf_token = false; @@ -2298,6 +2362,7 @@ static int vfio_pci_match(void *device_data, char *buf)
return 1; /* Match */ } +EXPORT_SYMBOL_GPL(vfio_pci_match);
static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", @@ -2404,6 +2469,35 @@ static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) VGA_RSRC_LEGACY_MEM); }
+static int probe_vendor_drivers(struct vfio_pci_device *vdev) +{ + struct vfio_pci_vendor_driver *driver; + int ret = -ENODEV; + + request_module("vfio-pci:%x-%x", vdev->pdev->vendor, + vdev->pdev->device); + + mutex_lock(&vfio_pci.vendor_drivers_lock); + list_for_each_entry(driver, &vfio_pci.vendor_drivers_list, next) { + void *data; + + if (!try_module_get(driver->ops->owner)) + continue; + + data = driver->ops->probe(vdev->pdev); + if (IS_ERR(data)) { + module_put(driver->ops->owner); + continue; + } + vdev->vendor_driver = driver; + vdev->vendor_data = data; + ret = 0; + break; + } + mutex_unlock(&vfio_pci.vendor_drivers_lock); + return ret; +} + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_device *vdev; @@ -2477,7 +2571,11 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vfio_pci_set_power_state(vdev, PCI_D3hot); }
- ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + if (probe_vendor_drivers(vdev)) + ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + else + ret = vfio_add_group_dev(&pdev->dev, + vdev->vendor_driver->ops->device_ops, vdev); if (ret) goto out_power; return 0; @@ -2517,6 +2615,12 @@ static void vfio_pci_remove(struct pci_dev *pdev) vfio_pci_set_power_state(vdev, PCI_D0);
mutex_destroy(&vdev->ioeventfds_lock); + + if (vdev->vendor_driver) { + vdev->vendor_driver->ops->remove(vdev->vendor_data); + module_put(vdev->vendor_driver->ops->owner); + } + kfree(vdev->region); kfree(vdev->pm_save); kfree(vdev); @@ -2868,6 +2972,9 @@ static int __init vfio_pci_init(void) if (ret) return ret;
+ mutex_init(&vfio_pci.vendor_drivers_lock); + INIT_LIST_HEAD(&vfio_pci.vendor_drivers_list); + /* Register and scan for devices */ ret = pci_register_driver(&vfio_pci_driver); if (ret) @@ -2885,6 +2992,60 @@ static int __init vfio_pci_init(void) return ret; }
+int __vfio_pci_register_vendor_driver(struct vfio_pci_vendor_driver_ops *ops) +{ + struct vfio_pci_vendor_driver *driver, *tmp; + + if (!ops || !ops->device_ops) + return -EINVAL; + + driver = kzalloc(sizeof(*driver), GFP_KERNEL); + if (!driver) + return -ENOMEM; + + driver->ops = ops; + + mutex_lock(&vfio_pci.vendor_drivers_lock); + + /* Check for duplicates */ + list_for_each_entry(tmp, &vfio_pci.vendor_drivers_list, next) { + if (tmp->ops->device_ops == ops->device_ops) { + mutex_unlock(&vfio_pci.vendor_drivers_lock); + kfree(driver); + return -EINVAL; + } + } + + list_add(&driver->next, &vfio_pci.vendor_drivers_list); + + mutex_unlock(&vfio_pci.vendor_drivers_lock); + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + return 0; +} +EXPORT_SYMBOL_GPL(__vfio_pci_register_vendor_driver); + +void vfio_pci_unregister_vendor_driver(struct vfio_device_ops *device_ops) +{ + struct vfio_pci_vendor_driver *driver, *tmp; + + mutex_lock(&vfio_pci.vendor_drivers_lock); + list_for_each_entry_safe(driver, tmp, + &vfio_pci.vendor_drivers_list, next) { + if (driver->ops->device_ops == device_ops) { + list_del(&driver->next); + mutex_unlock(&vfio_pci.vendor_drivers_lock); + kfree(driver); + module_put(THIS_MODULE); + return; + } + } + mutex_unlock(&vfio_pci.vendor_drivers_lock); +} +EXPORT_SYMBOL_GPL(vfio_pci_unregister_vendor_driver); + module_init(vfio_pci_init); module_exit(vfio_pci_cleanup);
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 5944f96ced0c..318328602874 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -112,6 +112,11 @@ struct vfio_pci_mmap_vma { struct list_head vma_next; };
+struct vfio_pci_vendor_driver { + const struct vfio_pci_vendor_driver_ops *ops; + struct list_head next; +}; + struct vfio_pci_device { struct pci_dev *pdev; void __iomem *barmap[PCI_STD_NUM_BARS]; @@ -127,6 +132,8 @@ struct vfio_pci_device { struct vfio_ext_irq *ext_irqs; int num_ext_irqs; int num_regions; + int num_vendor_regions; + int num_vendor_irqs; struct vfio_pci_region *region; u8 msi_qmax; u8 msix_bar; @@ -163,6 +170,8 @@ struct vfio_pci_device { struct mutex vma_lock; struct list_head vma_list; struct rw_semaphore memory_lock; + void *vendor_data; + struct vfio_pci_vendor_driver *vendor_driver; };
#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 78c494fe35cc..43c11b5f5486 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -224,6 +224,16 @@ static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) return 0; }
+void __iomem *vfio_pci_get_barmap(void *device_data, int bar) +{ + int ret; + struct vfio_pci_device *vdev = device_data; + + ret = vfio_pci_setup_barmap(vdev, bar); + return ret ? ERR_PTR(ret) : vdev->barmap[bar]; +} +EXPORT_SYMBOL_GPL(vfio_pci_get_barmap); + ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 38d3c6a8dc7e..c3819fadf7c8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -214,4 +214,56 @@ extern int vfio_virqfd_enable(void *opaque, void *data, struct virqfd **pvirqfd, int fd); extern void vfio_virqfd_disable(struct virqfd **pvirqfd);
+extern int vfio_pci_num_regions(void *device_data); +extern struct pci_dev *vfio_pci_pdev(void *device_data); +extern long vfio_pci_ioctl(void *device_data, + unsigned int cmd, unsigned long arg); +extern ssize_t vfio_pci_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos); +extern ssize_t vfio_pci_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos); +extern int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma); +extern void vfio_pci_request(void *device_data, unsigned int count); +extern int vfio_pci_open(void *device_data); +extern void vfio_pci_release(void *device_data); +extern void *vfio_pci_vendor_data(void *device_data); +extern int vfio_pci_set_vendor_regions(void *device_data, + int num_vendor_regions); + +struct vfio_pci_vendor_driver_ops { + char *name; + struct module *owner; + void *(*probe)(struct pci_dev *pdev); + void (*remove)(void *vendor_data); + struct vfio_device_ops *device_ops; +}; +int __vfio_pci_register_vendor_driver(struct vfio_pci_vendor_driver_ops *ops); +void vfio_pci_unregister_vendor_driver(struct vfio_device_ops *device_ops); + +#define vfio_pci_register_vendor_driver(__name, __probe, __remove, \ + __device_ops) \ +static struct vfio_pci_vendor_driver_ops __ops ## _node = { \ + .owner = THIS_MODULE, \ + .name = __name, \ + .probe = __probe, \ + .remove = __remove, \ + .device_ops = __device_ops, \ +}; \ +__vfio_pci_register_vendor_driver(&__ops ## _node) + +#define module_vfio_pci_register_vendor_handler(name, probe, remove, \ + device_ops) \ +static int __init device_ops ## _module_init(void) \ +{ \ + vfio_pci_register_vendor_driver(name, probe, remove, \ + device_ops); \ + return 0; \ +}; \ +static void __exit device_ops ## _module_exit(void) \ +{ \ + vfio_pci_unregister_vendor_driver(device_ops); \ +}; \ +module_init(device_ops ## _module_init); \ +module_exit(device_ops ## _module_exit) + #endif /* VFIO_H */
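As promised above, a usage sketch of a hypothetical vendor module against this API (every name below — my_probe, my_remove, my_device_ops, the IDs — is illustrative). Because module_vfio_pci_register_vendor_handler token-pastes its device_ops argument into the generated function names, this sketch calls the underlying registration functions directly, and it delegates most callbacks to the newly exported vfio-pci helpers:

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/vfio.h>

static void *my_probe(struct pci_dev *pdev)
{
        /* Return vendor data on success, or an ERR_PTR() to decline. */
        return NULL;
}

static void my_remove(void *vendor_data)
{
}

static struct vfio_device_ops my_device_ops = {
        .name           = "my-vendor",
        .open           = vfio_pci_open,
        .release        = vfio_pci_release,
        .ioctl          = vfio_pci_ioctl,
        .read           = vfio_pci_read,
        .write          = vfio_pci_write,
        .mmap           = vfio_pci_mmap,
        .request        = vfio_pci_request,
};

static struct vfio_pci_vendor_driver_ops my_vendor_ops = {
        .name           = "my-vendor",
        .owner          = THIS_MODULE,
        .probe          = my_probe,
        .remove         = my_remove,
        .device_ops     = &my_device_ops,
};

static int __init my_vendor_init(void)
{
        return __vfio_pci_register_vendor_driver(&my_vendor_ops);
}
module_init(my_vendor_init);

static void __exit my_vendor_exit(void)
{
        vfio_pci_unregister_vendor_driver(&my_device_ops);
}
module_exit(my_vendor_exit);

/* Matches the request_module("vfio-pci:%x-%x", ...) in probe_vendor_drivers();
 * the vendor:device pair here is a hypothetical example. */
MODULE_ALIAS("vfio-pci:19e5-a256");
MODULE_LICENSE("GPL");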
From: Longfang Liu liulongfang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I473Q4?from=project-issue
----------------------------------------------------------------------
Add a device state recording function for the accelerator live migration driver.
Signed-off-by: Longfang Liu liulongfang@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 15 +++++++++++++++ drivers/crypto/hisilicon/qm.h | 7 +++++++ 2 files changed, 22 insertions(+)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 369562d34d66..2c1b04497e4b 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3266,6 +3266,13 @@ static int hisi_qp_memory_init(struct hisi_qm *qm, size_t dma_size, int id) return 0; }
+static void hisi_qm_set_state(struct hisi_qm *qm, enum vf_state state) +{ + /* set vf driver state */ + if (qm->ver > QM_HW_V2) + writel(state, qm->io_base + QM_VF_STATE); +} + static void hisi_qm_pre_init(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -3365,6 +3372,8 @@ void hisi_qm_uninit(struct hisi_qm *qm) qm->qdma.va, qm->qdma.dma); }
+ hisi_qm_set_state(qm, VF_NOT_READY); + qm_irq_unregister(qm); hisi_qm_pci_uninit(qm); uacce_remove(qm->uacce); @@ -3578,6 +3587,8 @@ int hisi_qm_start(struct hisi_qm *qm) if (!ret) atomic_set(&qm->status.flags, QM_START);
+ hisi_qm_set_state(qm, VF_READY); + err_unlock: up_write(&qm->qps_lock); return ret; @@ -3672,6 +3683,8 @@ int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r) struct device *dev = &qm->pdev->dev; int ret = 0;
+ hisi_qm_set_state(qm, VF_PREPARE); + down_write(&qm->qps_lock);
qm->status.stop_reason = r; @@ -5640,6 +5653,8 @@ static int hisi_qm_pci_init(struct hisi_qm *qm) goto err_get_pci_res; pci_set_master(pdev);
+ hisi_qm_set_state(qm, VF_PREPARE); + if (!qm->ops->get_irq_num) { ret = -EOPNOTSUPP; goto err_get_pci_res; diff --git a/drivers/crypto/hisilicon/qm.h b/drivers/crypto/hisilicon/qm.h index 3068093229a5..718687dd7242 100644 --- a/drivers/crypto/hisilicon/qm.h +++ b/drivers/crypto/hisilicon/qm.h @@ -80,6 +80,7 @@ #define QM_SHAPER_CFG 0x100164 #define QM_SHAPER_ENABLE BIT(30) #define QM_SHAPER_TYPE1_OFFSET 10 +#define QM_VF_STATE 0x0060
/* page number for queue file region */ #define QM_DOORBELL_PAGE_NR 1 @@ -109,6 +110,12 @@ enum qp_state { QP_CLOSE, };
+enum vf_state { + VF_READY = 0x0, + VF_NOT_READY, + VF_PREPARE, +}; + enum qm_hw_ver { QM_HW_UNKNOWN = -1, QM_HW_V1 = 0x20,
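The state written here is consumed on the other side of the migration path: the migration driver later in this series polls QM_VF_STATE in its qm_wait_dev_ready() helper before saving VF state. A sketch of that consumer, assuming the qm.h constants above plus the migration driver's POLL_PERIOD/POLL_TIMEOUT:

#include <linux/iopoll.h>

/* Wait for the VF driver to leave VF_NOT_READY (bit 0 set), mirroring
 * qm_wait_dev_ready() in the migration driver; returns 0 on success,
 * -ETIMEDOUT on hardware timeout. */
static int wait_vf_driver_ready(struct hisi_qm *qm)
{
        u32 val;

        return readl_relaxed_poll_timeout(qm->io_base + QM_VF_STATE,
                                          val, !(val & 0x1),
                                          POLL_PERIOD, POLL_TIMEOUT);
}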
From: Longfang Liu liulongfang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I473Q4?from=project-issue
----------------------------------------------------------------------
This driver provides all device operations once it has been successfully probed by the vfio-pci driver.
Signed-off-by: Longfang Liu liulongfang@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + drivers/crypto/hisilicon/Kconfig | 7 + drivers/crypto/hisilicon/Makefile | 1 + drivers/crypto/hisilicon/migration/Makefile | 2 + .../hisilicon/migration/acc_vf_migration.c | 1719 +++++++++++++++++ .../hisilicon/migration/acc_vf_migration.h | 242 +++ 6 files changed, 1972 insertions(+) create mode 100644 drivers/crypto/hisilicon/migration/Makefile create mode 100644 drivers/crypto/hisilicon/migration/acc_vf_migration.c create mode 100644 drivers/crypto/hisilicon/migration/acc_vf_migration.h
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 44a9b95abc74..fbaa20639d81 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6681,6 +6681,7 @@ CONFIG_CRYPTO_DEV_HISI_SEC2=m CONFIG_CRYPTO_DEV_HISI_QM=m CONFIG_CRYPTO_DEV_HISI_ZIP=m CONFIG_CRYPTO_DEV_HISI_HPRE=m +CONFIG_CRYPTO_DEV_HISI_MIGRATION=m # CONFIG_CRYPTO_DEV_AMLOGIC_GXL is not set CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig index e572f9982d4e..ae0bd02f5c40 100644 --- a/drivers/crypto/hisilicon/Kconfig +++ b/drivers/crypto/hisilicon/Kconfig @@ -81,3 +81,10 @@ config CRYPTO_DEV_HISI_TRNG select CRYPTO_RNG help Support for HiSilicon TRNG Driver. + +config CRYPTO_DEV_HISI_MIGRATION + tristate "Support for HISI MIGRATION Driver" + depends on ARM64 && ACPI + select PCI_IOV + help + Support for HiSilicon vfio pci to support VF live migration. diff --git a/drivers/crypto/hisilicon/Makefile b/drivers/crypto/hisilicon/Makefile index 1e89269a2e4b..da68eab4c3ae 100644 --- a/drivers/crypto/hisilicon/Makefile +++ b/drivers/crypto/hisilicon/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_CRYPTO_DEV_HISI_QM) += hisi_qm.o hisi_qm-objs = qm.o sgl.o obj-$(CONFIG_CRYPTO_DEV_HISI_ZIP) += zip/ obj-$(CONFIG_CRYPTO_DEV_HISI_TRNG) += trng/ +obj-$(CONFIG_CRYPTO_DEV_HISI_MIGRATION) += migration/ diff --git a/drivers/crypto/hisilicon/migration/Makefile b/drivers/crypto/hisilicon/migration/Makefile new file mode 100644 index 000000000000..6493f527dd98 --- /dev/null +++ b/drivers/crypto/hisilicon/migration/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_CRYPTO_DEV_HISI_MIGRATION) += hisi_migration.o +hisi_migration-objs = acc_vf_migration.o diff --git a/drivers/crypto/hisilicon/migration/acc_vf_migration.c b/drivers/crypto/hisilicon/migration/acc_vf_migration.c new file mode 100644 index 000000000000..63c396d55344 --- /dev/null +++ b/drivers/crypto/hisilicon/migration/acc_vf_migration.c @@ -0,0 +1,1719 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 HiSilicon Limited. 
*/ + +#include <linux/device.h> +#include <linux/debugfs.h> +#include <linux/eventfd.h> +#include <linux/file.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/sysfs.h> +#include <linux/vfio.h> + +#include "acc_vf_migration.h" + +#define VDM_OFFSET(x) offsetof(struct vfio_device_migration_info, x) +static struct dentry *mig_debugfs_root; +static int mig_root_ref; + +/* return 0 mailbox ready, -ETIMEDOUT hardware timeout */ +static int qm_wait_mb_ready(struct hisi_qm *qm) +{ + u32 val; + + return readl_relaxed_poll_timeout(qm->io_base + QM_MB_CMD_SEND_BASE, + val, !((val >> QM_MB_BUSY_SHIFT) & + 0x1), POLL_PERIOD, POLL_TIMEOUT); +} + +/* return 0 VM acc device ready, -ETIMEDOUT hardware timeout */ +static int qm_wait_dev_ready(struct hisi_qm *qm) +{ + u32 val; + + return readl_relaxed_poll_timeout(qm->io_base + QM_VF_STATE, + val, !(val & 0x1), POLL_PERIOD, POLL_TIMEOUT); +} + + +/* 128 bit should be written to hardware at one time to trigger a mailbox */ +static void qm_mb_write(struct hisi_qm *qm, const void *src) +{ + void __iomem *fun_base = qm->io_base + QM_MB_CMD_SEND_BASE; + unsigned long tmp0 = 0; + unsigned long tmp1 = 0; + + if (!IS_ENABLED(CONFIG_ARM64)) { + memcpy_toio(fun_base, src, 16); + wmb(); + return; + } + + asm volatile("ldp %0, %1, %3\n" + "stp %0, %1, %2\n" + "dsb sy\n" + : "=&r" (tmp0), + "=&r" (tmp1), + "+Q" (*((char __iomem *)fun_base)) + : "Q" (*((char *)src)) + : "memory"); +} + +static void qm_mb_pre_init(struct qm_mailbox *mailbox, u8 cmd, + u16 queue, bool op) +{ + mailbox->w0 = cpu_to_le16(cmd | + (op ? 0x1 << QM_MB_OP_SHIFT : 0) | + (0x1 << QM_MB_BUSY_SHIFT)); + mailbox->queue_num = cpu_to_le16(queue); + mailbox->rsvd = 0; +} + +static int qm_mb_nolock(struct hisi_qm *qm, struct qm_mailbox *mailbox) +{ + int cnt = 0; + + if (unlikely(qm_wait_mb_ready(qm))) { + dev_err(&qm->pdev->dev, "QM mailbox is busy to start!\n"); + return -EBUSY; + } + + qm_mb_write(qm, mailbox); + while (true) { + if (!qm_wait_mb_ready(qm)) + break; + if (++cnt > QM_MB_MAX_WAIT_CNT) { + dev_err(&qm->pdev->dev, "QM mailbox operation timeout!\n"); + return -EBUSY; + } + } + return 0; +} + +static int qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue, + bool op) +{ + struct qm_mailbox mailbox; + int ret; + + dev_dbg(&qm->pdev->dev, "QM mailbox request to q%u: %u-0x%llx\n", + queue, cmd, (unsigned long long)dma_addr); + + qm_mb_pre_init(&mailbox, cmd, queue, op); + mailbox.base_l = cpu_to_le32(lower_32_bits(dma_addr)); + mailbox.base_h = cpu_to_le32(upper_32_bits(dma_addr)); + + mutex_lock(&qm->mailbox_lock); + ret = qm_mb_nolock(qm, &mailbox); + mutex_unlock(&qm->mailbox_lock); + + return ret; +} + +/* + * Each state Reg is checked 100 times, + * with a delay of 100 microseconds after each check + */ +static u32 acc_check_reg_state(struct hisi_qm *qm, u32 regs) +{ + int check_times = 0; + u32 state; + + state = readl(qm->io_base + regs); + while (state && check_times < ERROR_CHECK_TIMEOUT) { + udelay(CHECK_DELAY_TIME); + state = readl(qm->io_base + regs); + check_times++; + } + + return state; +} + +/* Check the PF's RAS state and Function INT state */ +static int qm_check_int_state(struct acc_vf_migration *acc_vf_dev) +{ + struct hisi_qm *vfqm = acc_vf_dev->vf_qm; + struct hisi_qm *qm = acc_vf_dev->pf_qm; + struct device *dev = &qm->pdev->dev; + u32 state; + + /* Check RAS state */ + state = acc_check_reg_state(qm, QM_ABNORMAL_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM RAS 
state!\n"); + return -EBUSY; + } + + /* Check Function Communication state between PF and VF */ + state = acc_check_reg_state(vfqm, QM_IFC_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM IFC INT state!\n"); + return -EBUSY; + } + state = acc_check_reg_state(vfqm, QM_IFC_INT_SET_V); + if (state) { + dev_err(dev, "failed to check QM IFC INT SET state!\n"); + return -EBUSY; + } + + /* Check submodule task state */ + switch (acc_vf_dev->acc_type) { + case HISI_SEC: + state = acc_check_reg_state(qm, SEC_CORE_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM SEC Core INT state!\n"); + return -EBUSY; + } + break; + case HISI_HPRE: + state = acc_check_reg_state(qm, HPRE_HAC_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM HPRE HAC INT state!\n"); + return -EBUSY; + } + break; + case HISI_ZIP: + state = acc_check_reg_state(qm, HZIP_CORE_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM ZIP Core INT state!\n"); + return -EBUSY; + } + break; + default: + dev_err(dev, "failed to detect acc module type!\n"); + return -EINVAL; + } + + return 0; +} + +static int qm_read_reg(struct hisi_qm *qm, u32 reg_addr, + u32 *data, u8 nums) +{ + int i; + + if (nums < 1 || nums > QM_REGS_MAX_LEN) { + dev_err(&qm->pdev->dev, "QM read input parameter is error!\n"); + return -EINVAL; + } + + for (i = 0; i < nums; i++) { + data[i] = readl(qm->io_base + reg_addr); + reg_addr += QM_REG_ADDR_OFFSET; + } + + return 0; +} + +static int qm_write_reg(struct hisi_qm *qm, u32 reg_addr, + u32 *data, u8 nums) +{ + int i; + + if (nums < 1 || nums > QM_REGS_MAX_LEN) { + dev_err(&qm->pdev->dev, "QM write input parameter is error!\n"); + return -EINVAL; + } + + for (i = 0; i < nums; i++) { + writel(data[i], qm->io_base + reg_addr); + reg_addr += QM_REG_ADDR_OFFSET; + } + + return 0; +} + +static int qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number) +{ + u64 sqc_vft; + int ret; + + ret = qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1); + if (ret) + return ret; + + sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); + *base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); + *number = (QM_SQC_VFT_NUM_MASK_V2 & + (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; + + return 0; +} + +static int qm_get_sqc(struct hisi_qm *qm, u64 *addr) +{ + int ret; + + ret = qm_mb(qm, QM_MB_CMD_SQC_BT, 0, 0, 1); + if (ret) + return ret; + + *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); + + return 0; +} + +static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) +{ + int ret; + + ret = qm_mb(qm, QM_MB_CMD_CQC_BT, 0, 0, 1); + if (ret) + return ret; + + *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); + + return 0; +} + +static int qm_rw_regs_read(struct hisi_qm *qm, struct acc_vf_data *vf_data) +{ + struct device *dev = &qm->pdev->dev; + int ret; + + ret = qm_read_reg(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); + if (ret) { + dev_err(dev, "failed to read QM_VF_AEQ_INT_MASK!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_VF_EQ_INT_MASK, &vf_data->eq_int_mask, 1); + if (ret) { + dev_err(dev, "failed to read QM_VF_EQ_INT_MASK!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_IFC_INT_SOURCE_V, + &vf_data->ifc_int_source, 1); + if (ret) { + dev_err(dev, "failed to read QM_IFC_INT_SOURCE_V!\n"); + return ret; + } + + ret = qm_read_reg(qm, 
QM_IFC_INT_MASK, &vf_data->ifc_int_mask, 1); + if (ret) { + dev_err(dev, "failed to read QM_IFC_INT_MASK!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_IFC_INT_SET_V, &vf_data->ifc_int_set, 1); + if (ret) { + dev_err(dev, "failed to read QM_IFC_INT_SET_V!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_PAGE_SIZE, &vf_data->page_size, 1); + if (ret) { + dev_err(dev, "failed to read QM_PAGE_SIZE!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_VF_STATE, &vf_data->vf_state, 1); + if (ret) { + dev_err(dev, "failed to read QM_VF_STATE!\n"); + return ret; + } + + /* QM_EQC_DW has 7 regs */ + ret = qm_read_reg(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + if (ret) { + dev_err(dev, "failed to read QM_EQC_DW!\n"); + return ret; + } + + /* QM_AEQC_DW has 7 regs */ + ret = qm_read_reg(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + if (ret) { + dev_err(dev, "failed to read QM_AEQC_DW!\n"); + return ret; + } + + return 0; +} + +static int qm_rw_regs_write(struct hisi_qm *qm, struct acc_vf_data *vf_data) +{ + struct device *dev = &qm->pdev->dev; + int ret; + + /* check VF state */ + if (unlikely(qm_wait_mb_ready(qm))) { + dev_err(&qm->pdev->dev, "QM device is not ready to write!\n"); + return -EBUSY; + } + + ret = qm_write_reg(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_AEQ_INT_MASK!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_VF_EQ_INT_MASK, &vf_data->eq_int_mask, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_EQ_INT_MASK!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_IFC_INT_SOURCE_V, + &vf_data->ifc_int_source, 1); + if (ret) { + dev_err(dev, "failed to write QM_IFC_INT_SOURCE_V!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_IFC_INT_MASK, &vf_data->ifc_int_mask, 1); + if (ret) { + dev_err(dev, "failed to write QM_IFC_INT_MASK!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_IFC_INT_SET_V, &vf_data->ifc_int_set, 1); + if (ret) { + dev_err(dev, "failed to write QM_IFC_INT_SET_V!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1); + if (ret) { + dev_err(dev, "failed to write QM_QUE_ISO_CFG_V!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_PAGE_SIZE, &vf_data->page_size, 1); + if (ret) { + dev_err(dev, "failed to write QM_PAGE_SIZE!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_VF_STATE, &vf_data->vf_state, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_STATE!\n"); + return ret; + } + + /* QM_EQC_DW has 7 regs */ + ret = qm_write_reg(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + if (ret) { + dev_err(dev, "failed to write QM_EQC_DW!\n"); + return ret; + } + + /* QM_AEQC_DW has 7 regs */ + ret = qm_write_reg(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + if (ret) { + dev_err(dev, "failed to write QM_AEQC_DW!\n"); + return ret; + } + + return 0; +} + +/* + * the vf QM have unbind from host, insmod in the VM + * so, qm just have the addr from pci dev + * others is null. + * so we need read from the SEC hardware REGs. 
+ */ +static int vf_migration_data_store(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct acc_vf_data *vf_data = acc_vf_dev->vf_data; + struct device *dev = &qm->pdev->dev; + int ret; + + ret = qm_rw_regs_read(qm, vf_data); + if (ret) { + dev_err(dev, "failed to read QM regs!\n"); + return -EINVAL; + } + + /* + * every Reg is 32 bit, the dma address is 64 bit + * so, the dma address is store in the Reg2 and Reg1 + */ + vf_data->eqe_dma = vf_data->qm_eqc_dw[2]; + vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[1]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[2]; + vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[1]; + + /* Through SQC_BT/CQC_BT to get sqc and cqc address */ + ret = qm_get_sqc(qm, &vf_data->sqc_dma); + if (ret) { + dev_err(dev, "failed to read SQC addr!\n"); + return -EINVAL; + } + + ret = qm_get_cqc(qm, &vf_data->cqc_dma); + if (ret) { + dev_err(dev, "failed to read CQC addr!\n"); + return -EINVAL; + } + + return 0; +} + +static void qm_dev_cmd_init(struct hisi_qm *qm) +{ + /* clear VF communication status registers. */ + writel(0x1, qm->io_base + QM_IFC_INT_SOURCE_V); + + /* enable pf and vf communication. */ + writel(0x0, qm->io_base + QM_IFC_INT_MASK); +} + +static void qm_db(struct hisi_qm *qm, u16 qn, u8 cmd, + u16 index, u8 priority) +{ + void __iomem *io_base = qm->io_base; + u16 randata = 0; + u64 doorbell; + + if (cmd == QM_DOORBELL_CMD_SQ || cmd == QM_DOORBELL_CMD_CQ) + io_base = qm->db_io_base + (u64)qn * qm->db_interval + + QM_DOORBELL_SQ_CQ_BASE_V2; + else + io_base += QM_DOORBELL_EQ_AEQ_BASE_V2; + + doorbell = qn | ((u64)cmd << QM_DB_CMD_SHIFT_V2) | + ((u64)randata << QM_DB_RAND_SHIFT_V2) | + ((u64)index << QM_DB_INDEX_SHIFT_V2) | + ((u64)priority << QM_DB_PRIORITY_SHIFT_V2); + + writeq(doorbell, io_base); +} + +static void vf_qm_fun_restart(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct acc_vf_data *vf_data = acc_vf_dev->vf_data; + struct device *dev = &qm->pdev->dev; + int i; + + /* + * When the system is rebooted, the SMMU page table is destroyed, + * and the QP queue cannot be returned normally at this time. + * if vf_ready == 0x2, don't need to restart QP. 
+ */ + if (vf_data->vf_state == VF_PREPARE) { + dev_err(dev, "failed to restart VF!\n"); + return; + } + + for (i = 0; i < qm->qp_num; i++) + qm_db(qm, i, QM_DOORBELL_CMD_SQ, 0, 1); +} + +static int vf_match_info_check(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct acc_vf_data *vf_data = acc_vf_dev->vf_data; + struct device *dev = &qm->pdev->dev; + u32 que_iso_state; + int ret; + + /* vf acc type check */ + if (vf_data->acc_type != acc_vf_dev->acc_type) { + dev_err(dev, "failed to match VF acc type!\n"); + return -EINVAL; + } + + /* vf qp num check */ + ret = qm_get_vft(qm, &qm->qp_base, &qm->qp_num); + if (ret || qm->qp_num <= 1) { + dev_err(dev, "failed to get vft qp nums!\n"); + return ret; + } + + if (vf_data->qp_num != qm->qp_num) { + dev_err(dev, "failed to match VF qp num!\n"); + return -EINVAL; + } + + /* vf isolation state check */ + ret = qm_read_reg(qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1); + if (ret) { + dev_err(dev, "failed to read QM_QUE_ISO_CFG_V!\n"); + return ret; + } + if (vf_data->que_iso_cfg != que_iso_state) { + dev_err(dev, "failed to match isolation state!\n"); + return -EINVAL; + } + + return 0; +} + +static int vf_migration_data_recover(struct hisi_qm *qm, + struct acc_vf_data *vf_data) +{ + struct device *dev = &qm->pdev->dev; + int ret; + + qm->eqe_dma = vf_data->eqe_dma; + qm->aeqe_dma = vf_data->aeqe_dma; + qm->sqc_dma = vf_data->sqc_dma; + qm->cqc_dma = vf_data->cqc_dma; + + qm->qp_base = vf_data->qp_base; + qm->qp_num = vf_data->qp_num; + + ret = qm_rw_regs_write(qm, vf_data); + if (ret) { + dev_err(dev, "Set VF regs failed!\n"); + return ret; + } + + ret = qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0); + if (ret) { + dev_err(dev, "Set sqc failed!\n"); + return ret; + } + + ret = qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0); + if (ret) { + dev_err(dev, "Set cqc failed!\n"); + return ret; + } + + /* which ACC module need to reinit? 
*/ + qm_dev_cmd_init(qm); + + return 0; +} + +static int vf_qm_cache_wb(struct hisi_qm *qm) +{ + unsigned int val; + + writel(0x1, qm->io_base + QM_CACHE_WB_START); + if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE, + val, val & BIT(0), POLL_PERIOD, + POLL_TIMEOUT)) { + dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail!\n"); + return -EINVAL; + } + + return 0; +} + +static int vf_qm_func_stop(struct hisi_qm *qm) +{ + return qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0); +} + +static int pf_qm_get_qp_num(struct hisi_qm *qm, int vf_id, + u32 *rbase, u32 *rnumber) +{ + unsigned int val; + u64 sqc_vft; + int ret; + + ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val, + val & BIT(0), POLL_PERIOD, + POLL_TIMEOUT); + if (ret) + return ret; + + writel(0x1, qm->io_base + QM_VFT_CFG_OP_WR); + /* 0 mean SQC VFT */ + writel(0x0, qm->io_base + QM_VFT_CFG_TYPE); + writel(vf_id, qm->io_base + QM_VFT_CFG); + + writel(0x0, qm->io_base + QM_VFT_CFG_RDY); + writel(0x1, qm->io_base + QM_VFT_CFG_OP_ENABLE); + + ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val, + val & BIT(0), POLL_PERIOD, + POLL_TIMEOUT); + if (ret) + return ret; + + sqc_vft = readl(qm->io_base + QM_VFT_CFG_DATA_L) | + ((u64)readl(qm->io_base + QM_VFT_CFG_DATA_H) << + QM_XQC_ADDR_OFFSET); + *rbase = QM_SQC_VFT_BASE_MASK_V2 & + (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); + *rnumber = (QM_SQC_VFT_NUM_MASK_V2 & + (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; + + return 0; +} + +static int pf_qm_state_pre_save(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct acc_vf_data *vf_data = acc_vf_dev->vf_data; + struct device *dev = &qm->pdev->dev; + int vf_id = acc_vf_dev->vf_id; + int ret; + + /* vf acc type save */ + vf_data->acc_type = acc_vf_dev->acc_type; + + /* vf qp num save from PF */ + ret = pf_qm_get_qp_num(qm, vf_id, &qm->qp_base, &qm->qp_num); + if (ret || qm->qp_num <= 1) { + dev_err(dev, "failed to get vft qp nums!\n"); + return -EINVAL; + } + vf_data->qp_base = qm->qp_base; + vf_data->qp_num = qm->qp_num; + + /* vf isolation state save from PF */ + ret = qm_read_reg(qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1); + if (ret) { + dev_err(dev, "failed to read QM_QUE_ISO_CFG_V!\n"); + return ret; + } + + return 0; +} + +static int vf_qm_state_save(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct device *dev = &acc_vf_dev->vf_dev->dev; + int ret; + + /* + * check VM task driver state + * if vf_ready == 0x1, skip migrate. 
+ */ + if (unlikely(qm_wait_dev_ready(qm))) { + acc_vf_dev->mig_ignore = true; + dev_err(&qm->pdev->dev, "QM device is not ready to read!\n"); + return 0; + } + + /* First stop the ACC vf function */ + ret = vf_qm_func_stop(qm); + if (ret) { + dev_err(dev, "failed to stop QM VF function!\n"); + return ret; + } + + /* Check the VF's RAS and Interrution state */ + ret = qm_check_int_state(acc_vf_dev); + if (ret) { + dev_err(dev, "failed to check QM INT state!\n"); + goto state_error; + } + + /* hisi_qm_cache_wb store cache data to DDR */ + ret = vf_qm_cache_wb(qm); + if (ret) { + dev_err(dev, "failed to writeback QM Cache!\n"); + goto state_error; + } + + ret = vf_migration_data_store(qm, acc_vf_dev); + if (ret) { + dev_err(dev, "failed to get and store migration data!\n"); + goto state_error; + } + + return 0; + +state_error: + vf_qm_fun_restart(qm, acc_vf_dev); + return ret; +} + +static int vf_qm_state_resume(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct device *dev = &acc_vf_dev->vf_dev->dev; + int ret; + + /* recover data to VF */ + ret = vf_migration_data_recover(qm, acc_vf_dev->vf_data); + if (ret) { + dev_err(dev, "failed to recover the VF!\n"); + return ret; + } + + /* restart all destination VF's QP */ + vf_qm_fun_restart(qm, acc_vf_dev); + + return 0; +} + +static int acc_vf_set_device_state(struct acc_vf_migration *acc_vf_dev, + u32 state) +{ + struct vfio_device_migration_info *mig_ctl = acc_vf_dev->mig_ctl; + struct device *dev = &acc_vf_dev->vf_dev->dev; + struct hisi_qm *pfqm = acc_vf_dev->pf_qm; + struct hisi_qm *qm = acc_vf_dev->vf_qm; + int ret = 0; + + if (state == mig_ctl->device_state) + return 0; + + switch (state) { + case VFIO_DEVICE_STATE_RUNNING: + if (!mig_ctl->data_size) + break; + + if (mig_ctl->device_state == VFIO_DEVICE_STATE_RESUMING) { + ret = vf_qm_state_resume(qm, acc_vf_dev); + if (ret) { + dev_err(dev, "failed to resume device!\n"); + return -EFAULT; + } + } + + break; + case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: + /* ACC should in the pre cycle to read match information data */ + ret = pf_qm_state_pre_save(pfqm, acc_vf_dev); + if (ret) { + dev_err(dev, "failed to pre save device state!\n"); + return -EFAULT; + } + + /* set the pending_byte and match data size */ + mig_ctl->data_size = QM_MATCH_SIZE; + mig_ctl->pending_bytes = mig_ctl->data_size; + + break; + case VFIO_DEVICE_STATE_SAVING: + /* stop the vf function */ + ret = vf_qm_state_save(qm, acc_vf_dev); + if (ret) { + dev_err(dev, "failed to save device state!\n"); + return -EFAULT; + } + + if (acc_vf_dev->mig_ignore) { + mig_ctl->data_size = 0; + mig_ctl->pending_bytes = 0; + break; + } + + /* set the pending_byte and data_size */ + mig_ctl->data_size = sizeof(struct acc_vf_data); + mig_ctl->pending_bytes = mig_ctl->data_size; + + break; + case VFIO_DEVICE_STATE_STOP: + /* restart all VF's QP */ + vf_qm_fun_restart(qm, acc_vf_dev); + + break; + case VFIO_DEVICE_STATE_RESUMING: + + break; + default: + ret = -EFAULT; + } + + if (!ret) { + dev_info(dev, "migration state: %s ----------> %s!\n", + vf_dev_state[mig_ctl->device_state], + vf_dev_state[state]); + mig_ctl->device_state = state; + } + + return ret; +} + +static int acc_vf_data_transfer(struct acc_vf_migration *acc_vf_dev, + char __user *buf, size_t count, u64 pos, bool iswrite) +{ + struct vfio_device_migration_info *mig_ctl = acc_vf_dev->mig_ctl; + void *data_addr = acc_vf_dev->vf_data; + int ret = 0; + + if (!count) { + dev_err(&acc_vf_dev->vf_dev->dev, + "Qemu operation data size error!\n"); + 
return -EINVAL; + } + + data_addr += pos - mig_ctl->data_offset; + if (iswrite) { + ret = copy_from_user(data_addr, buf, count) ? + -EFAULT : count; + if (ret == count) + mig_ctl->pending_bytes += count; + } else { + ret = copy_to_user(buf, data_addr, count) ? + -EFAULT : count; + if (ret == count) + mig_ctl->pending_bytes -= count; + } + + return ret; +} + +static int acc_vf_region_migration_rw(struct acc_vf_migration *acc_vf_dev, + char __user *buf, size_t count, loff_t *ppos, bool iswrite) +{ + struct vfio_device_migration_info *mig_ctl = acc_vf_dev->mig_ctl; + struct device *dev = &acc_vf_dev->vf_dev->dev; + struct hisi_qm *qm = acc_vf_dev->vf_qm; + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + u32 device_state; + int ret = 0; + + switch (pos) { + case VDM_OFFSET(device_state): + if (count != sizeof(mig_ctl->device_state)) { + ret = -EINVAL; + break; + } + + if (iswrite) { + if (copy_from_user(&device_state, buf, count)) { + ret = -EFAULT; + break; + } + + ret = acc_vf_set_device_state(acc_vf_dev, + device_state) ? ret : count; + } else { + ret = copy_to_user(buf, &mig_ctl->device_state, + count) ? -EFAULT : count; + } + break; + case VDM_OFFSET(reserved): + ret = -EFAULT; + break; + case VDM_OFFSET(pending_bytes): + if (count != sizeof(mig_ctl->pending_bytes)) { + ret = -EINVAL; + break; + } + + if (iswrite) + ret = -EFAULT; + else + ret = copy_to_user(buf, &mig_ctl->pending_bytes, + count) ? -EFAULT : count; + break; + case VDM_OFFSET(data_offset): + if (count != sizeof(mig_ctl->data_offset)) { + ret = -EINVAL; + break; + } + if (iswrite) + ret = copy_from_user(&mig_ctl->data_offset, buf, count) ? + -EFAULT : count; + else + ret = copy_to_user(buf, &mig_ctl->data_offset, count) ? + -EFAULT : count; + break; + case VDM_OFFSET(data_size): + if (count != sizeof(mig_ctl->data_size)) { + ret = -EINVAL; + break; + } + + if (iswrite) + ret = copy_from_user(&mig_ctl->data_size, buf, count) ? + -EFAULT : count; + else + ret = copy_to_user(buf, &mig_ctl->data_size, count) ? 
+ -EFAULT : count; + break; + default: + ret = -EFAULT; + break; + } + + /* Transfer data section */ + if (pos >= mig_ctl->data_offset && + pos < MIGRATION_REGION_SZ) { + ret = acc_vf_data_transfer(acc_vf_dev, buf, + count, pos, iswrite); + if (ret != count) + return ret; + } + + if (mig_ctl->device_state == VFIO_DEVICE_STATE_RESUMING && + mig_ctl->pending_bytes == QM_MATCH_SIZE && + mig_ctl->data_size == QM_MATCH_SIZE) { + /* check the VF match information */ + ret = vf_match_info_check(qm, acc_vf_dev); + if (ret) { + dev_err(dev, "failed to check match information!\n"); + return -EFAULT; + } + ret = count; + + /* clear the VF match data size */ + mig_ctl->pending_bytes = 0; + mig_ctl->data_size = 0; + } + return ret; +} + +static int acc_vf_region_migration_mmap(struct acc_vf_migration *acc_vf_dev, + struct acc_vf_region *region, + struct vm_area_struct *vma) +{ + return -EFAULT; +} + +static void acc_vf_region_migration_release(struct acc_vf_migration *acc_vf_dev, + struct acc_vf_region *region) +{ + kfree(acc_vf_dev->mig_ctl); + acc_vf_dev->mig_ctl = NULL; +} + +static const struct acc_vf_region_ops acc_vf_region_ops_migration = { + .rw = acc_vf_region_migration_rw, + .release = acc_vf_region_migration_release, + .mmap = acc_vf_region_migration_mmap, +}; + +static int acc_vf_register_region(struct acc_vf_migration *acc_vf_dev, + const struct acc_vf_region_ops *ops, + void *data) +{ + struct acc_vf_region *regions; + + regions = krealloc(acc_vf_dev->regions, + (acc_vf_dev->num_regions + 1) * sizeof(*regions), + GFP_KERNEL); + if (!regions) + return -ENOMEM; + + acc_vf_dev->regions = regions; + regions[acc_vf_dev->num_regions].type = + VFIO_REGION_TYPE_MIGRATION; + regions[acc_vf_dev->num_regions].subtype = + VFIO_REGION_SUBTYPE_MIGRATION; + regions[acc_vf_dev->num_regions].ops = ops; + regions[acc_vf_dev->num_regions].size = + MIGRATION_REGION_SZ; + regions[acc_vf_dev->num_regions].flags = + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + regions[acc_vf_dev->num_regions].data = data; + acc_vf_dev->num_regions++; + + return 0; +} + +static long acc_vf_get_region_info(void *device_data, + unsigned int cmd, unsigned long arg) +{ + int num_vdev_regions = vfio_pci_num_regions(device_data); + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + struct vfio_region_info_cap_type cap_type; + struct acc_vf_region *regions; + struct vfio_region_info info; + struct vfio_info_cap caps; + unsigned long minsz; + int index, ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (cmd != VFIO_DEVICE_GET_REGION_INFO) + return -EINVAL; + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + goto default_handle; + + index = info.index - VFIO_PCI_NUM_REGIONS - num_vdev_regions; + if (index > acc_vf_dev->num_regions) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check region numbers!\n"); + return -EINVAL; + } + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + regions = acc_vf_dev->regions; + info.size = regions[index].size; + info.flags = regions[index].flags; + caps.buf = NULL; + caps.size = 0; + cap_type.header.id = VFIO_REGION_INFO_CAP_TYPE; + cap_type.header.version = 1; + cap_type.type = regions[index].type; + cap_type.subtype = regions[index].subtype; + + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + + if (regions[index].ops->add_cap) { + ret = 
regions[index].ops->add_cap(acc_vf_dev, + &regions[index], &caps); + if (ret) { + kfree(caps.buf); + return ret; + } + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; + +default_handle: + ret = vfio_pci_ioctl(device_data, cmd, arg); + if (ret) + return ret; + + if (info.index == VFIO_PCI_BAR0_REGION_INDEX) { + if (!acc_vf_dev->in_dirty_track) + return ret; + + /* read default handler's data back */ + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + /* update customized region info */ + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + } + + if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + /* + * ACC VF dev BAR2 region(64K) consists of both functional + * register space and migration control register space. + * Report only the first 32K(functional region) to Guest. + */ + info.size = pci_resource_len(acc_vf_dev->vf_dev, info.index) >> 1; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + } + + return ret; +} + +static int acc_vf_open(void *device_data) +{ + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + struct vfio_device_migration_info *mig_ctl; + __u64 mig_offset; + void *vf_data; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&acc_vf_dev->reflock); + if (!acc_vf_dev->refcnt) { + ret = acc_vf_register_region(acc_vf_dev, + &acc_vf_region_ops_migration, + NULL); + if (ret) + goto region_error; + vfio_pci_set_vendor_regions(device_data, + acc_vf_dev->num_regions); + + /* the data region must follow migration info */ + mig_offset = sizeof(struct vfio_device_migration_info); + mig_ctl = kzalloc(MIGRATION_REGION_SZ, GFP_KERNEL); + if (!mig_ctl) { + ret = -ENOMEM; + goto mig_error; + } + acc_vf_dev->mig_ctl = mig_ctl; + + vf_data = (void *)mig_ctl + mig_offset; + acc_vf_dev->vf_data = vf_data; + + mig_ctl->device_state = VFIO_DEVICE_STATE_RUNNING; + mig_ctl->data_offset = mig_offset; + mig_ctl->data_size = 0; + } + + ret = vfio_pci_open(device_data); + if (ret) + goto open_error; + + acc_vf_dev->refcnt++; + mutex_unlock(&acc_vf_dev->reflock); + + return 0; + +open_error: + if (!acc_vf_dev->refcnt) { + kfree(acc_vf_dev->mig_ctl); + acc_vf_dev->mig_ctl = NULL; + } +mig_error: + vfio_pci_set_vendor_regions(device_data, 0); +region_error: + mutex_unlock(&acc_vf_dev->reflock); + module_put(THIS_MODULE); + return ret; +} + +static void acc_vf_release(void *device_data) +{ + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + int i; + + mutex_lock(&acc_vf_dev->reflock); + if (!--acc_vf_dev->refcnt) { + for (i = 0; i < acc_vf_dev->num_regions; i++) { + if (!acc_vf_dev->regions[i].ops) + continue; + acc_vf_dev->regions[i].ops->release(acc_vf_dev, + &acc_vf_dev->regions[i]); + } + kfree(acc_vf_dev->regions); + acc_vf_dev->regions = NULL; + acc_vf_dev->num_regions = 0; + 
vfio_pci_set_vendor_regions(device_data, 0); + + kfree(acc_vf_dev->mig_ctl); + acc_vf_dev->mig_ctl = NULL; + } + vfio_pci_release(device_data); + mutex_unlock(&acc_vf_dev->reflock); + module_put(THIS_MODULE); +} + +static long acc_vf_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_REGION_INFO: + return acc_vf_get_region_info(device_data, cmd, arg); + default: + return vfio_pci_ioctl(device_data, cmd, arg); + } +} + +static ssize_t acc_vf_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + int num_vdev_regions = vfio_pci_num_regions(device_data); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int num_vendor_region = acc_vf_dev->num_regions; + struct acc_vf_region *region; + + if (index >= VFIO_PCI_NUM_REGIONS + num_vdev_regions + + num_vendor_region) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check read regions index!\n"); + return -EINVAL; + } + + if (index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + return vfio_pci_read(device_data, buf, count, ppos); + + index -= VFIO_PCI_NUM_REGIONS + num_vdev_regions; + + region = &acc_vf_dev->regions[index]; + if (!region->ops->rw) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check regions read ops!\n"); + return -EINVAL; + } + + return region->ops->rw(acc_vf_dev, buf, count, ppos, false); +} + +static ssize_t acc_vf_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + int num_vdev_regions = vfio_pci_num_regions(device_data); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int num_vendor_region = acc_vf_dev->num_regions; + struct acc_vf_region *region; + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + pr_debug("vfio bar 0 write\n"); + + if (index >= VFIO_PCI_NUM_REGIONS + num_vdev_regions + + num_vendor_region) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check write regions index!\n"); + return -EINVAL; + } + + if (index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + return vfio_pci_write(device_data, buf, count, ppos); + + index -= VFIO_PCI_NUM_REGIONS + num_vdev_regions; + + region = &acc_vf_dev->regions[index]; + + if (!region->ops->rw) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check regions write ops!\n"); + return -EINVAL; + } + + return region->ops->rw(acc_vf_dev, (char __user *)buf, + count, ppos, true); +} + +static int acc_vf_mmap(void *device_data, struct vm_area_struct *vma) +{ + return vfio_pci_mmap(device_data, vma); +} + +static void acc_vf_request(void *device_data, unsigned int count) +{ + vfio_pci_request(device_data, count); +} + +static struct vfio_device_ops acc_vf_device_ops_node = { + .name = "acc_vf", + .open = acc_vf_open, + .release = acc_vf_release, + .ioctl = acc_vf_ioctl, + .read = acc_vf_read, + .write = acc_vf_write, + .mmap = acc_vf_mmap, + .request = acc_vf_request, +}; + +static ssize_t acc_vf_debug_read(struct file *filp, char __user *buffer, + size_t count, loff_t *pos) +{ + char buf[VFIO_DEV_DBG_LEN]; + int len; + + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", + "echo 0: test vf data store\n" + "echo 1: test vf data writeback\n" + "echo 2: test vf send mailbox\n" + "echo 3: dump vf dev data\n" + "echo 4: dump migration state\n"); + + return simple_read_from_buffer(buffer, count, pos, buf, len); +} + +static ssize_t acc_vf_debug_write(struct file *filp, const char __user *buffer, + size_t count, loff_t 
*pos) +{ + struct acc_vf_migration *acc_vf_dev = filp->private_data; + struct device *dev = &acc_vf_dev->vf_dev->dev; + struct hisi_qm *qm = acc_vf_dev->vf_qm; + char tbuf[VFIO_DEV_DBG_LEN]; + unsigned long val; + u64 data; + int len, ret; + + if (*pos) + return 0; + + if (count >= VFIO_DEV_DBG_LEN) + return -ENOSPC; + + len = simple_write_to_buffer(tbuf, VFIO_DEV_DBG_LEN - 1, + pos, buffer, count); + if (len < 0) + return len; + tbuf[len] = '\0'; + if (kstrtoul(tbuf, 0, &val)) + return -EFAULT; + + switch (val) { + case STATE_SAVE: + ret = vf_qm_state_save(qm, acc_vf_dev); + if (ret) + return -EINVAL; + break; + case STATE_RESUME: + ret = vf_qm_state_resume(qm, acc_vf_dev); + if (ret) + return -EINVAL; + break; + case MB_TEST: + data = readl(qm->io_base + QM_MB_CMD_SEND_BASE); + dev_info(dev, "debug mailbox addr: 0x%lx, mailbox val: 0x%llx\n", + (uintptr_t)qm->phys_base, data); + break; + case MIG_DATA_DUMP: + dev_info(dev, "dumped vf migration data:\n"); + print_hex_dump(KERN_INFO, "Mig Data:", DUMP_PREFIX_OFFSET, + VFIO_DBG_LOG_LEN, 1, + (unsigned char *)acc_vf_dev->vf_data, + sizeof(struct acc_vf_data), false); + break; + case MIG_DEV_SHOW: + if (!acc_vf_dev->mig_ctl) + dev_info(dev, "migration region have release!\n"); + else + dev_info(dev, + "device state: %u\n" + "data offset: %llu\n" + "data size: %llu\n" + "pending bytes: %llu\n" + "data addr: 0x%lx\n", + acc_vf_dev->mig_ctl->device_state, + acc_vf_dev->mig_ctl->data_offset, + acc_vf_dev->mig_ctl->data_size, + acc_vf_dev->mig_ctl->pending_bytes, + (uintptr_t)acc_vf_dev->vf_data); + break; + default: + return -EINVAL; + } + + return count; +} + +static const struct file_operations acc_vf_debug_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = acc_vf_debug_read, + .write = acc_vf_debug_write, +}; + +static ssize_t acc_vf_state_read(struct file *filp, char __user *buffer, + size_t count, loff_t *pos) +{ + struct acc_vf_migration *acc_vf_dev = filp->private_data; + char buf[VFIO_DEV_DBG_LEN]; + u32 state; + int len; + + if (!acc_vf_dev->mig_ctl) { + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", "Invalid\n"); + } else { + state = acc_vf_dev->mig_ctl->device_state; + switch (state) { + case VFIO_DEVICE_STATE_RUNNING: + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", + "RUNNING\n"); + break; + case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", + "SAVING and RUNNING\n"); + break; + case VFIO_DEVICE_STATE_SAVING: + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", + "SAVING\n"); + break; + case VFIO_DEVICE_STATE_STOP: + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", + "STOP\n"); + break; + case VFIO_DEVICE_STATE_RESUMING: + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", + "RESUMING\n"); + break; + default: + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", + "Error\n"); + } + } + + return simple_read_from_buffer(buffer, count, pos, buf, len); +} + +static const struct file_operations acc_vf_state_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = acc_vf_state_read, +}; + +static void vf_debugfs_init(struct acc_vf_migration *acc_vf_dev) +{ + char name[VFIO_DEV_DBG_LEN]; + int node_id; + + if (!mig_root_ref) + mig_debugfs_root = debugfs_create_dir("vfio_acc", NULL); + mutex_lock(&acc_vf_dev->reflock); + mig_root_ref++; + mutex_unlock(&acc_vf_dev->reflock); + + node_id = dev_to_node(&acc_vf_dev->vf_dev->dev); + if (node_id < 0) + node_id = 0; + + if (acc_vf_dev->acc_type == HISI_SEC) + scnprintf(name, VFIO_DEV_DBG_LEN, "sec_vf%d-%d", + node_id, 
acc_vf_dev->vf_id); + else if (acc_vf_dev->acc_type == HISI_HPRE) + scnprintf(name, VFIO_DEV_DBG_LEN, "hpre_vf%d-%d", + node_id, acc_vf_dev->vf_id); + else + scnprintf(name, VFIO_DEV_DBG_LEN, "zip_vf%d-%d", + node_id, acc_vf_dev->vf_id); + + acc_vf_dev->debug_root = debugfs_create_dir(name, mig_debugfs_root); + + debugfs_create_file("debug", 0644, acc_vf_dev->debug_root, + acc_vf_dev, &acc_vf_debug_fops); + debugfs_create_file("state", 0444, acc_vf_dev->debug_root, + acc_vf_dev, &acc_vf_state_fops); +} + +static void vf_debugfs_exit(struct acc_vf_migration *acc_vf_dev) +{ + debugfs_remove_recursive(acc_vf_dev->debug_root); + + mutex_lock(&acc_vf_dev->reflock); + mig_root_ref--; + mutex_unlock(&acc_vf_dev->reflock); + + if (!mig_root_ref) + debugfs_remove_recursive(mig_debugfs_root); +} + +static int qm_acc_type_init(struct acc_vf_migration *acc_vf_dev) +{ + struct hisi_qm *qm = acc_vf_dev->vf_qm; + int i; + + acc_vf_dev->acc_type = 0; + for (i = 0; i < ARRAY_SIZE(vf_acc_types); i++) { + if (!strncmp(qm->dev_name, vf_acc_types[i].name, + strlen(vf_acc_types[i].name))) + acc_vf_dev->acc_type = vf_acc_types[i].type; + } + if (!acc_vf_dev->acc_type) { + dev_err(&acc_vf_dev->vf_dev->dev, "failed to check acc type!\n"); + return -EINVAL; + } + + return 0; +} + +static int vf_qm_pci_init(struct pci_dev *pdev, struct hisi_qm *vfqm) +{ + struct device *dev = &pdev->dev; + u32 val; + int ret; + + ret = pci_request_mem_regions(pdev, vfqm->dev_name); + if (ret < 0) { + dev_err(dev, "failed to request mem regions!\n"); + return ret; + } + + vfqm->phys_base = pci_resource_start(pdev, PCI_BAR_2); + vfqm->io_base = devm_ioremap(dev, pci_resource_start(pdev, PCI_BAR_2), + pci_resource_len(pdev, PCI_BAR_2)); + if (!vfqm->io_base) { + ret = -EIO; + goto err_ioremap; + } + + val = readl(vfqm->io_base + QM_QUE_ISO_CFG_V); + val = val & BIT(0); + if (val) { + vfqm->db_phys_base = pci_resource_start(pdev, PCI_BAR_4); + vfqm->db_io_base = devm_ioremap(dev, pci_resource_start(pdev, + PCI_BAR_4), pci_resource_len(pdev, PCI_BAR_4)); + if (!vfqm->db_io_base) { + ret = -EIO; + goto err_db_ioremap; + } + } else { + vfqm->db_phys_base = vfqm->phys_base; + vfqm->db_io_base = vfqm->io_base; + } + + vfqm->pdev = pdev; + mutex_init(&vfqm->mailbox_lock); + + /* + * Allow VF devices to be loaded in VM when + * it loaded in migration driver + */ + pci_release_mem_regions(pdev); + + return 0; + +err_db_ioremap: + devm_iounmap(dev, vfqm->io_base); +err_ioremap: + pci_release_mem_regions(pdev); + return ret; +} + +static int acc_vf_dev_init(struct pci_dev *pdev, struct hisi_qm *pf_qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct hisi_qm *vf_qm; + int ret; + + vf_qm = kzalloc(sizeof(struct hisi_qm), GFP_KERNEL); + if (!vf_qm) + return -ENOMEM; + + /* get vf qm dev name from pf */ + vf_qm->dev_name = pf_qm->dev_name; + vf_qm->fun_type = QM_HW_VF; + acc_vf_dev->vf_qm = vf_qm; + acc_vf_dev->pf_qm = pf_qm; + + ret = vf_qm_pci_init(pdev, vf_qm); + if (ret) + goto init_qm_error; + + ret = qm_acc_type_init(acc_vf_dev); + if (ret) + goto init_qm_error; + + return 0; + +init_qm_error: + kfree(vf_qm); + return -ENOMEM; +} + +static void *acc_vf_probe(struct pci_dev *pdev) +{ + struct acc_vf_migration *acc_vf_dev; + struct pci_dev *pf_dev, *vf_dev; + struct hisi_qm *pf_qm; + int vf_id, ret; + + pf_dev = pdev->physfn; + vf_dev = pdev; + /* + * the VF driver have been remove after unbind + * the PF driver have probe + */ + pf_qm = pci_get_drvdata(pf_dev); + if (!pf_qm) { + dev_err(&pdev->dev, "host qm driver not insmod!\n"); + return 
ERR_PTR(-EINVAL); + } + if (pf_qm->ver < QM_HW_V3) { + dev_err(&pdev->dev, + "device can't support migration! version: 0x%x\n", + pf_qm->ver); + return ERR_PTR(-EINVAL); + } + + vf_id = PCI_FUNC(vf_dev->devfn); + if (vf_id < 0) { + dev_info(&pdev->dev, "vf device: %s, vf id: %d\n", + pf_qm->dev_name, vf_id); + return ERR_PTR(-EINVAL); + } + + acc_vf_dev = kzalloc(sizeof(*acc_vf_dev), GFP_KERNEL); + if (!acc_vf_dev) + return ERR_PTR(-ENOMEM); + + ret = acc_vf_dev_init(pdev, pf_qm, acc_vf_dev); + if (ret) { + kfree(acc_vf_dev); + return ERR_PTR(-ENOMEM); + } + + acc_vf_dev->vf_id = vf_id; + acc_vf_dev->vf_vendor = pdev->vendor; + acc_vf_dev->vf_device = pdev->device; + acc_vf_dev->pf_dev = pf_dev; + acc_vf_dev->vf_dev = vf_dev; + acc_vf_dev->mig_ignore = false; + mutex_init(&acc_vf_dev->reflock); + + vf_debugfs_init(acc_vf_dev); + + return acc_vf_dev; +} + +static void acc_vf_remove(void *vendor_data) +{ + struct acc_vf_migration *acc_vf_dev = vendor_data; + struct device *dev = &acc_vf_dev->vf_dev->dev; + struct hisi_qm *qm = acc_vf_dev->vf_qm; + + vf_debugfs_exit(acc_vf_dev); + + devm_iounmap(dev, qm->io_base); + + kfree(qm); + kfree(acc_vf_dev); +} + +static struct vfio_pci_vendor_driver_ops sec_vf_mig_ops = { + .owner = THIS_MODULE, + .name = "hisi_sec2", + .probe = acc_vf_probe, + .remove = acc_vf_remove, + .device_ops = &acc_vf_device_ops_node, +}; + +static struct vfio_pci_vendor_driver_ops hpre_vf_mig_ops = { + .owner = THIS_MODULE, + .name = "hisi_hpre", + .probe = acc_vf_probe, + .remove = acc_vf_remove, + .device_ops = &acc_vf_device_ops_node, +}; + +static struct vfio_pci_vendor_driver_ops zip_vf_mig_ops = { + .owner = THIS_MODULE, + .name = "hisi_zip", + .probe = acc_vf_probe, + .remove = acc_vf_remove, + .device_ops = &acc_vf_device_ops_node, +}; + +static int __init acc_vf_module_init(void) +{ + __vfio_pci_register_vendor_driver(&sec_vf_mig_ops); + + __vfio_pci_register_vendor_driver(&hpre_vf_mig_ops); + + __vfio_pci_register_vendor_driver(&zip_vf_mig_ops); + + return 0; +}; + +static void __exit acc_vf_module_exit(void) +{ + vfio_pci_unregister_vendor_driver(&acc_vf_device_ops_node); +}; +module_init(acc_vf_module_init); +module_exit(acc_vf_module_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Longfang Liu liulongfang@huawei.com"); +MODULE_DESCRIPTION("HiSilicon Accelerator VF live migration driver"); diff --git a/drivers/crypto/hisilicon/migration/acc_vf_migration.h b/drivers/crypto/hisilicon/migration/acc_vf_migration.h new file mode 100644 index 000000000000..28aa7e318ca9 --- /dev/null +++ b/drivers/crypto/hisilicon/migration/acc_vf_migration.h @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021 HiSilicon Limited. 
*/ + +#ifndef ACC_MIG_H +#define ACC_MIG_H + +#include <linux/mdev.h> +#include <linux/pci.h> +#include <linux/vfio.h> + +#include "../qm.h" + +#define VFIO_PCI_OFFSET_SHIFT 40 +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) + +#define MIGRATION_REGION_SZ (sizeof(struct acc_vf_data) + \ + sizeof(struct vfio_device_migration_info)) +#define VFIO_DEV_DBG_LEN 256 +#define VFIO_DBG_LOG_LEN 16 +#define VFIO_DEVFN_MASK 0xFF + +#define PCI_BAR_2 2 +#define PCI_BAR_4 4 +#define POLL_PERIOD 10 +#define POLL_TIMEOUT 1000 +#define QM_CACHE_WB_START 0x204 +#define QM_CACHE_WB_DONE 0x208 +#define QM_MB_CMD_PAUSE_QM 0xe +#define QM_ABNORMAL_INT_STATUS 0x100008 +#define QM_IFC_INT_STATUS 0x0028 +#define SEC_CORE_INT_STATUS 0x301008 +#define HPRE_HAC_INT_STATUS 0x301800 +#define HZIP_CORE_INT_STATUS 0x3010AC + +#define QM_VFT_CFG_RDY 0x10006c +#define QM_VFT_CFG_OP_WR 0x100058 +#define QM_VFT_CFG_TYPE 0x10005c +#define QM_VFT_CFG 0x100060 +#define QM_VFT_CFG_OP_ENABLE 0x100054 +#define QM_VFT_CFG_DATA_L 0x100064 +#define QM_VFT_CFG_DATA_H 0x100068 + +#define ERROR_CHECK_TIMEOUT 100 +#define CHECK_DELAY_TIME 100 + +#define QM_SQC_VFT_BASE_SHIFT_V2 28 +#define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0) +#define QM_SQC_VFT_NUM_SHIFT_V2 45 +#define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0) + +/* mailbox */ +#define QM_MB_CMD_SQC_BT 0x4 +#define QM_MB_CMD_CQC_BT 0x5 +#define QM_MB_CMD_SQC_VFT_V2 0x6 + +#define QM_MB_CMD_SEND_BASE 0x300 +#define QM_MB_BUSY_SHIFT 13 +#define QM_MB_OP_SHIFT 14 +#define QM_MB_CMD_DATA_ADDR_L 0x304 +#define QM_MB_CMD_DATA_ADDR_H 0x308 +#define QM_MB_MAX_WAIT_CNT 6000 + +/* doorbell */ +#define QM_DOORBELL_CMD_SQ 0 +#define QM_DOORBELL_CMD_CQ 1 +#define QM_DOORBELL_SQ_CQ_BASE_V2 0x1000 +#define QM_DOORBELL_EQ_AEQ_BASE_V2 0x2000 +#define QM_DB_CMD_SHIFT_V2 12 +#define QM_DB_RAND_SHIFT_V2 16 +#define QM_DB_INDEX_SHIFT_V2 32 +#define QM_DB_PRIORITY_SHIFT_V2 48 + +/* RW regs */ +#define QM_REGS_MAX_LEN 7 +#define QM_REG_ADDR_OFFSET 0x0004 + +#define QM_XQC_ADDR_OFFSET 32U +#define QM_VF_AEQ_INT_MASK 0x0004 +#define QM_VF_EQ_INT_MASK 0x000c +#define QM_IFC_INT_SOURCE_V 0x0020 +#define QM_IFC_INT_MASK 0x0024 +#define QM_IFC_INT_SET_V 0x002c +#define QM_QUE_ISO_CFG_V 0x0030 +#define QM_PAGE_SIZE 0x0034 + +#define QM_EQC_DW0 0X8000 +#define QM_AEQC_DW0 0X8020 + +struct qm_mailbox { + __le16 w0; + __le16 queue_num; + __le32 base_l; + __le32 base_h; + __le32 rsvd; +}; + +enum acc_type { + HISI_SEC = 0x1, + HISI_HPRE = 0x2, + HISI_ZIP = 0x3, +}; + +struct vf_acc_type { + const char *name; + u32 type; +}; + +static struct vf_acc_type vf_acc_types[] = { + {"hisi_sec2", HISI_SEC}, + {"hisi_hpre", HISI_HPRE}, + {"hisi_zip", HISI_ZIP}, +}; + +enum mig_debug_cmd { + STATE_SAVE, + STATE_RESUME, + MB_TEST, + MIG_DATA_DUMP, + MIG_DEV_SHOW, +}; + +static const char * const vf_dev_state[] = { + "Stop", + "Running", + "Saving", + "Running & Saving", + "Resuming", +}; + +#define QM_MATCH_SIZE 32L +struct acc_vf_data { + /* QM match information */ + u32 qp_num; + u32 acc_type; + u32 que_iso_cfg; + u32 qp_base; + /* QM reserved 4 match information */ + u32 qm_rsv_state[4]; + + /* QM RW regs */ + u32 aeq_int_mask; + u32 eq_int_mask; + u32 ifc_int_source; + u32 ifc_int_mask; + u32 ifc_int_set; + u32 page_size; + u32 vf_state; + + /* + * QM_VF_MB has 4 regs don't need to migration + * mailbox regs writeback value will cause + * hardware to 
perform command operations + */ + + /* QM_EQC_DW has 7 regs */ + u32 qm_eqc_dw[7]; + + /* QM_AEQC_DW has 7 regs */ + u32 qm_aeqc_dw[7]; + + /* QM reserved 5 regs */ + u32 qm_rsv_regs[5]; + + /* qm memory init information */ + dma_addr_t eqe_dma; + dma_addr_t aeqe_dma; + dma_addr_t sqc_dma; + dma_addr_t cqc_dma; +}; + +struct acc_vf_remap_irq_ctx { + struct eventfd_ctx *trigger; + struct virqfd *sync; + atomic_t cnt; + wait_queue_head_t waitq; + bool init; +}; + +struct acc_vf_migration { + __u32 vf_vendor; + __u32 vf_device; + __u32 handle; + struct pci_dev *pf_dev; + struct pci_dev *vf_dev; + struct hisi_qm *pf_qm; + struct hisi_qm *vf_qm; + int vf_id; + int refcnt; + u8 acc_type; + bool mig_ignore; + struct mutex reflock; + + struct vfio_device_migration_info *mig_ctl; + struct acc_vf_data *vf_data; + bool in_dirty_track; + struct acc_vf_remap_irq_ctx remap_irq_ctx; + struct acc_vf_region *regions; + int num_regions; + struct dentry *debug_root; +}; + +struct acc_vf_region_ops { + int (*rw)(struct acc_vf_migration *acc_vf_dev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); + void (*release)(struct acc_vf_migration *acc_vf_dev, + struct acc_vf_region *region); + int (*mmap)(struct acc_vf_migration *acc_vf_dev, + struct acc_vf_region *region, + struct vm_area_struct *vma); + int (*add_cap)(struct acc_vf_migration *acc_vf_dev, + struct acc_vf_region *region, + struct vfio_info_cap *caps); +}; + +struct acc_vf_region { + u32 type; + u32 subtype; + size_t size; + u32 flags; + const struct acc_vf_region_ops *ops; + void *data; +}; + +struct acc_vf_irqops { + int (*set_irqs)(struct acc_vf_migration *acc_vf_dev, + u32 flags, unsigned int index, + unsigned int start, unsigned int count, + void *data); +}; + +struct acc_vf_irq { + u32 type; + u32 subtype; + u32 flags; + u32 count; + const struct acc_vf_irqops *ops; +}; + +#endif /* ACC_MIG_H */
From: Kai Ye yekai13@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I477YJ?from=project-issue
----------------------------------------------------------------------
Add one page of memory for device or qp status. Set a qp error flag in this page when the device is resetting. This error flag can be seen in userspace, which helps users stop tasks when a hardware error occurs on the device.
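For illustration only, a minimal userspace-side sketch of how this flag could be polled. It is not part of the patch; the qp_hw_stopped() helper and the dus_base/dus_size names are assumptions standing in for a queue's mmap()ed DUS region obtained through the uacce char device. Per the kernel side below, the TX stop flag is the last u32 of the region and the RX stop flag is the u32 before it:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * dus_base/dus_size describe one queue's mmap()ed DUS region; the extra
 * status page added by this patch sits at its end. The kernel writes 1
 * into the flag words when the device is resetting.
 */
static bool qp_hw_stopped(const volatile void *dus_base, size_t dus_size)
{
	const volatile uint32_t *end = (const volatile uint32_t *)
			((const volatile char *)dus_base + dus_size);

	/* end[-1]: TX stop flag, end[-2]: RX stop flag */
	return end[-1] || end[-2];
}

A task submitting work on the queue could check qp_hw_stopped() before each submission and back off when either flag is set.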
Signed-off-by: Kai Ye yekai13@huawei.com Reviewed-by: Hao Fang fanghao11@huawei.com Reviewed-by: Mingqiang Ling lingmingqiang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/crypto/hisilicon/qm.c | 47 +++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 2c1b04497e4b..69528da0c0d7 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -233,6 +233,8 @@ #define QM_DBG_WRITE_LEN 1024 #define QM_DBG_TMP_BUF_LEN 22 #define QM_PCI_COMMAND_INVALID ~0 +#define QM_RESET_STOP_TX_OFFSET 1 +#define QM_RESET_STOP_RX_OFFSET 2
#define WAIT_PERIOD 20 #define REMOVE_WAIT_DELAY 10 @@ -883,6 +885,20 @@ static irqreturn_t qm_mb_cmd_irq(int irq, void *data) return IRQ_HANDLED; }
+static void qm_set_qp_disable(struct hisi_qp *qp, int offset) +{ + u32 *addr; + + if (qp->is_in_kernel) + return; + + addr = (u32 *)(qp->qdma.va + qp->qdma.size) - offset; + *addr = 1; + + /* make sure setup is completed */ + mb(); +} + static irqreturn_t qm_aeq_irq(int irq, void *data) { struct hisi_qm *qm = data; @@ -2467,6 +2483,15 @@ static void *qm_get_avail_sqe(struct hisi_qp *qp) return qp->sqe + sq_tail * qp->qm->sqe_size; }
+static void hisi_qm_unset_hw_reset(struct hisi_qp *qp) +{ + u64 *addr; + + /* Use last 64 bits of DUS to reset status. */ + addr = (u64 *)(qp->qdma.va + qp->qdma.size) - QM_RESET_STOP_TX_OFFSET; + *addr = 0; +} + static struct hisi_qp *qm_create_qp_nolock(struct hisi_qm *qm, u8 alg_type) { struct device *dev = &qm->pdev->dev; @@ -2492,7 +2517,7 @@ static struct hisi_qp *qm_create_qp_nolock(struct hisi_qm *qm, u8 alg_type) }
qp = &qm->qp_array[qp_id]; - + hisi_qm_unset_hw_reset(qp); memset(qp->cqe, 0, sizeof(struct qm_cqe) * QM_Q_DEPTH);
qp->event_cb = NULL; @@ -2912,6 +2937,14 @@ static int hisi_qm_get_available_instances(struct uacce_device *uacce) return hisi_qm_get_free_qp_num(uacce->priv); }
+static void hisi_qm_set_hw_reset(struct hisi_qm *qm, int offset) +{ + int i; + + for (i = 0; i < qm->qp_num; i++) + qm_set_qp_disable(&qm->qp_array[i], offset); +} + static int hisi_qm_uacce_get_queue(struct uacce_device *uacce, unsigned long arg, struct uacce_queue *q) @@ -3122,8 +3155,10 @@ static int qm_alloc_uacce(struct hisi_qm *qm) else mmio_page_nr = qm->db_interval / PAGE_SIZE;
+ /* Add one more page for device or qp status */ dus_page_nr = (PAGE_SIZE - 1 + qm->sqe_size * QM_Q_DEPTH + - sizeof(struct qm_cqe) * QM_Q_DEPTH) >> PAGE_SHIFT; + sizeof(struct qm_cqe) * QM_Q_DEPTH + PAGE_SIZE) >> + PAGE_SHIFT;
uacce->qf_pg_num[UACCE_QFRT_MMIO] = mmio_page_nr; uacce->qf_pg_num[UACCE_QFRT_DUS] = dus_page_nr; @@ -3695,11 +3730,13 @@ int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r)
if (qm->status.stop_reason == QM_SOFT_RESET || qm->status.stop_reason == QM_FLR) { + hisi_qm_set_hw_reset(qm, QM_RESET_STOP_TX_OFFSET); ret = qm_stop_started_qp(qm); if (ret < 0) { dev_err(dev, "Failed to stop started qp!\n"); goto err_unlock; } + hisi_qm_set_hw_reset(qm, QM_RESET_STOP_RX_OFFSET); }
/* Mask eq and aeq irq */ @@ -5058,6 +5095,8 @@ static int qm_controller_reset(struct hisi_qm *qm)
ret = qm_controller_reset_prepare(qm); if (ret) { + hisi_qm_set_hw_reset(qm, QM_RESET_STOP_TX_OFFSET); + hisi_qm_set_hw_reset(qm, QM_RESET_STOP_RX_OFFSET); clear_bit(QM_RST_SCHED, &qm->misc_ctl); return ret; } @@ -5144,6 +5183,8 @@ void hisi_qm_reset_prepare(struct pci_dev *pdev) ret = hisi_qm_stop(qm, QM_FLR); if (ret) { pci_err(pdev, "Failed to stop QM, ret = %d.\n", ret); + hisi_qm_set_hw_reset(qm, QM_RESET_STOP_TX_OFFSET); + hisi_qm_set_hw_reset(qm, QM_RESET_STOP_RX_OFFSET); return; }
@@ -5330,6 +5371,8 @@ static void qm_pf_reset_vf_prepare(struct hisi_qm *qm, }
err_prepare: + hisi_qm_set_hw_reset(qm, QM_RESET_STOP_TX_OFFSET); + hisi_qm_set_hw_reset(qm, QM_RESET_STOP_RX_OFFSET); pci_save_state(pdev); ret = qm->ops->ping_pf(qm, cmd); if (ret)
From: chenguangli chenguangli2@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47Z9P
-----------------------------------------------------------------------
Memory will be leaked when the driver sets port_topo_cfg to an invalid value.
Signed-off-by: chenguangli chenguangli2@huawei.com Reviewed-by: baowenyi baowenyi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/scsi/huawei/hifc/hifc_chipitf.c | 2 +- drivers/scsi/huawei/hifc/unf_common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/huawei/hifc/hifc_chipitf.c b/drivers/scsi/huawei/hifc/hifc_chipitf.c index fe97f6468557..74e3df2fe860 100644 --- a/drivers/scsi/huawei/hifc/hifc_chipitf.c +++ b/drivers/scsi/huawei/hifc/hifc_chipitf.c @@ -369,7 +369,7 @@ unsigned int hifc_config_port_table(struct hifc_hba_s *v_hba) v_hba->port_cfg.port_id, (unsigned char)v_hba->port_topo_cfg);
- return UNF_RETURN_ERROR; + goto exit; }
/* About speed */ diff --git a/drivers/scsi/huawei/hifc/unf_common.h b/drivers/scsi/huawei/hifc/unf_common.h index bc11cc542c56..9253cc3eada5 100644 --- a/drivers/scsi/huawei/hifc/unf_common.h +++ b/drivers/scsi/huawei/hifc/unf_common.h @@ -13,7 +13,7 @@ /* B version, B0XX Corresponding x.x */ #define UNF_B_VERSION "5.0" /* Indicates the minor version number of the driver */ -#define UNF_DRIVER_VERSION "11" +#define UNF_DRIVER_VERSION "12" /* version num */ #define UNF_FC_VERSION UNF_MAJOR_VERSION "." UNF_B_VERSION "." UNF_DRIVER_VERSION extern unsigned int unf_dbg_level;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 8efb4b596df05f004e847d6bfadad3492b766ab3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Patch series "Make shrinker's nr_deferred memcg aware", v10.
Recently a huge amount of one-off slab drop was seen on some vfs-metadata-heavy workloads; it turned out there was a huge number of accumulated nr_deferred objects seen by the shrinker.
On our production machine, I saw an absurd number of nr_deferred shown in the tracing result below:
<...>-48776 [032] .... 27970562.458916: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 0 objects to shrink 2531805877005 gfp_flags GFP_HIGHUSER_MOVABLE pgs_scanned 32 lru_pgs 9300 cache items 1667 delta 11 total_scan 833
There are 2.5 trillion deferred objects on one node; assuming all of them are dentries (192 bytes per object), the total size of deferred objects on one node is ~480TB. That is definitely ridiculous.
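As a sanity check on that figure: 2,531,805,877,005 objects * 192 bytes is about 4.86 * 10^14 bytes, i.e. roughly 442TiB (~486TB in decimal units), which matches the ~480TB estimate above.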
I managed to reproduce this problem with a kernel build workload plus a negative dentry generator.
First step, run the below kernel build test script:
NR_CPUS=`cat /proc/cpuinfo | grep -e processor | wc -l`
cd /root/Buildarea/linux-stable
for i in `seq 1500`; do
        cgcreate -g memory:kern_build
        echo 4G > /sys/fs/cgroup/memory/kern_build/memory.limit_in_bytes

        echo 3 > /proc/sys/vm/drop_caches
        cgexec -g memory:kern_build make clean > /dev/null 2>&1
        cgexec -g memory:kern_build make -j$NR_CPUS > /dev/null 2>&1

        cgdelete -g memory:kern_build
done
Then run the below negative dentry generator script:
NR_CPUS=`cat /proc/cpuinfo | grep -e processor | wc -l`
mkdir /sys/fs/cgroup/memory/test
echo $$ > /sys/fs/cgroup/memory/test/tasks

for i in `seq $NR_CPUS`; do
        while true; do
                FILE=`head /dev/urandom | tr -dc A-Za-z0-9 | head -c 64`
                cat $FILE 2>/dev/null
        done &
done
Then kswapd will shrink half of the dentry cache in just one loop, as the tracing result below shows:
kswapd0-475 [028] .... 305968.252561: mm_shrink_slab_start: super_cache_scan+0x0/0x190 0000000024acf00c: nid: 0 objects to shrink 4994376020 gfp_flags GFP_KERNEL cache items 93689873 delta 45746 total_scan 46844936 priority 12
kswapd0-475 [021] .... 306013.099399: mm_shrink_slab_end: super_cache_scan+0x0/0x190 0000000024acf00c: nid: 0 unused scan count 4994376020 new scan count 4947576838 total_scan 8 last shrinker return val 46844928
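For scale: total_scan 46844936 against 93689873 cache items is almost exactly half, i.e. half of the dentry cache was scanned in this single shrinker invocation.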
There were a huge number of deferred objects before the shrinker was called; the behavior does match the code, but it is probably not desirable from the user's point of view.
The excessive amount of nr_deferred might accumulate for various reasons, for example:
* GFP_NOFS allocation
* Many rounds of small scans (< scan_batch, 1024 for vfs metadata)
However, the LRUs of slabs are per-memcg (for memcg-aware shrinkers) while the deferred objects are per-shrinker; this may have some bad effects:
* Poor isolation among memcgs. Some memcgs which happen to have frequent limit reclaim may get nr_deferred accumulated to a huge number, and then other innocent memcgs may take the fall. In our case the main workload was hit.
* Unbounded deferred objects. There is no cap on deferred objects; they can grow ridiculously large, as the tracing result showed.
* Easy to get out of control. Although shrinkers take deferred objects into account, they can easily go out of control. One misconfigured memcg could incur an absurd amount of deferred objects in a short period of time.
* Various reclaim problems, e.g. over-reclaim, long reclaim latency, etc. There may be hundreds of GB of slab caches for a vfs-metadata-heavy workload, and shrinking half of them may take minutes. We observed latency spikes due to the prolonged reclaim.
These issues have also been discussed in https://lore.kernel.org/linux-mm/20200916185823.5347-1-shy828301@gmail.com/. The patchset is the outcome of that discussion.
So this patchset makes nr_deferred per-memcg to tackle the problem. It does:
* Have memcg_shrinker_deferred per memcg per node, just like shrinker_map. It is an atomic_long_t array in which each element represents one shrinker, even if the shrinker is not memcg aware; this simplifies the implementation. For memcg-aware shrinkers, the deferred objects are just accumulated to their own memcg, and the shrinkers just see nr_deferred from their own memcg. Non-memcg-aware shrinkers still use the global nr_deferred from struct shrinker. (A sketch of the resulting data structure follows this list.)
* Once the memcg is offlined, its nr_deferred will be reparented to its parent along with LRUs.
* The root memcg has memcg_shrinker_deferred array too. It simplifies the handling of reparenting to root memcg.
* Cap nr_deferred to 2x the length of the LRU. The idea is borrowed from Dave Chinner's series (https://lore.kernel.org/linux-xfs/20191031234618.15403-1-david@fromorbit.com...)
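For concreteness, here is a hedged sketch in C of the per-memcg, per-node bookkeeping that the later patches in this series converge on. The field and helper names follow the upstream series but should be treated as illustrative here, and the snippet omits the surrounding kernel headers:

/* per-memcg, per-node shrinker state: the existing bitmap plus the
 * new per-shrinker deferred counters added by this series */
struct shrinker_info {
	struct rcu_head rcu;
	atomic_long_t *nr_deferred;	/* one counter per shrinker id */
	unsigned long *map;		/* bitmap of memcg-aware shrinkers */
};

/* accumulate deferred work for one (memcg, nid, shrinker) triple */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_rwsem));
	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}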
The downside is that each memcg has to allocate extra memory to store the nr_deferred array. In our production environment, there are typically around 40 shrinkers, so each memcg needs ~320 bytes. 10K memcgs would need ~3.2MB of memory. It seems fine.
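As a quick check: 40 shrinkers * sizeof(atomic_long_t) = 40 * 8 = 320 bytes per array, and 10,000 memcgs * 320 bytes = 3.2MB.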
We have been running the patched kernel on some hosts of our fleet (test and production) for months, and it works very well. The monitoring data shows the working set is sustained as expected.
This patch (of 13):
The tracepoint's nid should show which node the shrink happens on. The start tracepoint uses the nid from shrinkctl, but the nid might be set to 0 before the end tracepoint if the shrinker is not NUMA aware, so the tracing log may show the shrink starting on one node but ending up on another, which is confusing. And since the following patch will stop using nid directly in do_shrink_slab(), this patch also helps clean up the code.
Link: https://lkml.kernel.org/r/20210311190845.9708-1-shy828301@gmail.com Link: https://lkml.kernel.org/r/20210311190845.9708-2-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Vlastimil Babka vbabka@suse.cz Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Reviewed-by: Shakeel Butt shakeelb@google.com Acked-by: Roman Gushchin guro@fb.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 244953ea73a8..66a553e90046 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -538,7 +538,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
- trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; }
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 2bfd36374edd9ed7f2ebf66cacebedf7273901cb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
The shrinker map management is not purely memcg specific; it sits at the intersection between memory cgroups and shrinkers. It is the allocation and assignment of a structure, and the only memcg-specific bit is that the map is stored in a memcg structure. So move the shrinker_maps handling code into vmscan.c for tighter integration with the shrinker code, and remove the "memcg_" prefix. There is no functional change.
Link: https://lkml.kernel.org/r/20210311190845.9708-3-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Vlastimil Babka vbabka@suse.cz Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Roman Gushchin guro@fb.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: mm/memcontrol.c Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 11 ++-- mm/huge_memory.c | 4 +- mm/list_lru.c | 6 +- mm/memcontrol.c | 130 +----------------------------------- mm/vmscan.c | 132 ++++++++++++++++++++++++++++++++++++- 5 files changed, 142 insertions(+), 141 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3d251cd70727..98e5f519bea9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1625,10 +1625,9 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; }
-extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); +int alloc_shrinker_maps(struct mem_cgroup *memcg); +void free_shrinker_maps(struct mem_cgroup *memcg); +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1638,8 +1637,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; }
-static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) +static inline void set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) { } #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e60e321fcb4..378ae22e5640 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2819,8 +2819,8 @@ void deferred_split_huge_page(struct page *page) ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) - memcg_set_shrinker_bit(memcg, page_to_nid(page), - deferred_split_shrinker.id); + set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); diff --git a/mm/list_lru.c b/mm/list_lru.c index fe230081690b..628030fa5f69 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) - memcg_set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); + set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -548,7 +548,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
if (src->nr_items) { dst->nr_items += src->nr_items; - memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); + set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e1fb222010db..41c7c102ca96 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -397,130 +397,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif
-static int memcg_shrinker_map_size; -static DEFINE_MUTEX(memcg_shrinker_map_mutex); - -static void memcg_free_shrinker_map_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct memcg_shrinker_map, rcu)); -} - -static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, - int size, int old_size) -{ - struct memcg_shrinker_map *new, *old; - int nid; - - lockdep_assert_held(&memcg_shrinker_map_mutex); - - for_each_node(nid) { - old = rcu_dereference_protected( - mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); - /* Not yet online memcg */ - if (!old) - return 0; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - /* Set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_size); - memset((void *)new->map + old_size, 0, size - old_size); - - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); - call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); - } - - return 0; -} - -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct memcg_shrinker_map *map; - int nid; - - if (mem_cgroup_is_root(memcg)) - return; - - for_each_node(nid) { - pn = mem_cgroup_nodeinfo(memcg, nid); - map = rcu_dereference_protected(pn->shrinker_map, true); - if (map) - kvfree(map); - rcu_assign_pointer(pn->shrinker_map, NULL); - } -} - -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) -{ - struct memcg_shrinker_map *map; - int nid, size, ret = 0; - - if (mem_cgroup_is_root(memcg)) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - size = memcg_shrinker_map_size; - for_each_node(nid) { - map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); - if (!map) { - memcg_free_shrinker_maps(memcg); - ret = -ENOMEM; - break; - } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); - } - mutex_unlock(&memcg_shrinker_map_mutex); - - return ret; -} - -int memcg_expand_shrinker_maps(int new_id) -{ - int size, old_size, ret = 0; - struct mem_cgroup *memcg; - - size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); - old_size = memcg_shrinker_map_size; - if (size <= old_size) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - if (!root_mem_cgroup) - goto unlock; - - for_each_mem_cgroup(memcg) { - if (mem_cgroup_is_root(memcg)) - continue; - ret = memcg_expand_one_shrinker_map(memcg, size, old_size); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto unlock; - } - } -unlock: - if (!ret) - memcg_shrinker_map_size = size; - mutex_unlock(&memcg_shrinker_map_mutex); - return ret; -} - -void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct memcg_shrinker_map *map; - - rcu_read_lock(); - map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, map->map); - rcu_read_unlock(); - } -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -5665,11 +5541,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/* - * A memcg must be visible for memcg_expand_shrinker_maps() + * A memcg must be visible for expand_shrinker_maps() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (memcg_alloc_shrinker_maps(memcg)) { + if (alloc_shrinker_maps(memcg)) { mem_cgroup_id_remove(memcg); return -ENOMEM; } @@ -5737,7 +5613,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); - memcg_free_shrinker_maps(memcg); + free_shrinker_maps(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 66a553e90046..a34d3cf4bca9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -188,6 +188,132 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG + +static int memcg_shrinker_map_size; +static DEFINE_MUTEX(memcg_shrinker_map_mutex); + +static void free_shrinker_map_rcu(struct rcu_head *head) +{ + kvfree(container_of(head, struct memcg_shrinker_map, rcu)); +} + +static int expand_one_shrinker_map(struct mem_cgroup *memcg, + int size, int old_size) +{ + struct memcg_shrinker_map *new, *old; + struct mem_cgroup_per_node *pn; + int nid; + + lockdep_assert_held(&memcg_shrinker_map_mutex); + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + old = rcu_dereference_protected(pn->shrinker_map, true); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + if (!new) + return -ENOMEM; + + /* Set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_size); + memset((void *)new->map + old_size, 0, size - old_size); + + rcu_assign_pointer(pn->shrinker_map, new); + call_rcu(&old->rcu, free_shrinker_map_rcu); + } + + return 0; +} + +void free_shrinker_maps(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct memcg_shrinker_map *map; + int nid; + + if (mem_cgroup_is_root(memcg)) + return; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + map = rcu_dereference_protected(pn->shrinker_map, true); + kvfree(map); + rcu_assign_pointer(pn->shrinker_map, NULL); + } +} + +int alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + struct memcg_shrinker_map *map; + int nid, size, ret = 0; + + if (mem_cgroup_is_root(memcg)) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + size = memcg_shrinker_map_size; + for_each_node(nid) { + map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); + if (!map) { + free_shrinker_maps(memcg); + ret = -ENOMEM; + break; + } + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + } + mutex_unlock(&memcg_shrinker_map_mutex); + + return ret; +} + +static int expand_shrinker_maps(int new_id) +{ + int size, old_size, ret = 0; + struct mem_cgroup *memcg; + + size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); + old_size = memcg_shrinker_map_size; + if (size <= old_size) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + if (!root_mem_cgroup) + goto unlock; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + if (mem_cgroup_is_root(memcg)) + continue; + ret = expand_one_shrinker_map(memcg, size, old_size); + if (ret) { + mem_cgroup_iter_break(NULL, memcg); + goto unlock; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +unlock: + if (!ret) + memcg_shrinker_map_size = size; + mutex_unlock(&memcg_shrinker_map_mutex); + return ret; +} + +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct memcg_shrinker_map *map; + + rcu_read_lock(); + map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, map->map); + rcu_read_unlock(); + } +} + /* * We allow subsystems to populate their shrinker-related * LRU lists before register_shrinker_prepared() is called @@ -215,7 +341,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) goto unlock;
if (id >= shrinker_nr_max) { - if (memcg_expand_shrinker_maps(id)) { + if (expand_shrinker_maps(id)) { idr_remove(&shrinker_idr, id); goto unlock; } @@ -592,7 +718,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. * The memory barrier here pairs with the barrier in - * memcg_set_shrinker_bit(): + * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() @@ -604,7 +730,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; else - memcg_set_shrinker_bit(memcg, nid, i); + set_shrinker_bit(memcg, nid, i); } freed += ret;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit d27cf2aa0d26a221982d04757cc32db97833ec29 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Since memcg_shrinker_map_size can only be changed while holding shrinker_rwsem exclusively, the read side can be protected by holding the read lock, so a dedicated mutex is superfluous.
Kirill Tkhai suggested using the write lock since:

* We want the assignment to shrinker_maps to be visible to shrink_slab_memcg().
* The rcu_dereference_protected() in shrink_slab_memcg() claims the dereference is protected, but if we only took the READ lock in alloc_shrinker_maps(), it would not actually be protected.
* The READ lock would make alloc_shrinker_info() racy against memory allocation failure: alloc_shrinker_info()->free_shrinker_info() may free the memory right after shrink_slab_memcg() dereferenced it. One may argue that shrink_slab_memcg()->mem_cgroup_online() protects us from it; yes, sure, but this is not the kind of dependency we want to have to remember in the future, since it works against modularity.
And a test with a heavy paging workload didn't show that the write lock makes things worse.
Link: https://lkml.kernel.org/r/20210311190845.9708-4-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Vlastimil Babka vbabka@suse.cz Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Roman Gushchin guro@fb.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index a34d3cf4bca9..2e2043c0c50f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -190,7 +190,6 @@ static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG
static int memcg_shrinker_map_size; -static DEFINE_MUTEX(memcg_shrinker_map_mutex);
static void free_shrinker_map_rcu(struct rcu_head *head) { @@ -204,8 +203,6 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, struct mem_cgroup_per_node *pn; int nid;
- lockdep_assert_held(&memcg_shrinker_map_mutex); - for_each_node(nid) { pn = memcg->nodeinfo[nid]; old = rcu_dereference_protected(pn->shrinker_map, true); @@ -253,7 +250,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) if (mem_cgroup_is_root(memcg)) return 0;
- mutex_lock(&memcg_shrinker_map_mutex); + down_write(&shrinker_rwsem); size = memcg_shrinker_map_size; for_each_node(nid) { map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); @@ -264,7 +261,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); } - mutex_unlock(&memcg_shrinker_map_mutex); + up_write(&shrinker_rwsem);
return ret; } @@ -279,9 +276,10 @@ static int expand_shrinker_maps(int new_id) if (size <= old_size) return 0;
- mutex_lock(&memcg_shrinker_map_mutex); if (!root_mem_cgroup) - goto unlock; + goto out; + + lockdep_assert_held(&shrinker_rwsem);
memcg = mem_cgroup_iter(NULL, NULL, NULL); do { @@ -290,13 +288,13 @@ static int expand_shrinker_maps(int new_id) ret = expand_one_shrinker_map(memcg, size, old_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); - goto unlock; + goto out; } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); -unlock: +out: if (!ret) memcg_shrinker_map_size = size; - mutex_unlock(&memcg_shrinker_map_mutex); + return ret; }
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit a2fb12619f202dcec83f22accc09d48347fd9138 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Both memcg_shrinker_map_size and shrinker_nr_max are maintained, but the map size can actually be calculated from shrinker_nr_max, so it seems unnecessary to keep both. Remove memcg_shrinker_map_size; shrinker_nr_max is also used for iterating over the bitmap.
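For example, with 64-bit longs, shrinker_map_size(40) = DIV_ROUND_UP(40, 64) * sizeof(unsigned long) = 1 * 8 = 8 bytes, while shrinker_map_size(100) = 2 * 8 = 16 bytes; the map grows in 8-byte steps as shrinker ids cross multiples of BITS_PER_LONG.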
Link: https://lkml.kernel.org/r/20210311190845.9708-5-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Roman Gushchin guro@fb.com Acked-by: Vlastimil Babka vbabka@suse.cz Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 2e2043c0c50f..d3c00dd4320c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -188,8 +188,12 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG +static int shrinker_nr_max;
-static int memcg_shrinker_map_size; +static inline int shrinker_map_size(int nr_items) +{ + return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); +}
static void free_shrinker_map_rcu(struct rcu_head *head) { @@ -251,7 +255,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) return 0;
down_write(&shrinker_rwsem); - size = memcg_shrinker_map_size; + size = shrinker_map_size(shrinker_nr_max); for_each_node(nid) { map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); if (!map) { @@ -269,12 +273,13 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) static int expand_shrinker_maps(int new_id) { int size, old_size, ret = 0; + int new_nr_max = new_id + 1; struct mem_cgroup *memcg;
- size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); - old_size = memcg_shrinker_map_size; + size = shrinker_map_size(new_nr_max); + old_size = shrinker_map_size(shrinker_nr_max); if (size <= old_size) - return 0; + goto out;
if (!root_mem_cgroup) goto out; @@ -293,7 +298,7 @@ static int expand_shrinker_maps(int new_id) } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); out: if (!ret) - memcg_shrinker_map_size = size; + shrinker_nr_max = new_nr_max;
return ret; } @@ -326,7 +331,6 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) #define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
static DEFINE_IDR(shrinker_idr); -static int shrinker_nr_max;
static int prealloc_memcg_shrinker(struct shrinker *shrinker) { @@ -343,8 +347,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); goto unlock; } - - shrinker_nr_max = id + 1; } shrinker->id = id; ret = 0;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 72673e861dd032ccaff533c0d9bb705d508017f7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Use kvfree_rcu() to free the old shrinker map instead of call_rcu(); we no longer have to define a dedicated callback for call_rcu().
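This works because struct memcg_shrinker_map embeds a struct rcu_head named rcu as its first member: kvfree_rcu(old, rcu) uses that embedded head to defer the kvfree() past a grace period, which is exactly what the removed free_shrinker_map_rcu() callback did by hand with container_of().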
Link: https://lkml.kernel.org/r/20210311190845.9708-6-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Roman Gushchin guro@fb.com Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index d3c00dd4320c..90442f955bac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -195,11 +195,6 @@ static inline int shrinker_map_size(int nr_items) return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); }
-static void free_shrinker_map_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct memcg_shrinker_map, rcu)); -} - static int expand_one_shrinker_map(struct mem_cgroup *memcg, int size, int old_size) { @@ -223,7 +218,7 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, memset((void *)new->map + old_size, 0, size - old_size);
rcu_assign_pointer(pn->shrinker_map, new); - call_rcu(&old->rcu, free_shrinker_map_rcu); + kvfree_rcu(old, rcu); }
return 0;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit e4262c4f51d6373447c9d89093f49ff6b1e607be category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
The following patch is going to add nr_deferred into shrinker_map. Since the structure will then contain more than just the map, rename it to the more general "shrinker_info", dropping the "memcg_" prefix as well. This should make the patch that adds nr_deferred cleaner and more readable, and make review easier.
Link: https://lkml.kernel.org/r/20210311190845.9708-7-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Vlastimil Babka vbabka@suse.cz Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Roman Gushchin guro@fb.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 8 +++--- mm/memcontrol.c | 6 ++-- mm/vmscan.c | 58 +++++++++++++++++++------------------- 3 files changed, 36 insertions(+), 36 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 98e5f519bea9..aa274a05569e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -100,7 +100,7 @@ struct batched_lruvec_stat { * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, * which have elements charged to this memcg. */ -struct memcg_shrinker_map { +struct shrinker_info { struct rcu_head rcu; unsigned long map[]; }; @@ -128,7 +128,7 @@ struct mem_cgroup_per_node {
struct mem_cgroup_reclaim_iter iter;
- struct memcg_shrinker_map __rcu *shrinker_map; + struct shrinker_info __rcu *shrinker_info;
struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ @@ -1625,8 +1625,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; }
-int alloc_shrinker_maps(struct mem_cgroup *memcg); -void free_shrinker_maps(struct mem_cgroup *memcg); +int alloc_shrinker_info(struct mem_cgroup *memcg); +void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 41c7c102ca96..5d5b710f5a92 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5541,11 +5541,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/* - * A memcg must be visible for expand_shrinker_maps() + * A memcg must be visible for expand_shrinker_info() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (alloc_shrinker_maps(memcg)) { + if (alloc_shrinker_info(memcg)) { mem_cgroup_id_remove(memcg); return -ENOMEM; } @@ -5613,7 +5613,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); - free_shrinker_maps(memcg); + free_shrinker_info(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 90442f955bac..0e3eae08894c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -195,16 +195,16 @@ static inline int shrinker_map_size(int nr_items) return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); }
-static int expand_one_shrinker_map(struct mem_cgroup *memcg, - int size, int old_size) +static int expand_one_shrinker_info(struct mem_cgroup *memcg, + int size, int old_size) { - struct memcg_shrinker_map *new, *old; + struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid;
for_each_node(nid) { pn = memcg->nodeinfo[nid]; - old = rcu_dereference_protected(pn->shrinker_map, true); + old = rcu_dereference_protected(pn->shrinker_info, true); /* Not yet online memcg */ if (!old) return 0; @@ -217,17 +217,17 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, memset(new->map, (int)0xff, old_size); memset((void *)new->map + old_size, 0, size - old_size);
- rcu_assign_pointer(pn->shrinker_map, new); + rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); }
return 0; }
-void free_shrinker_maps(struct mem_cgroup *memcg) +void free_shrinker_info(struct mem_cgroup *memcg) { struct mem_cgroup_per_node *pn; - struct memcg_shrinker_map *map; + struct shrinker_info *info; int nid;
if (mem_cgroup_is_root(memcg)) @@ -235,15 +235,15 @@ void free_shrinker_maps(struct mem_cgroup *memcg)
for_each_node(nid) { pn = memcg->nodeinfo[nid]; - map = rcu_dereference_protected(pn->shrinker_map, true); - kvfree(map); - rcu_assign_pointer(pn->shrinker_map, NULL); + info = rcu_dereference_protected(pn->shrinker_info, true); + kvfree(info); + rcu_assign_pointer(pn->shrinker_info, NULL); } }
-int alloc_shrinker_maps(struct mem_cgroup *memcg) +int alloc_shrinker_info(struct mem_cgroup *memcg) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; int nid, size, ret = 0;
if (mem_cgroup_is_root(memcg)) @@ -252,20 +252,20 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) down_write(&shrinker_rwsem); size = shrinker_map_size(shrinker_nr_max); for_each_node(nid) { - map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); - if (!map) { - free_shrinker_maps(memcg); + info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); + if (!info) { + free_shrinker_info(memcg); ret = -ENOMEM; break; } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem);
return ret; }
-static int expand_shrinker_maps(int new_id) +static int expand_shrinker_info(int new_id) { int size, old_size, ret = 0; int new_nr_max = new_id + 1; @@ -285,7 +285,7 @@ static int expand_shrinker_maps(int new_id) do { if (mem_cgroup_is_root(memcg)) continue; - ret = expand_one_shrinker_map(memcg, size, old_size); + ret = expand_one_shrinker_info(memcg, size, old_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; @@ -301,13 +301,13 @@ static int expand_shrinker_maps(int new_id) void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct memcg_shrinker_map *map; + struct shrinker_info *info;
rcu_read_lock(); - map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); - set_bit(shrinker_id, map->map); + set_bit(shrinker_id, info->map); rcu_read_unlock(); } } @@ -338,7 +338,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) goto unlock;
if (id >= shrinker_nr_max) { - if (expand_shrinker_maps(id)) { + if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } @@ -667,7 +667,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; unsigned long ret, freed = 0; int i;
@@ -677,12 +677,12 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!down_read_trylock(&shrinker_rwsem)) return 0;
- map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, - true); - if (unlikely(!map)) + info = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + true); + if (unlikely(!info)) goto unlock;
- for_each_set_bit(i, map->map, shrinker_nr_max) { + for_each_set_bit(i, info->map, shrinker_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -693,7 +693,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, shrinker = idr_find(&shrinker_idr, i); if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { if (!shrinker) - clear_bit(i, map->map); + clear_bit(i, info->map); continue; }
@@ -704,7 +704,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { - clear_bit(i, map->map); + clear_bit(i, info->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 468ab8437a97a953895856c3709e48b3067da13c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
The shrinker_info is dereferenced in a couple of places via rcu_dereference_protected() with different calling conventions, for example using the mem_cgroup_nodeinfo() helper or dereferencing memcg->nodeinfo[nid]->shrinker_info directly, and a later patch will add more dereference sites.
So extract the dereference into a helper to make the code more readable. No functional change.
[akpm@linux-foundation.org: retain rcu_dereference_protected() in free_shrinker_info(), per Hugh]
Link: https://lkml.kernel.org/r/20210311190845.9708-8-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Roman Gushchin guro@fb.com Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Vlastimil Babka vbabka@suse.cz Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0e3eae08894c..47826e61f9d8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -195,6 +195,13 @@ static inline int shrinker_map_size(int nr_items)
 	return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
 }
 
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+						     int nid)
+{
+	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+					 lockdep_is_held(&shrinker_rwsem));
+}
+
 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 				    int size, int old_size)
 {
@@ -204,7 +211,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 	for_each_node(nid) {
 		pn = memcg->nodeinfo[nid];
-		old = rcu_dereference_protected(pn->shrinker_info, true);
+		old = shrinker_info_protected(memcg, nid);
 		/* Not yet online memcg */
 		if (!old)
 			return 0;
@@ -677,8 +684,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 	if (!down_read_trylock(&shrinker_rwsem))
 		return 0;
 
-	info = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
-					 true);
+	info = shrinker_info_protected(memcg, nid);
 	if (unlikely(!info))
 		goto unlock;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 41ca668a71e7b03743369a2c6d8b8edc1e943dc8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Currently a registered shrinker is indicated by a non-NULL shrinker->nr_deferred. This approach is fine with nr_deferred at the shrinker level, but the following patches will move MEMCG_AWARE shrinkers' nr_deferred to the memcg level, so their shrinker->nr_deferred will always be NULL. That would prevent such shrinkers from unregistering correctly.

Remove SHRINKER_REGISTERING, since we can now check whether a shrinker was registered successfully via the new SHRINKER_REGISTERED flag.
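Condensed from the diff below (the patch's own logic, re-laid-out here for readability, not new code): registration state becomes an explicit flag bit set under shrinker_rwsem, and the unregister paths key off that flag instead of nr_deferred or an IDR sentinel pointer:

	void register_shrinker_prepared(struct shrinker *shrinker)
	{
		down_write(&shrinker_rwsem);
		list_add_tail(&shrinker->list, &shrinker_list);
		shrinker->flags |= SHRINKER_REGISTERED;
		up_write(&shrinker_rwsem);
	}

	void unregister_shrinker(struct shrinker *shrinker)
	{
		if (!(shrinker->flags & SHRINKER_REGISTERED))
			return;

		down_write(&shrinker_rwsem);
		list_del(&shrinker->list);
		shrinker->flags &= ~SHRINKER_REGISTERED;
		if (shrinker->flags & SHRINKER_MEMCG_AWARE)
			unregister_memcg_shrinker(shrinker);
		up_write(&shrinker_rwsem);

		kfree(shrinker->nr_deferred);
		shrinker->nr_deferred = NULL;
	}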
Link: https://lkml.kernel.org/r/20210311190845.9708-9-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Vlastimil Babka vbabka@suse.cz Acked-by: Roman Gushchin guro@fb.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/shrinker.h | 7 ++++--- mm/vmscan.c | 40 +++++++++++++++------------------------- 2 files changed, 19 insertions(+), 28 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 0f80123650e2..1eac79ce57d4 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -79,13 +79,14 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
/* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) -#define SHRINKER_MEMCG_AWARE (1 << 1) +#define SHRINKER_REGISTERED (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 1) +#define SHRINKER_MEMCG_AWARE (1 << 2) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 2) +#define SHRINKER_NONSLAB (1 << 3)
extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); diff --git a/mm/vmscan.c b/mm/vmscan.c index 47826e61f9d8..e2bff96243fa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -319,19 +319,6 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) } }
-/* - * We allow subsystems to populate their shrinker-related - * LRU lists before register_shrinker_prepared() is called - * for the shrinker, since we don't want to impose - * restrictions on their internal registration order. - * In this case shrink_slab_memcg() may find corresponding - * bit is set in the shrinkers map. - * - * This value is used by the function to detect registering - * shrinkers and to skip do_shrink_slab() calls for them. - */ -#define SHRINKER_REGISTERING ((struct shrinker *)~0UL) - static DEFINE_IDR(shrinker_idr);
static int prealloc_memcg_shrinker(struct shrinker *shrinker) @@ -340,7 +327,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock;
@@ -363,9 +350,9 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- down_write(&shrinker_rwsem); + lockdep_assert_held(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); - up_write(&shrinker_rwsem); }
static bool cgroup_reclaim(struct scan_control *sc) @@ -492,8 +479,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + }
kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; @@ -503,10 +493,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); -#ifdef CONFIG_MEMCG - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - idr_replace(&shrinker_idr, shrinker, shrinker->id); -#endif + shrinker->flags |= SHRINKER_REGISTERED; up_write(&shrinker_rwsem); }
@@ -526,13 +513,16 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) + if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + down_write(&shrinker_rwsem); list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -697,7 +687,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker;
shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { if (!shrinker) clear_bit(i, info->map); continue;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 3c6f17e6c5d048c8029578c475dd037dd5db58af category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Currently the number of deferred objects is tracked per shrinker, but some slabs, for example the vfs inode/dentry caches, are per memcg; this results in poor isolation between memcgs.

The deferred objects typically are generated by __GFP_NOFS allocations. One memcg with excessive __GFP_NOFS allocations may blow up its deferred objects, and other innocent memcgs may then suffer from over-shrinking, excessive reclaim latency, etc.

For example, say two workloads run in memcgA and memcgB respectively, and the workload in B is vfs-heavy. If the workload in A generates excessive deferred objects, B's vfs cache might be hit heavily (dropping half of its caches) by B's limit reclaim or by global reclaim.

We observed this in our production environment, which was running a vfs-heavy workload, as shown by the tracing log below:
<...>-409454 [016] .... 28286961.747146: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 objects to shrink 3641681686040 gfp_flags GFP_HIGHUSER_MOVABLE|__GFP_ZERO pgs_scanned 1 lru_pgs 15721 cache items 246404277 delta 31345 total_scan 123202138
<...>-409454 [022] .... 28287105.928018: mm_shrink_slab_end: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 unused scan count 3641681686040 new scan count 3641798379189 total_scan 602 last shrinker return val 123186855
The vfs cache to page cache ratio was 10:1 on this machine, and half of the caches were dropped. This also caused a significant amount of page cache to be dropped due to inode eviction.

Making nr_deferred per memcg for memcg-aware shrinkers solves the unfairness and brings better isolation.

The following patch will add a memcg's nr_deferred to its parent when the memcg goes offline. To preserve nr_deferred when reparenting memcgs to root, the root memcg needs shrinker_info allocated too.

When memcg is not enabled (!CONFIG_MEMCG, or memcg disabled), the shrinker's own nr_deferred is used; non-memcg-aware shrinkers use the shrinker's nr_deferred all the time.
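The resulting per-node object layout may be easier to see in this sketch, distilled from the vmscan.c hunks below (defer_size and map_size are the values computed by shrinker_defer_size() and shrinker_map_size()):

	/*
	 * One kvzalloc'd block per node:
	 *
	 *   +----------------------+-------------------+------------------+
	 *   | struct shrinker_info | nr_deferred[]     | map bitmap       |
	 *   | rcu head + pointers  | defer_size bytes  | map_size bytes   |
	 *   +----------------------+-------------------+------------------+
	 */
	info = kvzalloc_node(sizeof(*info) + defer_size + map_size,
			     GFP_KERNEL, nid);
	info->nr_deferred = (atomic_long_t *)(info + 1);
	info->map = (void *)info->nr_deferred + defer_size;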
Link: https://lkml.kernel.org/r/20210311190845.9708-10-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Roman Gushchin guro@fb.com Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 7 +++-- mm/vmscan.c | 60 ++++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 21 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index aa274a05569e..5d3501a447b0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -97,12 +97,13 @@ struct batched_lruvec_stat { };
/* - * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, - * which have elements charged to this memcg. + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to this memcg. */ struct shrinker_info { struct rcu_head rcu; - unsigned long map[]; + atomic_long_t *nr_deferred; + unsigned long *map; };
/* diff --git a/mm/vmscan.c b/mm/vmscan.c index e2bff96243fa..894e00f42ade 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -190,11 +190,17 @@ static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max;
+/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ static inline int shrinker_map_size(int nr_items) { return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); }
+static inline int shrinker_defer_size(int nr_items) +{ + return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); +} + static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { @@ -203,11 +209,13 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, }
static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int size, int old_size) + int map_size, int defer_size, + int old_map_size, int old_defer_size) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; + int size = map_size + defer_size;
for_each_node(nid) { pn = memcg->nodeinfo[nid]; @@ -220,9 +228,16 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, if (!new) return -ENOMEM;
- /* Set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_size); - memset((void *)new->map + old_size, 0, size - old_size); + new->nr_deferred = (atomic_long_t *)(new + 1); + new->map = (void *)new->nr_deferred + defer_size; + + /* map: set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_map_size); + memset((void *)new->map + old_map_size, 0, map_size - old_map_size); + /* nr_deferred: copy old values, clear all new values */ + memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); + memset((void *)new->nr_deferred + old_defer_size, 0, + defer_size - old_defer_size);
rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); @@ -237,9 +252,6 @@ void free_shrinker_info(struct mem_cgroup *memcg) struct shrinker_info *info; int nid;
- if (mem_cgroup_is_root(memcg)) - return; - for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); @@ -252,12 +264,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) { struct shrinker_info *info; int nid, size, ret = 0; - - if (mem_cgroup_is_root(memcg)) - return 0; + int map_size, defer_size = 0;
down_write(&shrinker_rwsem); - size = shrinker_map_size(shrinker_nr_max); + map_size = shrinker_map_size(shrinker_nr_max); + defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); if (!info) { @@ -265,6 +277,8 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) ret = -ENOMEM; break; } + info->nr_deferred = (atomic_long_t *)(info + 1); + info->map = (void *)info->nr_deferred + defer_size; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); @@ -272,15 +286,21 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) return ret; }
+static inline bool need_expand(int nr_max) +{ + return round_up(nr_max, BITS_PER_LONG) > + round_up(shrinker_nr_max, BITS_PER_LONG); +} + static int expand_shrinker_info(int new_id) { - int size, old_size, ret = 0; + int ret = 0; int new_nr_max = new_id + 1; + int map_size, defer_size = 0; + int old_map_size, old_defer_size = 0; struct mem_cgroup *memcg;
- size = shrinker_map_size(new_nr_max); - old_size = shrinker_map_size(shrinker_nr_max); - if (size <= old_size) + if (!need_expand(new_nr_max)) goto out;
if (!root_mem_cgroup) @@ -288,11 +308,15 @@ static int expand_shrinker_info(int new_id)
lockdep_assert_held(&shrinker_rwsem);
+ map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - if (mem_cgroup_is_root(memcg)) - continue; - ret = expand_one_shrinker_info(memcg, size, old_size); + ret = expand_one_shrinker_info(memcg, map_size, defer_size, + old_map_size, old_defer_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 86750830468506dc27fa99c644534a7189be7975 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Use the per-memcg nr_deferred for memcg-aware shrinkers. The shrinker's own nr_deferred will still be used in the following cases (a condensed sketch of the dispatch follows the list):
1. Non-memcg-aware shrinkers
2. !CONFIG_MEMCG
3. memcg is disabled by a boot parameter
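Condensed from the hunks below (this is the patch's own dispatch logic, reformatted here for readability, not new code):

	static long xchg_nr_deferred(struct shrinker *shrinker,
				     struct shrink_control *sc)
	{
		int nid = sc->nid;

		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
			nid = 0;

		/* Only memcg-targeted reclaim with a memcg-aware shrinker
		 * uses the per-memcg counters; the three cases above all
		 * fall through to the per-shrinker array path. */
		if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE))
			return xchg_nr_deferred_memcg(nid, shrinker,
						      sc->memcg);

		return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
	}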
Link: https://lkml.kernel.org/r/20210311190845.9708-11-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Roman Gushchin guro@fb.com Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 78 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 12 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 894e00f42ade..02e10ac34331 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -379,6 +379,24 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); }
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); +} + static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; @@ -417,6 +435,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) { }
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -428,6 +458,39 @@ static bool writeback_throttling_sane(struct scan_control *sc) } #endif
+static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -563,14 +626,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, long freeable; long nr; long new_nr; - int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred;
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; @@ -580,7 +639,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + nr = xchg_nr_deferred(shrinker, shrinkctl);
total_scan = nr; if (shrinker->seeks) { @@ -671,14 +730,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, next_deferred = 0; /* * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. + * manner that handles concurrent updates. */ - if (next_deferred > 0) - new_nr = atomic_long_add_return(next_deferred, - &shrinker->nr_deferred[nid]); - else - new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 476b30a0949aec865dcc64d4c14f621b1a8afd12 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Now that nr_deferred is available at the per-memcg level for memcg-aware shrinkers, there is no need to allocate shrinker->nr_deferred for such shrinkers anymore.

prealloc_memcg_shrinker() returns -ENOSYS if !CONFIG_MEMCG, or if memcg is disabled on the kernel command line; the shrinker's SHRINKER_MEMCG_AWARE flag is then cleared. This keeps the implementation of this patch simple.
Link: https://lkml.kernel.org/r/20210311190845.9708-12-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Vlastimil Babka vbabka@suse.cz Reviewed-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Roman Gushchin guro@fb.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 02e10ac34331..6dd9a920c94e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -349,6 +349,9 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) { int id, ret = -ENOMEM;
+ if (mem_cgroup_disabled()) + return -ENOSYS; + down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); @@ -428,7 +431,7 @@ static bool writeback_throttling_sane(struct scan_control *sc) #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { - return 0; + return -ENOSYS; }
static void unregister_memcg_shrinker(struct shrinker *shrinker) @@ -539,8 +542,18 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone */ int prealloc_shrinker(struct shrinker *shrinker) { - unsigned int size = sizeof(*shrinker->nr_deferred); + unsigned int size; + int err; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err != -ENOSYS) + return err;
+ shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + } + + size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids;
@@ -548,28 +561,16 @@ int prealloc_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - if (prealloc_memcg_shrinker(shrinker)) - goto free_deferred; - } - return 0; - -free_deferred: - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; - return -ENOMEM; }
void free_prealloced_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) - return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + return; }
kfree(shrinker->nr_deferred);
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit a178015cde69981cdcd8f109c5abc98703fead62 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
Now that the shrinker's nr_deferred is per memcg for memcg-aware shrinkers, add it to the parent's corresponding nr_deferred when the memcg goes offline.
Link: https://lkml.kernel.org/r/20210311190845.9708-13-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Vlastimil Babka vbabka@suse.cz Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Roman Gushchin guro@fb.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 1 + mm/memcontrol.c | 1 + mm/vmscan.c | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5d3501a447b0..c07d8e316d0b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1629,6 +1629,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 int alloc_shrinker_info(struct mem_cgroup *memcg);
 void free_shrinker_info(struct mem_cgroup *memcg);
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
+void reparent_shrinker_deferred(struct mem_cgroup *memcg);
 #else
 #define mem_cgroup_sockets_enabled 0
 static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5d5b710f5a92..782c13630e45 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5581,6 +5581,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_low(&memcg->memory, 0);
 
 	memcg_offline_kmem(memcg);
+	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
 
 	drain_all_stock(memcg);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6dd9a920c94e..63e82d136712 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -400,6 +400,30 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
 }
 
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+	int i, nid;
+	long nr;
+	struct mem_cgroup *parent;
+	struct shrinker_info *child_info, *parent_info;
+
+	parent = parent_mem_cgroup(memcg);
+	if (!parent)
+		parent = root_mem_cgroup;
+
+	/* Prevent from concurrent shrinker_info expand */
+	down_read(&shrinker_rwsem);
+	for_each_node(nid) {
+		child_info = shrinker_info_protected(memcg, nid);
+		parent_info = shrinker_info_protected(parent, nid);
+		for (i = 0; i < shrinker_nr_max; i++) {
+			nr = atomic_long_read(&child_info->nr_deferred[i]);
+			atomic_long_add(nr, &parent_info->nr_deferred[i]);
+		}
+	}
+	up_read(&shrinker_rwsem);
+}
+
 static bool cgroup_reclaim(struct scan_control *sc)
 {
 	return sc->target_mem_cgroup;
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-v5.13-rc1 commit 18bb473e5031213ebfa9a622c0b0f8cdcb8a5371 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I48N0H CVE: NA
-------------------------------------------------
The number of deferred objects might wind up to an absurd value, which results in clamping of slab objects. That is undesirable for sustaining the working set.

So shrink deferred objects in proportion to priority, and cap nr_deferred to twice the number of cache items.
The idea is borrowed from Dave Chinner's patch: https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.co...
Tested with a kernel build and a vfs-metadata-heavy workload in our production environment; no regression has been spotted so far.
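As a worked example, with illustrative numbers plugged into the new formulas in the hunk below: take freeable = 10000, shrinker->seeks = DEFAULT_SEEKS (2), priority = 12 and a deferred backlog of nr = 1048576. Then delta = (10000 >> 12) * 4 / 2 = 4 and total_scan = (1048576 >> 12) + 4 = 260, so only a priority-proportional slice of the backlog is scanned instead of the whole of nr being fed into total_scan at once. Assuming all 260 objects are scanned, next_deferred = min(1048576 + 4 - 260, 2 * 10000) = 20000, so the absurd backlog is capped at twice the freeable items rather than carried forward indefinitely.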
Link: https://lkml.kernel.org/r/20210311190845.9708-14-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Kirill Tkhai ktkhai@virtuozzo.com Cc: Michal Hocko mhocko@suse.com Cc: Roman Gushchin guro@fb.com Cc: Shakeel Butt shakeelb@google.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 46 +++++++++++----------------------------------- 1 file changed, 11 insertions(+), 35 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 63e82d136712..cb830ec05fad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -666,7 +666,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ nr = xchg_nr_deferred(shrinker, shrinkctl);
- total_scan = nr; if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; @@ -680,37 +679,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable / 2; }
+ total_scan = nr >> priority; total_scan += delta; - if (total_scan < 0) { - pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n", - shrinker->scan_objects, total_scan); - total_scan = freeable; - next_deferred = nr; - } else - next_deferred = total_scan; - - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * freeable. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < freeable / 4) - total_scan = min(total_scan, freeable / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > freeable * 2) - total_scan = freeable * 2; + total_scan = min(total_scan, (2 * freeable));
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); @@ -749,10 +720,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, cond_resched(); }
- if (next_deferred >= scanned) - next_deferred -= scanned; - else - next_deferred = 0; + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. + */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates.
From: Mel Gorman mgorman@techsingularity.net
mainline inclusion from mainline-5.14-rc2 commit 187ad460b8413e863c951998cb321a117a717868 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I3ZVL2 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Syzbot is reporting potential deadlocks due to pagesets.lock when PAGE_OWNER is enabled. One example, from Desmond Cheong Zhi Xi, is as follows:
  __alloc_pages_bulk()
    local_lock_irqsave(&pagesets.lock, flags) <---- outer lock here
    prep_new_page():
      post_alloc_hook():
        set_page_owner():
          __set_page_owner():
            save_stack():
              stack_depot_save():
                alloc_pages():
                  alloc_page_interleave():
                    __alloc_pages():
                      get_page_from_freelist():
                        rm_queue():
                          rm_queue_pcplist():
                            local_lock_irqsave(&pagesets.lock, flags);
                            *** DEADLOCK ***
Zhang, Qiang also reported
BUG: sleeping function called from invalid context at mm/page_alloc.c:5179
in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0
.....
__dump_stack lib/dump_stack.c:79 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:96
___might_sleep.cold+0x1f1/0x237 kernel/sched/core.c:9153
prepare_alloc_pages+0x3da/0x580 mm/page_alloc.c:5179
__alloc_pages+0x12f/0x500 mm/page_alloc.c:5375
alloc_page_interleave+0x1e/0x200 mm/mempolicy.c:2147
alloc_pages+0x238/0x2a0 mm/mempolicy.c:2270
stack_depot_save+0x39d/0x4e0 lib/stackdepot.c:303
save_stack+0x15e/0x1e0 mm/page_owner.c:120
__set_page_owner+0x50/0x290 mm/page_owner.c:181
prep_new_page mm/page_alloc.c:2445 [inline]
__alloc_pages_bulk+0x8b9/0x1870 mm/page_alloc.c:5313
alloc_pages_bulk_array_node include/linux/gfp.h:557 [inline]
vm_area_alloc_pages mm/vmalloc.c:2775 [inline]
__vmalloc_area_node mm/vmalloc.c:2845 [inline]
__vmalloc_node_range+0x39d/0x960 mm/vmalloc.c:2947
__vmalloc_node mm/vmalloc.c:2996 [inline]
vzalloc+0x67/0x80 mm/vmalloc.c:3066
There are a number of ways it could be fixed. The page owner code could be audited to strip GFP flags that allow sleeping, but that would impair the functionality of PAGE_OWNER if allocations fail. The bulk allocator could add a special case to release/reacquire the lock for prep_new_page and look up the PCP after the lock is reacquired, at the cost of performance. The pages requiring prep could be tracked using the least significant bit and a loop through the array, although that is more complicated for the list interface. The options are relatively complex, and the second one still incurs a performance penalty when PAGE_OWNER is active, so this patch takes the simple approach -- disable bulk allocation if PAGE_OWNER is active. The caller will be forced to allocate one page at a time, incurring a performance penalty, but PAGE_OWNER is already a performance penalty.
Link: https://lkml.kernel.org/r/20210708081434.GV3840@techsingularity.net Fixes: dbbee9d5cd83 ("mm/page_alloc: convert per-cpu list protection to local_lock") Signed-off-by: Mel Gorman mgorman@techsingularity.net Reported-by: Desmond Cheong Zhi Xi desmondcheongzx@gmail.com Reported-by: "Zhang, Qiang" Qiang.Zhang@windriver.com Reported-by: syzbot+127fd7828d6eeb611703@syzkaller.appspotmail.com Tested-by: syzbot+127fd7828d6eeb611703@syzkaller.appspotmail.com Acked-by: Rafael Aquini aquini@redhat.com Cc: Shuah Khan skhan@linuxfoundation.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: tong tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_alloc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 19394f8f9daf..12a7a81996d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4971,6 +4971,18 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	if (nr_pages - nr_populated == 1)
 		goto failed;
 
+#ifdef CONFIG_PAGE_OWNER
+	/*
+	 * PAGE_OWNER may recurse into the allocator to allocate space to
+	 * save the stack with pagesets.lock held. Releasing/reacquiring
+	 * removes much of the performance benefit of bulk allocation so
+	 * force the caller to allocate one page at a time as it'll have
+	 * similar performance to added complexity to the bulk allocator.
+	 */
+	if (static_branch_unlikely(&page_owner_inited))
+		goto failed;
+#endif
+
 	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
 	gfp &= gfp_allowed_mask;
 	alloc_gfp = gfp;
From: Yanfei Xu yanfei.xu@windriver.com
mainline inclusion from mainline-5.14-rc2 commit e5c15cea339115edf99dc92282865f173cf84510 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I3ZVL2 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
If the array passed in is already partially populated, we should return "nr_populated" even when failing at the argument-preparation stage.
Link: https://lkml.kernel.org/r/20210713152100.10381-3-mgorman@techsingularity.net Signed-off-by: Yanfei Xu yanfei.xu@windriver.com Signed-off-by: Mel Gorman mgorman@techsingularity.net Link: https://lore.kernel.org/r/20210709102855.55058-1-yanfei.xu@windriver.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit e5c15cea339115edf99dc92282865f173cf84510) Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: tong tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 12a7a81996d3..090c23060be8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4987,7 +4987,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	gfp &= gfp_allowed_mask;
 	alloc_gfp = gfp;
 	if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
-		return 0;
+		return nr_populated;
 	gfp = alloc_gfp;
 
 	/* Find an allowed local zone that meets the low watermark. */
From: Chuck Lever chuck.lever@oracle.com
mainline inclusion from mainline-5.14-rc2 commit 061478438d04779181c2ce4d7ffeeca343a70a98 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I3ZVL2 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The author of commit b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") was possibly confused by the mixture of return values throughout the function.
The API contract is clear that the function "Returns the number of pages on the list or array." It does not list zero as a unique return value with a special meaning. Therefore zero is a plausible return value only if @nr_pages is zero or less.
Clean up the return logic to make it clear that the returned value is always the total number of pages in the array/list, not the number of pages that were allocated during this call.
The only change in behavior with this patch is the value returned if prepare_alloc_pages() fails. To match the API contract, the number of pages currently in the array/list is returned in this case.
The call site in __page_pool_alloc_pages_slow() also seems to be confused on this matter. It should be attended to by someone who is familiar with that code.
[mel@techsingularity.net: Return nr_populated if 0 pages are requested]
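To illustrate the contract from a caller's perspective (a hypothetical snippet, not part of this patch; alloc_pages_bulk_array() is the array-based wrapper around __alloc_pages_bulk()):

	struct page *pages[16] = { NULL };
	unsigned long filled;

	/* The return value is the total number of populated slots,
	 * including any that were populated before the call -- not the
	 * number allocated by this call, and not an error code. */
	filled = alloc_pages_bulk_array(GFP_KERNEL, ARRAY_SIZE(pages), pages);
	while (filled < ARRAY_SIZE(pages)) {
		/* single-page fallback for whatever the bulk path left */
		pages[filled] = alloc_page(GFP_KERNEL);
		if (!pages[filled])
			break;	/* caller handles partial success */
		filled++;
	}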
Link: https://lkml.kernel.org/r/20210713152100.10381-4-mgorman@techsingularity.net Signed-off-by: Chuck Lever chuck.lever@oracle.com Signed-off-by: Mel Gorman mgorman@techsingularity.net Acked-by: Jesper Dangaard Brouer brouer@redhat.com Cc: Desmond Cheong Zhi Xi desmondcheongzx@gmail.com Cc: Zhang Qiang Qiang.Zhang@windriver.com Cc: Yanfei Xu yanfei.xu@windriver.com Cc: Matteo Croce mcroce@microsoft.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 061478438d04779181c2ce4d7ffeeca343a70a98) Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: tong tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_alloc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 090c23060be8..a5adfda63ff5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4953,9 +4953,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
 	int nr_populated = 0;
 
-	if (unlikely(nr_pages <= 0))
-		return 0;
-
 	/*
 	 * Skip populated array elements to determine if any pages need
 	 * to be allocated before disabling IRQs.
@@ -4963,9 +4960,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	while (page_array && nr_populated < nr_pages && page_array[nr_populated])
 		nr_populated++;
 
+	/* No pages requested? */
+	if (unlikely(nr_pages <= 0))
+		goto out;
+
 	/* Already populated array? */
 	if (unlikely(page_array && nr_pages - nr_populated == 0))
-		return nr_populated;
+		goto out;
 
 	/* Use the single page allocator for one page. */
 	if (nr_pages - nr_populated == 1)
@@ -4987,7 +4988,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	gfp &= gfp_allowed_mask;
 	alloc_gfp = gfp;
 	if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
-		return nr_populated;
+		goto out;
 	gfp = alloc_gfp;
 
 	/* Find an allowed local zone that meets the low watermark. */
@@ -5060,6 +5061,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 
 	local_irq_restore(flags);
 
+out:
 	return nr_populated;
 
 failed_irq:
@@ -5075,7 +5077,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
-	return nr_populated;
+	goto out;
 }
 EXPORT_SYMBOL_GPL(__alloc_pages_bulk);