From: Wei Li liwei391@huawei.com
hulk inclusion category: feature bugzilla: 31400 CVE: NA ---------------------------
Currently, clear_page() clears the page with 'dc zva'. Since the page is usually not used immediately afterwards, the cache flush is in vain.
Add an optimized implementation of clear_page() that uses 'stnp' to improve performance. It can be switched on or off with the boot command-line parameter 'mm.use_clearpage_stnp'.
In the hugetlb clear test, the elapsed time dropped by about 53.7% (average of 20 runs, roughly 8.28 sec down to 3.84 sec):
Set mm.use_clearpage_stnp = 0 | Set mm.use_clearpage_stnp = 1 [root@localhost liwei]# ./a.out 50 20 | [root@localhost liwei]# ./a.out 50 20 size is 50 Gib, test times is 20 | size is 50 Gib, test times is 20 test_time[0] : use 8.438046 sec | test_time[0] : use 3.722682 sec test_time[1] : use 8.028493 sec | test_time[1] : use 3.640274 sec test_time[2] : use 8.646547 sec | test_time[2] : use 4.095052 sec test_time[3] : use 8.122490 sec | test_time[3] : use 3.998446 sec test_time[4] : use 8.053038 sec | test_time[4] : use 4.084259 sec test_time[5] : use 8.843512 sec | test_time[5] : use 3.933871 sec test_time[6] : use 8.308906 sec | test_time[6] : use 3.934334 sec test_time[7] : use 8.093817 sec | test_time[7] : use 3.869142 sec test_time[8] : use 8.303504 sec | test_time[8] : use 3.902916 sec test_time[9] : use 8.178336 sec | test_time[9] : use 3.541885 sec test_time[10] : use 8.003625 sec | test_time[10] : use 3.595554 sec test_time[11] : use 8.163807 sec | test_time[11] : use 3.583813 sec test_time[12] : use 8.267464 sec | test_time[12] : use 3.863033 sec test_time[13] : use 8.055326 sec | test_time[13] : use 3.770953 sec test_time[14] : use 8.246986 sec | test_time[14] : use 3.808006 sec test_time[15] : use 8.546992 sec | test_time[15] : use 3.653194 sec test_time[16] : use 8.727256 sec | test_time[16] : use 3.722395 sec test_time[17] : use 8.288951 sec | test_time[17] : use 3.683508 sec test_time[18] : use 8.019322 sec | test_time[18] : use 4.253087 sec test_time[19] : use 8.250685 sec | test_time[19] : use 4.082845 sec hugetlb test end! | hugetlb test end!
Signed-off-by: Wei Li liwei391@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/kernel/cpufeature.c | 34 ++++++++++++++++++++++++++++++++++ arch/arm64/lib/clear_page.S | 21 +++++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index a9090f2..3cd169f 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -56,7 +56,8 @@ #define ARM64_WORKAROUND_1463225 35 #define ARM64_HAS_CRC32 36 #define ARM64_SSBS 37 +#define ARM64_CLEARPAGE_STNP 38
-#define ARM64_NCAPS 38 +#define ARM64_NCAPS 39
#endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b1f621c..8b84a47 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1232,6 +1232,34 @@ static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, } #endif
+static bool use_clearpage_stnp; /* boot-time switch; stays false unless the mm.use_clearpage_stnp parameter sets it */ + +static int __init early_use_clearpage_stnp(char *p) +{ + return strtobool(p, &use_clearpage_stnp); /* parse the parameter value into the switch and hand back strtobool's result */ +} +early_param("mm.use_clearpage_stnp", early_use_clearpage_stnp); + +static bool has_mor_nontemporal(const struct arm64_cpu_capabilities *entry) +{ + /* + * CPUs whose non-temporal loads and stores are memory-ordering ruled (the mor in this + * function's name); read_cpuid_id() is checked against this list, entry is not used. + */ + static const struct midr_range cpus[] = { + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + {}, + }; + + return is_midr_in_range_list(read_cpuid_id(), cpus); +} + +static bool can_clearpage_use_stnp(const struct arm64_cpu_capabilities *entry, + int scope) +{ /* matches only when the boot switch is on AND the CPU is in the list above; scope is not consulted */ + return use_clearpage_stnp && has_mor_nontemporal(entry); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -1467,6 +1495,12 @@ static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, .cpu_enable = cpu_enable_ssbs, }, #endif + { + .desc = "Clear Page by STNP", + .capability = ARM64_CLEARPAGE_STNP, /* the capability tested by the ALTERNATIVE at the top of clear_page */ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = can_clearpage_use_stnp, + }, {}, };
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S index ef08e90..9aa1de1 100644 --- a/arch/arm64/lib/clear_page.S +++ b/arch/arm64/lib/clear_page.S @@ -18,6 +18,25 @@ #include <linux/const.h> #include <asm/assembler.h> #include <asm/page.h> +#include <asm/alternative.h> + +/* + * Clear page @dest with non-temporal pair stores (stnp), 64 zero bytes per loop pass + * + * Parameters: + * x0 - dest (presumably page-aligned on entry - TODO confirm); advanced until it is PAGE_SIZE-aligned again + */ +ENTRY(clear_page_stnp) + .align 6 /* NOTE(review): this follows ENTRY, so any alignment padding would sit between the entry symbol and the loop - confirm intended */ +1: stnp xzr, xzr, [x0] + stnp xzr, xzr, [x0, #0x10] + stnp xzr, xzr, [x0, #0x20] + stnp xzr, xzr, [x0, #0x30] + add x0, x0, #0x40 + tst x0, #(PAGE_SIZE - 1) /* zero once x0 has reached the next page boundary */ + b.ne 1b + ret +ENDPROC(clear_page_stnp)
/* * Clear page @dest @@ -26,6 +45,8 @@ * x0 - dest */ ENTRY(clear_page) + ALTERNATIVE("nop", "b clear_page_stnp", ARM64_CLEARPAGE_STNP) + mrs x1, dczid_el0 and w1, w1, #0xf mov x2, #4