From: Takao Indoh indou.takao@fujitsu.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
---------------------------
mm_cpumask was removed by commit 38d96287504a ("arm64: mm: kill mm_cpumask usage") because it was unused at the time. It is now needed to find the appropriate CPUs for a TLB flush, so this patch reverts that commit.
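As a quick orientation, a simplified sketch of the bookkeeping the revert brings back (the actual hunks are in smp.c and mm/context.c below); mm_cpumask(mm) is kept up to date at four points:

    /* secondary_start_kernel(): a CPU coming online now runs init_mm */
    cpumask_set_cpu(cpu, mm_cpumask(mm));

    /* __cpu_disable(): drop an offlined CPU from every mm's mask */
    clear_tasks_mm_cpumask(cpu);

    /* new_context(): a fresh ASID means no CPU caches entries for mm yet */
    cpumask_clear(mm_cpumask(mm));

    /* check_and_switch_context(): this CPU may now cache entries for mm */
    cpumask_set_cpu(cpu, mm_cpumask(mm));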
Signed-off-by: QI Fuli qi.fuli@fujitsu.com
Signed-off-by: Takao Indoh indou.takao@fujitsu.com
Signed-off-by: Cheng Jian cj.chengjian@huawei.com
Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 arch/arm64/kernel/smp.c | 6 ++++++
 arch/arm64/mm/context.c | 2 ++
 2 files changed, 8 insertions(+)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index fe562778de352..e86940d353a3e 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -387,6 +387,7 @@ asmlinkage notrace void secondary_start_kernel(void) */ mmgrab(mm); current->active_mm = mm; + cpumask_set_cpu(cpu, mm_cpumask(mm));
/* * TTBR0 is only used for the identity mapping at this stage. Make it @@ -489,6 +490,11 @@ int __cpu_disable(void) */ irq_migrate_all_off_this_cpu();
+ /* + * Remove this CPU from the vm mask set of all processes. + */ + clear_tasks_mm_cpumask(cpu); + return 0; }
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 2b80ceff5d6c2..27d1f3fec1cc9 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -207,6 +207,7 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) set_asid: __set_bit(asid, asid_map); cur_idx = asid; + cpumask_clear(mm_cpumask(mm)); return idx2asid(asid) | generation; }
@@ -254,6 +255,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) switch_mm_fastpath:
arm64_apply_bp_hardening(); + cpumask_set_cpu(cpu, mm_cpumask(mm));
/* * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when
From: Takao Indoh indou.takao@fujitsu.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
---------------------------
This patch adds a new boot parameter, 'disable_tlbflush_is', to disable broadcast TLB flushes within the inner shareable domain, for performance tuning.

Without this parameter, flush_tlb_mm() invalidates TLB entries with __tlbi(aside1is, asid). With this instruction, every CPU in the same inner shareable domain checks whether it holds TLB entries for the given ASID. This causes performance noise, especially in large-scale HPC environments with more than a thousand nodes connected by a low-latency interconnect.

When the new parameter is specified, TLB entries are instead invalidated with __tlbi(aside1, asid), executed only on the CPUs recorded in mm_cpumask(mm). The TLB flush is therefore limited to the minimal set of CPUs and the performance problem does not occur.
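To make the mechanism concrete, here is a trimmed sketch of the new flush_tlb_mm() path added by this patch (the full version in arch/arm64/kernel/tlbflush.c below also handles flush_tlb_page() and __flush_tlb_range()):

    /* run a local, non-broadcast ASID invalidation on one CPU */
    static void ipi_flush_tlb_mm(void *arg)
    {
            struct mm_struct *mm = arg;

            local_flush_tlb_mm(mm);    /* dsb(nshst); tlbi aside1; dsb(nsh) */
    }

    void flush_tlb_mm(struct mm_struct *mm)
    {
            if (disable_tlbflush_is)
                    /* IPI only the CPUs that have actually run this mm */
                    on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, true);
            else
                    __flush_tlb_mm(mm);    /* broadcast tlbi aside1is, as before */
    }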
Signed-off-by: QI Fuli qi.fuli@fujitsu.com
Signed-off-by: Takao Indoh indou.takao@fujitsu.com
Signed-off-by: Cheng Jian cj.chengjian@huawei.com
Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 .../admin-guide/kernel-parameters.txt |   4 +
 arch/arm64/include/asm/mmu_context.h  |   7 +-
 arch/arm64/include/asm/tlbflush.h     |  61 ++-----
 arch/arm64/kernel/Makefile            |   2 +-
 arch/arm64/kernel/tlbflush.c          | 155 ++++++++++++++++++
 5 files changed, 178 insertions(+), 51 deletions(-)
 create mode 100644 arch/arm64/kernel/tlbflush.c
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 54127ef751553..297586ef297ee 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -844,6 +844,10 @@ disable= [IPV6] See Documentation/networking/ipv6.txt.
+ disable_tlbflush_is + [ARM64] Disable using TLB instruction to flush + all PE within the same inner shareable domain. + hardened_usercopy= [KNL] Under CONFIG_HARDENED_USERCOPY, whether hardening is enabled for this boot. Hardened diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 93d6a3c093dbc..005345d77349c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -229,8 +229,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - if (prev != next) + unsigned int cpu = smp_processor_id(); + + if (prev != next) { __switch_mm(next); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + local_flush_tlb_mm(prev); + }
/* * Update the saved TTBR0_EL1 of the scheduled-in task as the previous diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 178719950e74e..c9307b602fadd 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -139,6 +139,13 @@ * on top of these routines, since that is our interface to the mmu_gather * API as used by munmap() and friends. */ + +void flush_tlb_mm(struct mm_struct *mm); +void flush_tlb_page_nosync(struct vm_area_struct *vma, + unsigned long uaddr); +void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long stride, bool last_level); + static inline void local_flush_tlb_all(void) { dsb(nshst); @@ -155,24 +162,14 @@ static inline void flush_tlb_all(void) isb(); }
-static inline void flush_tlb_mm(struct mm_struct *mm) +static inline void local_flush_tlb_mm(struct mm_struct *mm) { unsigned long asid = __TLBI_VADDR(0, ASID(mm));
- dsb(ishst); - __tlbi(aside1is, asid); - __tlbi_user(aside1is, asid); - dsb(ish); -} - -static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, - unsigned long uaddr) -{ - unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); - - dsb(ishst); - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); + dsb(nshst); + __tlbi(aside1, asid); + __tlbi_user(aside1, asid); + dsb(nsh); }
static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -188,40 +185,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, */ #define MAX_TLBI_OPS PTRS_PER_PTE
-static inline void __flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long stride, bool last_level) -{ - unsigned long asid = ASID(vma->vm_mm); - unsigned long addr; - - start = round_down(start, stride); - end = round_up(end, stride); - - if ((end - start) >= (MAX_TLBI_OPS * stride)) { - flush_tlb_mm(vma->vm_mm); - return; - } - - /* Convert the stride into units of 4k */ - stride >>= 12; - - start = __TLBI_VADDR(start, asid); - end = __TLBI_VADDR(end, asid); - - dsb(ishst); - for (addr = start; addr < end; addr += stride) { - if (last_level) { - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); - } else { - __tlbi(vae1is, addr); - __tlbi_user(vae1is, addr); - } - } - dsb(ish); -} - static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index fe759f364609d..fcb9d64d651cc 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -19,7 +19,7 @@ arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o + syscall.o tlbflush.o
extra-$(CONFIG_EFI) := efi-entry.o
diff --git a/arch/arm64/kernel/tlbflush.c b/arch/arm64/kernel/tlbflush.c new file mode 100644 index 0000000000000..52c9a237759a6 --- /dev/null +++ b/arch/arm64/kernel/tlbflush.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 FUJITSU LIMITED + +#include <linux/smp.h> +#include <asm/tlbflush.h> + +struct tlb_args { + struct vm_area_struct *ta_vma; + unsigned long ta_start; + unsigned long ta_end; + unsigned long ta_stride; + bool ta_last_level; +}; + +int disable_tlbflush_is; + +static int __init disable_tlbflush_is_setup(char *str) +{ + disable_tlbflush_is = 1; + + return 0; +} +__setup("disable_tlbflush_is", disable_tlbflush_is_setup); + +static inline void __flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long asid = __TLBI_VADDR(0, ASID(mm)); + + dsb(ishst); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + dsb(ish); +} + +static inline void ipi_flush_tlb_mm(void *arg) +{ + struct mm_struct *mm = arg; + + local_flush_tlb_mm(mm); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + if (disable_tlbflush_is) + on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, + (void *)mm, true); + else + __flush_tlb_mm(mm); +} + +static inline void __flush_tlb_page_nosync(unsigned long addr) +{ + dsb(ishst); + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); +} + +static inline void __local_flush_tlb_page_nosync(unsigned long addr) +{ + dsb(nshst); + __tlbi(vale1, addr); + __tlbi_user(vale1, addr); + dsb(nsh); +} + +static inline void ipi_flush_tlb_page_nosync(void *arg) +{ + unsigned long addr = *(unsigned long *)arg; + + __local_flush_tlb_page_nosync(addr); +} + +void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long uaddr) +{ + unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); + + if (disable_tlbflush_is) + on_each_cpu_mask(mm_cpumask(vma->vm_mm), + ipi_flush_tlb_page_nosync, &addr, true); + else + __flush_tlb_page_nosync(addr); +} + +static inline void ___flush_tlb_range(unsigned long start, unsigned long end, + unsigned long stride, bool last_level) +{ + unsigned long addr; + + dsb(ishst); + for (addr = start; addr < end; addr += stride) { + if (last_level) { + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); + } else { + __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); + } + } + dsb(ish); +} + +static inline void __local_flush_tlb_range(unsigned long addr, bool last_level) +{ + dsb(nshst); + if (last_level) { + __tlbi(vale1, addr); + __tlbi_user(vale1, addr); + } else { + __tlbi(vae1, addr); + __tlbi_user(vae1, addr); + } + dsb(nsh); +} + +static inline void ipi_flush_tlb_range(void *arg) +{ + struct tlb_args *ta = (struct tlb_args *)arg; + unsigned long addr; + + for (addr = ta->ta_start; addr < ta->ta_end; addr += ta->ta_stride) + __local_flush_tlb_range(addr, ta->ta_last_level); +} + +void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long stride, bool last_level) +{ + unsigned long asid = ASID(vma->vm_mm); + + start = round_down(start, stride); + end = round_up(end, stride); + + if ((end - start) >= (MAX_TLBI_OPS * stride)) { + flush_tlb_mm(vma->vm_mm); + return; + } + + /* Convert the stride into units of 4k */ + stride >>= 12; + + start = __TLBI_VADDR(start, asid); + end = __TLBI_VADDR(end, asid); + + if (disable_tlbflush_is) { + struct tlb_args ta = { + .ta_start = start, + .ta_end = end, + .ta_stride = stride, + .ta_last_level = last_level, + }; + + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range, + &ta, true); + } else + ___flush_tlb_range(start, end, 
stride, last_level); +}
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
---------------------------
Add CONFIG_ARM64_TLBI_IPI to isolate the IPI-based TLB invalidation code.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com
Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 arch/arm64/Kconfig                   | 21 +++++++++++++++++++++
 arch/arm64/include/asm/mmu_context.h |  4 +++-
 arch/arm64/kernel/smp.c              |  4 ++++
 arch/arm64/kernel/tlbflush.c         | 25 ++++++++++++++++++++++---
 arch/arm64/mm/context.c              |  4 ++++
 5 files changed, 54 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2f34aef79179e..f984280b0d577 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1607,6 +1607,27 @@ config UCE_KERNEL_RECOVERY
endmenu
+menu "TLB options" + +config ARM64_TLBI_IPI + bool "IPI based ARM64 TLB invalidation" + depends on ARM64 + default n + help + adds new boot parameter 'disable_tlbflush_is' to disable TLB flush + within the same inner shareable domain for performance tuning. + + When this new parameter is specified, TLB entry is invalidated by + __tlbi(aside1, asid) only on the CPUs specified by mm_cpumask(mm). + + By using TLB.IS, all CPUs within the same inner shareable domain + check if there are TLB entries which have this ASID, this causes + performance noise, especially at large-scale HPC environment, which + has more than thousand nodes with low latency interconnect. + + If unsure, say N. +endmenu + menu "CPU Power Management"
source "drivers/cpuidle/Kconfig" diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 005345d77349c..e319ce86fa708 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -229,12 +229,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned int cpu = smp_processor_id(); + unsigned int __maybe_unused cpu = smp_processor_id();
if (prev != next) { __switch_mm(next); +#ifdef CONFIG_ARM64_TLBI_IPI cpumask_clear_cpu(cpu, mm_cpumask(prev)); local_flush_tlb_mm(prev); +#endif }
/* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index e86940d353a3e..f09c10863867b 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -387,7 +387,9 @@ asmlinkage notrace void secondary_start_kernel(void) */ mmgrab(mm); current->active_mm = mm; +#ifdef CONFIG_ARM64_TLBI_IPI cpumask_set_cpu(cpu, mm_cpumask(mm)); +#endif
/* * TTBR0 is only used for the identity mapping at this stage. Make it @@ -490,10 +492,12 @@ int __cpu_disable(void) */ irq_migrate_all_off_this_cpu();
+#ifdef CONFIG_ARM64_TLBI_IPI /* * Remove this CPU from the vm mask set of all processes. */ clear_tasks_mm_cpumask(cpu); +#endif
return 0; } diff --git a/arch/arm64/kernel/tlbflush.c b/arch/arm64/kernel/tlbflush.c index 52c9a237759a6..e20fbd38fd262 100644 --- a/arch/arm64/kernel/tlbflush.c +++ b/arch/arm64/kernel/tlbflush.c @@ -4,6 +4,7 @@ #include <linux/smp.h> #include <asm/tlbflush.h>
+#ifdef CONFIG_ARM64_TLBI_IPI struct tlb_args { struct vm_area_struct *ta_vma; unsigned long ta_start; @@ -21,6 +22,7 @@ static int __init disable_tlbflush_is_setup(char *str) return 0; } __setup("disable_tlbflush_is", disable_tlbflush_is_setup); +#endif
static inline void __flush_tlb_mm(struct mm_struct *mm) { @@ -32,20 +34,26 @@ static inline void __flush_tlb_mm(struct mm_struct *mm) dsb(ish); }
+#ifdef CONFIG_ARM64_TLBI_IPI static inline void ipi_flush_tlb_mm(void *arg) { struct mm_struct *mm = arg;
local_flush_tlb_mm(mm); } +#endif
void flush_tlb_mm(struct mm_struct *mm) { - if (disable_tlbflush_is) +#ifdef CONFIG_ARM64_TLBI_IPI + if (unlikely(disable_tlbflush_is)) on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, (void *)mm, true); else __flush_tlb_mm(mm); +#else + __flush_tlb_mm(mm); +#endif }
static inline void __flush_tlb_page_nosync(unsigned long addr) @@ -74,11 +82,15 @@ void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long uaddr) { unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
- if (disable_tlbflush_is) +#ifdef CONFIG_ARM64_TLBI_IPI + if (unlikely(disable_tlbflush_is)) on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page_nosync, &addr, true); else __flush_tlb_page_nosync(addr); +#else + __flush_tlb_page_nosync(addr); +#endif }
static inline void ___flush_tlb_range(unsigned long start, unsigned long end, @@ -112,6 +124,7 @@ static inline void __local_flush_tlb_range(unsigned long addr, bool last_level) dsb(nsh); }
+#ifdef CONFIG_ARM64_TLBI_IPI static inline void ipi_flush_tlb_range(void *arg) { struct tlb_args *ta = (struct tlb_args *)arg; @@ -120,6 +133,7 @@ static inline void ipi_flush_tlb_range(void *arg) for (addr = ta->ta_start; addr < ta->ta_end; addr += ta->ta_stride) __local_flush_tlb_range(addr, ta->ta_last_level); } +#endif
void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level) @@ -140,7 +154,9 @@ void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, start = __TLBI_VADDR(start, asid); end = __TLBI_VADDR(end, asid);
- if (disable_tlbflush_is) { + +#ifdef CONFIG_ARM64_TLBI_IPI + if (unlikely(disable_tlbflush_is)) { struct tlb_args ta = { .ta_start = start, .ta_end = end, @@ -152,4 +168,7 @@ void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, &ta, true); } else ___flush_tlb_range(start, end, stride, last_level); +#else + ___flush_tlb_range(start, end, stride, last_level); +#endif } diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 27d1f3fec1cc9..35a103c7c22bf 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -207,7 +207,9 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) set_asid: __set_bit(asid, asid_map); cur_idx = asid; +#ifdef CONFIG_ARM64_TLBI_IPI cpumask_clear(mm_cpumask(mm)); +#endif return idx2asid(asid) | generation; }
@@ -255,7 +257,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) switch_mm_fastpath:
arm64_apply_bp_hardening(); +#ifdef CONFIG_ARM64_TLBI_IPI cpumask_set_cpu(cpu, mm_cpumask(mm)); +#endif
/* * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
---------------------------
Several control bits are added to control the different flush paths:

  mm      use tlb invalidation ipi for flush_tlb_mm
  range   use tlb invalidation ipi for flush_tlb_range
  page    use tlb invalidation ipi for flush_tlb_page
  switch  don't local_flush_tlb_mm when switch_mm
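For example, booting with

    disable_tlbflush_is=page,range,mm

uses IPI-based invalidation for flush_tlb_page(), flush_tlb_range() and flush_tlb_mm() while keeping the local flush in switch_mm(). Each keyword sets one bit in disable_tlbflush_is, and the flush paths test it with the generated helpers such as test_tlbi_ipi_mm() (see the parser in the diff below).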
Signed-off-by: Cheng Jian cj.chengjian@huawei.com
Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 .../admin-guide/kernel-parameters.txt |  7 +-
 arch/arm64/include/asm/mmu_context.h  |  6 +-
 arch/arm64/include/asm/tlbflush.h     |  1 +
 arch/arm64/kernel/tlbflush.c          | 77 +++++++++++++++++--
 4 files changed, 82 insertions(+), 9 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 297586ef297ee..6b1fc6dedc034 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -844,10 +844,15 @@ disable= [IPV6] See Documentation/networking/ipv6.txt.
- disable_tlbflush_is + disable_tlbflush_is= [page,range,switch,]mm [ARM64] Disable using TLB instruction to flush all PE within the same inner shareable domain.
+ range use tlb invalidation ipi for flush_tlb_range + page use tlb invalidation ipi for flush_tlb_page + switch don't local_flush_tlb_mm when switch_mm + mm use tlb invalidation ipi for flush_tlb_mm + hardened_usercopy= [KNL] Under CONFIG_HARDENED_USERCOPY, whether hardening is enabled for this boot. Hardened diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index e319ce86fa708..04a7700109a88 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -234,8 +234,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, if (prev != next) { __switch_mm(next); #ifdef CONFIG_ARM64_TLBI_IPI - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - local_flush_tlb_mm(prev); + if (unlikely(test_tlbi_ipi_switch())) { + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + local_flush_tlb_mm(prev); + } #endif }
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index c9307b602fadd..3f1d863ad3d90 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -145,6 +145,7 @@ void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long uaddr); void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level); +bool test_tlbi_ipi_switch(void);
static inline void local_flush_tlb_all(void) { diff --git a/arch/arm64/kernel/tlbflush.c b/arch/arm64/kernel/tlbflush.c index e20fbd38fd262..733cf1c612294 100644 --- a/arch/arm64/kernel/tlbflush.c +++ b/arch/arm64/kernel/tlbflush.c @@ -2,6 +2,7 @@ // Copyright (C) 2019 FUJITSU LIMITED
#include <linux/smp.h> +#include <linux/ctype.h> #include <asm/tlbflush.h>
#ifdef CONFIG_ARM64_TLBI_IPI @@ -13,15 +14,79 @@ struct tlb_args { bool ta_last_level; };
-int disable_tlbflush_is;
+unsigned int disable_tlbflush_is; + +#define FLAG_TLBFLUSH_RANGE 0x0001 +#define FLAG_TLBFLUSH_PAGE 0x0002 +#define FLAG_TLBFLUSH_SWITCH 0x0004 +#define FLAG_TLBFLUSH_MM 0x0008 + +#define TEST_TLBFLUSH_FLAG_EXTERN(flag, FLAG) \ +bool test_tlbi_ipi_##flag(void) \ +{ \ + return !!(disable_tlbflush_is & FLAG_TLBFLUSH_##FLAG); \ +} +#else +#define TEST_TLBFLUSH_FLAG_EXTERN(flag, FLAG) \ +bool test_tlbi_ipi_##flag(void) \ +{ \ + return false; \ +} +#endif + +#define TEST_TLBFLUSH_FLAG(flag, FLAG) \ +static __always_inline TEST_TLBFLUSH_FLAG_EXTERN(flag, FLAG) + +TEST_TLBFLUSH_FLAG(mm, MM) +TEST_TLBFLUSH_FLAG(page, PAGE) +TEST_TLBFLUSH_FLAG(range, RANGE) +TEST_TLBFLUSH_FLAG_EXTERN(switch, SWITCH) + +#ifdef CONFIG_ARM64_TLBI_IPI static int __init disable_tlbflush_is_setup(char *str) { - disable_tlbflush_is = 1; + unsigned int flags = 0; + + while (isalpha(*str)) { + if (!strncmp(str, "range,", 6)) { + str += 6; + flags |= FLAG_TLBFLUSH_RANGE; + continue; + } + + if (!strncmp(str, "page,", 5)) { + str += 5; + flags |= FLAG_TLBFLUSH_PAGE; + continue; + } + + if (!strncmp(str, "switch,", 7)) { + str += 7; + flags |= FLAG_TLBFLUSH_SWITCH; + continue; + } + + if (!strcmp(str, "mm")) { + str += 2; + flags |= FLAG_TLBFLUSH_MM; + break; + } + + pr_warn("disable_tlbflush_is: Error, unknown flag\n"); + return 0; + } + + disable_tlbflush_is = flags; + pr_info("DISABLE_TLBFLUSH_IS : [%s] [%s] [%s] [%s]\n", + test_tlbi_ipi_page() ? "PAGE" : "NA", + test_tlbi_ipi_range() ? "RANGE" : "NA", + test_tlbi_ipi_switch() ? "SWITCH" : "NA", + test_tlbi_ipi_mm() ? "MM" : "NA");
return 0; } -__setup("disable_tlbflush_is", disable_tlbflush_is_setup); +early_param("disable_tlbflush_is", disable_tlbflush_is_setup); #endif
static inline void __flush_tlb_mm(struct mm_struct *mm) @@ -46,7 +111,7 @@ static inline void ipi_flush_tlb_mm(void *arg) void flush_tlb_mm(struct mm_struct *mm) { #ifdef CONFIG_ARM64_TLBI_IPI - if (unlikely(disable_tlbflush_is)) + if (unlikely(test_tlbi_ipi_mm())) on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, (void *)mm, true); else @@ -83,7 +148,7 @@ void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long uaddr) unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
#ifdef CONFIG_ARM64_TLBI_IPI - if (unlikely(disable_tlbflush_is)) + if (unlikely(test_tlbi_ipi_page())) on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page_nosync, &addr, true); else @@ -156,7 +221,7 @@ void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
#ifdef CONFIG_ARM64_TLBI_IPI - if (unlikely(disable_tlbflush_is)) { + if (unlikely(test_tlbi_ipi_range())) { struct tlb_args ta = { .ta_start = start, .ta_end = end,
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
---------------------------
Enable ARM64_TLBI_IPI in hulk_defconfig and openeuler_defconfig.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com
Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 arch/arm64/configs/hulk_defconfig      | 5 +++++
 arch/arm64/configs/openeuler_defconfig | 5 +++++
 2 files changed, 10 insertions(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index adaf13d8f59ee..bc350c41b333a 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -528,6 +528,11 @@ CONFIG_ARCH_HIBERNATION_HEADER=y CONFIG_ARCH_SUSPEND_POSSIBLE=y CONFIG_UCE_KERNEL_RECOVERY=y
+# +# TLB options +# +CONFIG_ARM64_TLBI_IPI=y + # # CPU Power Management # diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 548e88c579350..d2167c80757fc 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -492,6 +492,11 @@ CONFIG_AARCH32_EL0=y # CONFIG_ARM64_ILP32 is not set CONFIG_SYSVIPC_COMPAT=y
+# +# TLB options +# +CONFIG_ARM64_TLBI_IPI=y + # # Power management options #
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
---------------------------
The patch set cannot be merged into the mainline community, but the modules it modifies are very important, so we have to be careful here.

This patch restores all of the original code, so that when CONFIG_ARM64_TLBI_IPI is disabled there is no change to the original logic.
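The resulting shape of asm/tlbflush.h (a trimmed sketch of the diff below; flush_tlb_page_nosync() is handled the same way) keeps the original inline implementation when the option is off:

    #ifdef CONFIG_ARM64_TLBI_IPI
    /* out-of-line versions in kernel/tlbflush.c that may IPI mm_cpumask(mm) */
    void flush_tlb_mm(struct mm_struct *mm);
    void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, unsigned long stride,
                           bool last_level);
    #else
    /* original broadcast implementation, unchanged from before the series */
    static inline void flush_tlb_mm(struct mm_struct *mm)
    {
            unsigned long asid = __TLBI_VADDR(0, ASID(mm));

            dsb(ishst);
            __tlbi(aside1is, asid);
            __tlbi_user(aside1is, asid);
            dsb(ish);
    }
    #endif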
Signed-off-by: Cheng Jian cj.chengjian@huawei.com
Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 arch/arm64/include/asm/tlbflush.h | 86 ++++++++++++++++++++++++++-----
 arch/arm64/kernel/Makefile        |  4 +-
 arch/arm64/kernel/tlbflush.c      | 24 ---------
 3 files changed, 75 insertions(+), 39 deletions(-)
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3f1d863ad3d90..2051277ebc33d 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -139,14 +139,6 @@ * on top of these routines, since that is our interface to the mmu_gather * API as used by munmap() and friends. */ - -void flush_tlb_mm(struct mm_struct *mm); -void flush_tlb_page_nosync(struct vm_area_struct *vma, - unsigned long uaddr); -void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, unsigned long stride, bool last_level); -bool test_tlbi_ipi_switch(void); - static inline void local_flush_tlb_all(void) { dsb(nshst); @@ -163,6 +155,21 @@ static inline void flush_tlb_all(void) isb(); }
+/* + * This is meant to avoid soft lock-ups on large TLB flushing ranges and not + * necessarily a performance improvement. + */ +#define MAX_TLBI_OPS PTRS_PER_PTE + +#ifdef CONFIG_ARM64_TLBI_IPI + +void flush_tlb_mm(struct mm_struct *mm); +void flush_tlb_page_nosync(struct vm_area_struct *vma, + unsigned long uaddr); +void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long stride, bool last_level); +bool test_tlbi_ipi_switch(void); + static inline void local_flush_tlb_mm(struct mm_struct *mm) { unsigned long asid = __TLBI_VADDR(0, ASID(mm)); @@ -173,6 +180,63 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) dsb(nsh); }
+#else /* CONFIG_ARM64_TLBI_IPI */ + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long asid = __TLBI_VADDR(0, ASID(mm)); + + dsb(ishst); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + dsb(ish); +} + +static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, + unsigned long uaddr) +{ + unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); + + dsb(ishst); + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); +} + +static inline void __flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, bool last_level) +{ + unsigned long asid = ASID(vma->vm_mm); + unsigned long addr; + + start = round_down(start, stride); + end = round_up(end, stride); + + if ((end - start) >= (MAX_TLBI_OPS * stride)) { + flush_tlb_mm(vma->vm_mm); + return; + } + + /* Convert the stride into units of 4k */ + stride >>= 12; + + start = __TLBI_VADDR(start, asid); + end = __TLBI_VADDR(end, asid); + + dsb(ishst); + for (addr = start; addr < end; addr += stride) { + if (last_level) { + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); + } else { + __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); + } + } + dsb(ish); +} +#endif /* CONFIG_ARM64_TLBI_IPI */ + static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { @@ -180,12 +244,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, dsb(ish); }
-/* - * This is meant to avoid soft lock-ups on large TLB flushing ranges and not - * necessarily a performance improvement. - */ -#define MAX_TLBI_OPS PTRS_PER_PTE - static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index fcb9d64d651cc..e34b9b5969bfb 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -19,7 +19,9 @@ arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o tlbflush.o + syscall.o + +arm64-obj-$(CONFIG_ARM64_TLBI_IPI) += tlbflush.o
extra-$(CONFIG_EFI) := efi-entry.o
diff --git a/arch/arm64/kernel/tlbflush.c b/arch/arm64/kernel/tlbflush.c index 733cf1c612294..9a51941b18b3b 100644 --- a/arch/arm64/kernel/tlbflush.c +++ b/arch/arm64/kernel/tlbflush.c @@ -5,7 +5,6 @@ #include <linux/ctype.h> #include <asm/tlbflush.h>
-#ifdef CONFIG_ARM64_TLBI_IPI struct tlb_args { struct vm_area_struct *ta_vma; unsigned long ta_start; @@ -27,13 +26,6 @@ bool test_tlbi_ipi_##flag(void) \ { \ return !!(disable_tlbflush_is & FLAG_TLBFLUSH_##FLAG); \ } -#else -#define TEST_TLBFLUSH_FLAG_EXTERN(flag, FLAG) \ -bool test_tlbi_ipi_##flag(void) \ -{ \ - return false; \ -} -#endif
#define TEST_TLBFLUSH_FLAG(flag, FLAG) \ static __always_inline TEST_TLBFLUSH_FLAG_EXTERN(flag, FLAG) @@ -99,26 +91,20 @@ static inline void __flush_tlb_mm(struct mm_struct *mm) dsb(ish); }
-#ifdef CONFIG_ARM64_TLBI_IPI static inline void ipi_flush_tlb_mm(void *arg) { struct mm_struct *mm = arg;
local_flush_tlb_mm(mm); } -#endif
void flush_tlb_mm(struct mm_struct *mm) { -#ifdef CONFIG_ARM64_TLBI_IPI if (unlikely(test_tlbi_ipi_mm())) on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, (void *)mm, true); else __flush_tlb_mm(mm); -#else - __flush_tlb_mm(mm); -#endif }
static inline void __flush_tlb_page_nosync(unsigned long addr) @@ -147,15 +133,11 @@ void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long uaddr) { unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
-#ifdef CONFIG_ARM64_TLBI_IPI if (unlikely(test_tlbi_ipi_page())) on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page_nosync, &addr, true); else __flush_tlb_page_nosync(addr); -#else - __flush_tlb_page_nosync(addr); -#endif }
static inline void ___flush_tlb_range(unsigned long start, unsigned long end, @@ -189,7 +171,6 @@ static inline void __local_flush_tlb_range(unsigned long addr, bool last_level) dsb(nsh); }
-#ifdef CONFIG_ARM64_TLBI_IPI static inline void ipi_flush_tlb_range(void *arg) { struct tlb_args *ta = (struct tlb_args *)arg; @@ -198,7 +179,6 @@ static inline void ipi_flush_tlb_range(void *arg) for (addr = ta->ta_start; addr < ta->ta_end; addr += ta->ta_stride) __local_flush_tlb_range(addr, ta->ta_last_level); } -#endif
void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level) @@ -220,7 +200,6 @@ void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, end = __TLBI_VADDR(end, asid);
-#ifdef CONFIG_ARM64_TLBI_IPI if (unlikely(test_tlbi_ipi_range())) { struct tlb_args ta = { .ta_start = start, @@ -233,7 +212,4 @@ void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, &ta, true); } else ___flush_tlb_range(start, end, stride, last_level); -#else - ___flush_tlb_range(start, end, stride, last_level); -#endif }
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
---------------------------
Mark this as an experimental feature so that everyone knows it is intended for learning and debugging only.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com
Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 Documentation/admin-guide/kernel-parameters.txt | 6 ++++++
 arch/arm64/Kconfig                              | 9 ++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6b1fc6dedc034..425015359b5ce 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -848,6 +848,12 @@ [ARM64] Disable using TLB instruction to flush all PE within the same inner shareable domain.
+ NOTE(Important) + This feature is used for learning and debugging + only. Please don't enable it on commercial products. + If you know exactly what the impact of the feature is, + you can configure it as you do. + range use tlb invalidation ipi for flush_tlb_range page use tlb invalidation ipi for flush_tlb_page switch don't local_flush_tlb_mm when switch_mm diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f984280b0d577..02c2f528cb608 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1610,7 +1610,7 @@ endmenu menu "TLB options"
config ARM64_TLBI_IPI - bool "IPI based ARM64 TLB invalidation" + bool "IPI based ARM64 TLB invalidation(EXPERIMENTAL)" depends on ARM64 default n help @@ -1625,7 +1625,14 @@ config ARM64_TLBI_IPI performance noise, especially at large-scale HPC environment, which has more than thousand nodes with low latency interconnect.
+ NOTE(Important) + This feature is used for learning and debugging only. Please don't + enable it on commercial products. + If you know exactly what the impact of the feature is, you can + configure it as you do. + If unsure, say N. + endmenu
menu "CPU Power Management"
From: Marc Zyngier maz@kernel.org
mainline inclusion
from mainline-v5.12-rc3
commit 01dc9262ff5797b675c32c0c6bc682777d23de05
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
--------------------------------
It recently became apparent that the ARMv8 architecture has interesting rules regarding attributes being used when fetching instructions if the MMU is off at Stage-1.
In this situation, the CPU is allowed to fetch from the PoC and allocate into the I-cache (unless the memory is mapped with the XN attribute at Stage-2).
If we transpose this to vcpus sharing a single physical CPU, it is possible for a vcpu running with its MMU off to influence another vcpu running with its MMU on, as the latter is expected to fetch from the PoU (and self-patching code doesn't flush below that level).
In order to solve this, reuse the vcpu-private TLB invalidation code to apply the same policy to the I-cache, nuking it every time the vcpu runs on a physical CPU that ran another vcpu of the same VM in the past.
This involves renaming __kvm_tlb_flush_local_vmid() to __kvm_flush_cpu_context(), and inserting a local i-cache invalidation there.
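For reference, the renamed arm64 helper ends up looking roughly like this (a sketch reconstructed from the hunk below and its surrounding context):

    void __hyp_text __kvm_flush_cpu_context(struct kvm_vcpu *vcpu)
    {
            struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm);
            unsigned long flags;

            /* Switch to the requested VMID */
            __tlb_switch_to_guest()(kvm, &flags);

            __tlbi(vmalle1);          /* vcpu-private TLB invalidation */
            asm volatile("ic iallu"); /* new: nuke the local I-cache too */
            dsb(nsh);
            isb();

            __tlb_switch_to_host()(kvm, flags);
    }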
Cc: stable@vger.kernel.org
Signed-off-by: Marc Zyngier maz@kernel.org
Acked-by: Will Deacon will@kernel.org
Acked-by: Catalin Marinas catalin.marinas@arm.com
Link: https://lore.kernel.org/r/20210303164505.68492-1-maz@kernel.org
Signed-off-by: Zenghui Yu yuzenghui@huawei.com
Reviewed-by: Cheng Jian cj.chengjian@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 arch/arm/include/asm/kvm_asm.h   | 2 +-
 arch/arm/kvm/hyp/tlb.c           | 2 +-
 arch/arm64/include/asm/kvm_asm.h | 2 +-
 arch/arm64/kvm/hyp/tlb.c         | 3 ++-
 virt/kvm/arm/arm.c               | 7 ++++++-
 5 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 35491af879852..51c9f9836befa 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -66,9 +66,9 @@ extern char __kvm_hyp_init[]; extern char __kvm_hyp_init_end[];
extern void __kvm_flush_vm_context(void); +extern void __kvm_flush_cpu_context(struct kvm_vcpu *vcpu); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); extern void __kvm_tlb_flush_vmid(struct kvm *kvm); -extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c index c0edd450e1045..e8c45d088d0b1 100644 --- a/arch/arm/kvm/hyp/tlb.c +++ b/arch/arm/kvm/hyp/tlb.c @@ -56,7 +56,7 @@ void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) __kvm_tlb_flush_vmid(kvm); }
-void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) +void __hyp_text __kvm_flush_cpu_context(struct kvm_vcpu *vcpu) { struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm);
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index ff73f5462aca5..ab8dec4eb3aa8 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -63,9 +63,9 @@ extern char __kvm_hyp_init_end[]; extern char __kvm_hyp_vector[];
extern void __kvm_flush_vm_context(void); +extern void __kvm_flush_cpu_context(struct kvm_vcpu *vcpu); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); extern void __kvm_tlb_flush_vmid(struct kvm *kvm); -extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index 7fcc9c1a5f45c..c35e9b99b0c50 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -149,7 +149,7 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) __tlb_switch_to_host()(kvm, flags); }
-void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) +void __hyp_text __kvm_flush_cpu_context(struct kvm_vcpu *vcpu) { struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm); unsigned long flags; @@ -158,6 +158,7 @@ void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) __tlb_switch_to_guest()(kvm, &flags);
__tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb();
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index cdfe28311f414..c1ae17f68a05c 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -424,11 +424,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) cpu_data = this_cpu_ptr(&kvm_host_data);
/* + * We guarantee that both TLBs and I-cache are private to each + * vcpu. If detecting that a vcpu from the same VM has + * previously run on the same physical CPU, call into the + * hypervisor code to nuke the relevant contexts. + * * We might get preempted before the vCPU actually runs, but * over-invalidation doesn't affect correctness. */ if (*last_ran != vcpu->vcpu_id) { - kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu); + kvm_call_hyp(__kvm_flush_cpu_context, vcpu); *last_ran = vcpu->vcpu_id; }
From: Zenghui Yu yuzenghui@huawei.com
virt inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
--------------------------------
kvm_vcpu::pre_pcpu was introduced in commit bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked") but has so far only been used by x86.
Set it properly on the vcpu_put() path for arm64.
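The change itself is a one-liner in kvm_arch_vcpu_put() (see the diff below): remember the physical CPU before vcpu->cpu is reset.

    vcpu->pre_pcpu = vcpu->cpu;    /* last physical CPU this vcpu ran on */
    vcpu->cpu = -1;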
Signed-off-by: Zenghui Yu yuzenghui@huawei.com
Reviewed-by: Cheng Jian cj.chengjian@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 virt/kvm/arm/arm.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index c1ae17f68a05c..7cf274e6966e6 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -466,6 +466,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_vgic_put(vcpu); kvm_vcpu_pmu_restore_host(vcpu);
+ vcpu->pre_pcpu = vcpu->cpu; vcpu->cpu = -1;
kvm_arm_set_running_vcpu(NULL);
From: Zenghui Yu yuzenghui@huawei.com
virt inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BLL0
CVE: NA
--------------------------------
As per ARM DDI 0487G.a, setting the HCR_EL2.FB (Force broadcast) bit causes a given set of TLBI and IC instructions to be broadcast within the Inner Shareable domain when executed from EL1 (if HCR_EL2.TGE is 0).

People complain that this leads to bad performance when running guests on Kunpeng920, which has 128 physical CPUs in the Inner Shareable domain, especially when vcpus are pinned to physical CPUs, where broadcast invalidations are indeed unnecessary.

Introduce a new cmdline parameter, "kvm-arm.hcr_nofb"; setting it at boot time allows all vcpus to run without HCR_EL2.FB. Note that we then have to nuke the whole vcpu context in the general case (when a vcpu is loaded onto a new physical CPU).
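Put together, the patch boils down to three small pieces (a simplified sketch of the hunks that follow):

    /* 1. Boot-time switch */
    bool kvm_hcr_nofb;

    static int __init early_hcr_nofb_cfg(char *buf)
    {
            return strtobool(buf, &kvm_hcr_nofb);
    }
    early_param("kvm-arm.hcr_nofb", early_hcr_nofb_cfg);

    /* 2. vcpu_reset_hcr(): drop Force Broadcast when requested */
    if (unlikely(kvm_hcr_nofb))
            vcpu->arch.hcr_el2 &= ~HCR_FB;

    /* 3. kvm_arch_vcpu_load(): without FB, nuke the vcpu context when the
     *    vcpu moves to a new physical CPU and was not already flushed above
     */
    if (unlikely(kvm_hcr_nofb) && vcpu->pre_pcpu != cpu && !flushed)
            kvm_call_hyp(__kvm_flush_cpu_context, vcpu);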
Co-developed-by: Nianyao Tang tangnianyao@huawei.com
Signed-off-by: Nianyao Tang tangnianyao@huawei.com
Signed-off-by: Zenghui Yu yuzenghui@huawei.com
Reviewed-by: Cheng Jian cj.chengjian@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 arch/arm64/include/asm/kvm_emulate.h |  5 +++++
 virt/kvm/arm/arm.c                   | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 6106a85ae0be7..9ee37c0e763b2 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -52,6 +52,8 @@ static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) return !(vcpu->arch.hcr_el2 & HCR_RW); }
+extern bool kvm_hcr_nofb; + static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; @@ -76,6 +78,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) */ if (!vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID3; + + if (unlikely(kvm_hcr_nofb)) + vcpu->arch.hcr_el2 &= ~HCR_FB; }
static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 7cf274e6966e6..1271779873d36 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -74,6 +74,14 @@ static bool vgic_present; enum hisi_cpu_type hi_cpu_type = UNKNOWN_HI_TYPE; bool kvm_ncsnp_support;
+bool kvm_hcr_nofb; + +static int __init early_hcr_nofb_cfg(char *buf) +{ + return strtobool(buf, &kvm_hcr_nofb); +} +early_param("kvm-arm.hcr_nofb", early_hcr_nofb_cfg); + static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) @@ -419,6 +427,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { int *last_ran; kvm_host_data_t *cpu_data; + bool flushed = false;
last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran); cpu_data = this_cpu_ptr(&kvm_host_data); @@ -435,8 +444,17 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (*last_ran != vcpu->vcpu_id) { kvm_call_hyp(__kvm_flush_cpu_context, vcpu); *last_ran = vcpu->vcpu_id; + flushed = true; }
+ /* + * If FB (Force broadcast) is cleared, we have to nuke the + * vcpu context as well in case it is loaded on to the new + * physical CPU. + */ + if (unlikely(kvm_hcr_nofb) && vcpu->pre_pcpu != cpu && !flushed) + kvm_call_hyp(__kvm_flush_cpu_context, vcpu); + vcpu->cpu = cpu; vcpu->arch.host_cpu_context = &cpu_data->host_ctxt;