From: Sang Yan sangyan@huawei.com
hulk inclusion category: feature bugzilla: 48159 CVE: N/A
Introducing a feature of CPU PARK in order to save time of cpus down and up during kexec, which may cost 250ms of per cpu's down and 30ms of up.
As a result, for 128 cores, it costs more than 30 seconds to down and up cpus during kexec. Think about 256 cores and more.
CPU PARK is a state that cpu power-on and staying in spin loop, polling for exit chances, such as writing exit address.
Reserving a block of memory, to fill with cpu park text section, exit address and park-magic-flag of each cpu. In implementation, reserved one page for one cpu core.
Cpus going to park state instead of down in machine_shutdown(). Cpus going out of park state in smp_init instead of brought up.
One of cpu park sections in pre-reserved memory blocks,: +--------------+ + exit address + +--------------+ + park magic + +--------------+ + park codes + + . + + . + + . + +--------------+
Signed-off-by: Sang Yan sangyan@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com --- arch/arm64/Kconfig | 12 ++ arch/arm64/include/asm/kexec.h | 6 + arch/arm64/include/asm/smp.h | 15 +++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/cpu-park.S | 59 ++++++++++ arch/arm64/kernel/machine_kexec.c | 2 +- arch/arm64/kernel/process.c | 4 + arch/arm64/kernel/smp.c | 230 ++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/init.c | 55 +++++++++ 9 files changed, 383 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kernel/cpu-park.S
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b9c5654..0885668 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -345,6 +345,18 @@ config KASAN_SHADOW_OFFSET default 0xeffffff900000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS default 0xffffffffffffffff
+config ARM64_CPU_PARK + bool "Support CPU PARK on kexec" + depends on SMP + depends on KEXEC_CORE + help + This enables support for CPU PARK feature in + order to save time of cpu down to up. + CPU park is a state through kexec, spin loop + instead of cpu die before jumping to new kernel, + jumping out from loop to new kernel entry in + smp_init. + source "arch/arm64/Kconfig.platforms"
menu "Kernel Features" diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 79909ae..a133889 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -36,6 +36,11 @@
#define CRASH_ADDR_HIGH_MAX MEMBLOCK_ALLOC_ACCESSIBLE
+#ifdef CONFIG_ARM64_CPU_PARK +/* CPU park state flag: "park" */ +#define PARK_MAGIC 0x7061726b +#endif + #ifndef __ASSEMBLY__
/** @@ -104,6 +109,7 @@ static inline void crash_post_resume(void) {} #ifdef CONFIG_KEXEC_CORE extern void __init reserve_crashkernel(void); #endif +void machine_kexec_mask_interrupts(void);
#ifdef CONFIG_KEXEC_FILE #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 2e7f529..8c5d2d6 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -145,6 +145,21 @@ bool cpus_are_stuck_in_kernel(void);
extern void crash_smp_send_stop(void); extern bool smp_crash_stop_failed(void); +#ifdef CONFIG_ARM64_CPU_PARK +#define PARK_SECTION_SIZE 1024 +struct cpu_park_info { + /* Physical address of reserved park memory. */ + unsigned long start; + /* park reserve mem len should be PARK_SECTION_SIZE * NR_CPUS */ + unsigned long len; + /* Virtual address of reserved park memory. */ + unsigned long start_v; +}; +extern struct cpu_park_info park_info; +extern void enter_cpu_park(unsigned long text, unsigned long exit); +extern void do_cpu_park(unsigned long exit); +extern int kexec_smp_send_park(void); +#endif
#endif /* ifndef __ASSEMBLY__ */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 2621d5c..60478d2 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ cpu-reset.o +obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S new file mode 100644 index 0000000..10c685c --- /dev/null +++ b/arch/arm64/kernel/cpu-park.S @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * CPU park routines + * + * Copyright (C) 2020 Huawei Technologies., Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/kexec.h> +#include <asm/sysreg.h> +#include <asm/virt.h> + +.text +.pushsection .idmap.text, "awx" + +/* cpu park helper in idmap section */ +SYM_CODE_START(enter_cpu_park) + /* Clear sctlr_el1 flags. */ + mrs x12, sctlr_el1 + mov_q x13, SCTLR_ELx_FLAGS + bic x12, x12, x13 + pre_disable_mmu_workaround + msr sctlr_el1, x12 /* disable mmu */ + isb + + mov x18, x0 + mov x0, x1 /* secondary_entry addr */ + br x18 /* call do_cpu_park of each cpu */ +SYM_CODE_END(enter_cpu_park) + +.popsection + +SYM_CODE_START(do_cpu_park) + ldr x18, =PARK_MAGIC /* magic number "park" */ + add x1, x0, #8 + str x18, [x1] /* set on-park flag */ + dc civac, x1 /* flush cache of "park" */ + dsb nsh + isb + +.Lloop: + wfe + isb + ldr x19, [x0] + cmp x19, #0 /* test secondary_entry */ + b.eq .Lloop + + ic iallu /* invalidate the local I-cache */ + dsb nsh + isb + + br x19 /* jump to secondary_entry */ +SYM_CODE_END(do_cpu_park) + diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index a0b144c..f47ce96 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -213,7 +213,7 @@ void machine_kexec(struct kimage *kimage) BUG(); /* Should never get here. */ }
-static void machine_kexec_mask_interrupts(void) +void machine_kexec_mask_interrupts(void) { unsigned int i; struct irq_desc *desc; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 73e3b32..10cffee 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -146,6 +146,10 @@ void arch_cpu_idle_dead(void) */ void machine_shutdown(void) { +#ifdef CONFIG_ARM64_CPU_PARK + if (kexec_smp_send_park() == 0) + return; +#endif smp_shutdown_nonboot_cpus(reboot_cpu); }
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 18e9727..dea67d0 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -32,6 +32,7 @@ #include <linux/irq_work.h> #include <linux/kernel_stat.h> #include <linux/kexec.h> + #include <linux/kvm_host.h>
#include <asm/alternative.h> @@ -93,6 +94,167 @@ static inline int op_cpu_kill(unsigned int cpu) } #endif
+#ifdef CONFIG_ARM64_CPU_PARK +struct cpu_park_section { + unsigned long exit; /* exit address of park look */ + unsigned long magic; /* maigc represent park state */ + char text[0]; /* text section of park */ +}; + +static int mmap_cpu_park_mem(void) +{ + if (!park_info.start) + return -ENOMEM; + + if (park_info.start_v) + return 0; + + park_info.start_v = (unsigned long)__ioremap(park_info.start, + park_info.len, + PAGE_KERNEL_EXEC); + if (!park_info.start_v) { + pr_warn("map park memory failed."); + return -ENOMEM; + } + + return 0; +} + +static inline unsigned long cpu_park_section_v(unsigned int cpu) +{ + return park_info.start_v + PARK_SECTION_SIZE * (cpu - 1); +} + +static inline unsigned long cpu_park_section_p(unsigned int cpu) +{ + return park_info.start + PARK_SECTION_SIZE * (cpu - 1); +} + +/* + * Write the secondary_entry to exit section of park state. + * Then the secondary cpu will jump straight into the kernel + * by the secondary_entry. + */ +static int write_park_exit(unsigned int cpu) +{ + struct cpu_park_section *park_section; + unsigned long *park_exit; + unsigned long *park_text; + + if (mmap_cpu_park_mem() != 0) + return -EPERM; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + park_exit = &park_section->exit; + park_text = (unsigned long *)park_section->text; + pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx", + (unsigned long)park_text, *park_text, + (unsigned long)do_cpu_park, + *(unsigned long *)do_cpu_park); + + /* + * Test first 8 bytes to determine + * whether needs to write cpu park exit. + */ + if (*park_text == *(unsigned long *)do_cpu_park) { + writeq_relaxed(__pa_symbol(secondary_entry), park_exit); + __flush_dcache_area((__force void *)park_exit, + sizeof(unsigned long)); + flush_icache_range((unsigned long)park_exit, + (unsigned long)(park_exit + 1)); + sev(); + dsb(sy); + isb(); + + pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.", + cpu, *park_exit, (unsigned long)park_exit); + pr_info("Boot cpu %u from PARK state.", cpu); + return 0; + } + + return -EPERM; +} + +/* Install cpu park sections for the specific cpu. */ +static int install_cpu_park(unsigned int cpu) +{ + struct cpu_park_section *park_section; + unsigned long *park_exit; + unsigned long *park_magic; + unsigned long park_text_len; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx", + cpu, (unsigned long)park_section, + (unsigned long)(park_section->text)); + + park_exit = &park_section->exit; + park_magic = &park_section->magic; + park_text_len = PARK_SECTION_SIZE - sizeof(struct cpu_park_section); + + *park_exit = 0UL; + *park_magic = 0UL; + memcpy((void *)park_section->text, do_cpu_park, park_text_len); + __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); + + return 0; +} + +static int uninstall_cpu_park(unsigned int cpu) +{ + unsigned long park_section; + + if (mmap_cpu_park_mem() != 0) + return -EPERM; + + park_section = cpu_park_section_v(cpu); + memset((void *)park_section, 0, PARK_SECTION_SIZE); + __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); + + return 0; +} + +static int cpu_wait_park(unsigned int cpu) +{ + long timeout; + struct cpu_park_section *park_section; + + volatile unsigned long *park_magic; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + park_magic = &park_section->magic; + + timeout = USEC_PER_SEC; + while (*park_magic != PARK_MAGIC && timeout--) + udelay(1); + + if (timeout > 0) + pr_debug("cpu %u park done.", cpu); + else + pr_err("cpu %u park failed.", cpu); + + return *park_magic == PARK_MAGIC; +} + +static void cpu_park(unsigned int cpu) +{ + unsigned long park_section_p; + unsigned long park_exit_phy; + unsigned long do_park; + typeof(enter_cpu_park) *park; + + park_section_p = cpu_park_section_p(cpu); + park_exit_phy = park_section_p; + pr_debug("Go to park cpu %u exit address 0x%lx", cpu, park_exit_phy); + + do_park = park_section_p + sizeof(struct cpu_park_section); + park = (void *)__pa_symbol(enter_cpu_park); + + cpu_install_idmap(); + park(do_park, park_exit_phy); + unreachable(); +} +#endif
/* * Boot a secondary CPU, and assign it the specified idle task. @@ -102,6 +264,10 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle) { const struct cpu_operations *ops = get_cpu_ops(cpu);
+#ifdef CONFIG_ARM64_CPU_PARK + if (write_park_exit(cpu) == 0) + return 0; +#endif if (ops->cpu_boot) return ops->cpu_boot(cpu);
@@ -131,6 +297,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) return ret; }
+#ifdef CONFIG_ARM64_CPU_PARK + uninstall_cpu_park(cpu); +#endif /* * CPU was successfully started, wait for it to come online or * time out. @@ -844,10 +1013,32 @@ void arch_irq_work_raise(void)
static void local_cpu_stop(void) { +#ifdef CONFIG_ARM64_CPU_PARK + int cpu; + const struct cpu_operations *ops = NULL; +#endif + set_cpu_online(smp_processor_id(), false);
local_daif_mask(); sdei_mask_local_cpu(); + +#ifdef CONFIG_ARM64_CPU_PARK + /* + * Go to cpu park state. + * Otherwise go to cpu die. + */ + cpu = smp_processor_id(); + if (kexec_in_progress && park_info.start_v) { + machine_kexec_mask_interrupts(); + cpu_park(cpu); + + ops = get_cpu_ops(cpu); + if (ops && ops->cpu_die) + ops->cpu_die(cpu); + } +#endif + cpu_park_loop(); }
@@ -1053,6 +1244,45 @@ void smp_send_stop(void) sdei_mask_local_cpu(); }
+#ifdef CONFIG_ARM64_CPU_PARK +int kexec_smp_send_park(void) +{ + unsigned long cpu; + + if (WARN_ON(!kexec_in_progress)) { + pr_crit("%s called not in kexec progress.", __func__); + return -EPERM; + } + + if (mmap_cpu_park_mem() != 0) { + pr_info("no cpuparkmem, goto normal way."); + return -EPERM; + } + + local_irq_disable(); + + if (num_online_cpus() > 1) { + cpumask_t mask; + + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + for_each_cpu(cpu, &mask) + install_cpu_park(cpu); + smp_cross_call(&mask, IPI_CPU_STOP); + + /* Wait for other CPUs to park */ + for_each_cpu(cpu, &mask) + cpu_wait_park(cpu); + pr_info("smp park other cpus done\n"); + } + + sdei_mask_local_cpu(); + + return 0; +} +#endif + #ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 794f992..d01259c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -236,6 +236,57 @@ static void __init fdt_enforce_memory_region(void) memblock_add(usable_rgns[1].base, usable_rgns[1].size); }
+#ifdef CONFIG_ARM64_CPU_PARK +struct cpu_park_info park_info = { + .start = 0, + .len = PARK_SECTION_SIZE * NR_CPUS, + .start_v = 0, +}; + +static int __init parse_park_mem(char *p) +{ + if (!p) + return 0; + + park_info.start = PAGE_ALIGN(memparse(p, NULL)); + if (park_info.start == 0) + pr_info("cpu park mem params[%s]", p); + + return 0; +} +early_param("cpuparkmem", parse_park_mem); + +static int __init reserve_park_mem(void) +{ + if (park_info.start == 0 || park_info.len == 0) + return 0; + + park_info.start = PAGE_ALIGN(park_info.start); + park_info.len = PAGE_ALIGN(park_info.len); + + if (!memblock_is_region_memory(park_info.start, park_info.len)) { + pr_warn("cannot reserve park mem: region is not memory!"); + goto out; + } + + if (memblock_is_region_reserved(park_info.start, park_info.len)) { + pr_warn("cannot reserve park mem: region overlaps reserved memory!"); + goto out; + } + + memblock_remove(park_info.start, park_info.len); + pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld MB)", + park_info.start, park_info.start + park_info.len, + park_info.len >> 20); + + return 0; +out: + park_info.start = 0; + park_info.len = 0; + return -EINVAL; +} +#endif + void __init arm64_memblock_init(void) { const s64 linear_region_size = BIT(vabits_actual - 1); @@ -357,6 +408,10 @@ void __init arm64_memblock_init(void)
reserve_crashkernel();
+#ifdef CONFIG_ARM64_CPU_PARK + reserve_park_mem(); +#endif + reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;