From: Sang Yan sangyan@huawei.com
hulk inclusion from openEuler-21.03 commit 6d3c56fa93e3 ("arm64: smp: Add support for cpu park") category: feature bugzilla: 48159 CVE: N/A
------------------------------
Introducing a feature of CPU PARK in order to save time of cpus down and up during kexec, which may cost 250ms of per cpu's down and 30ms of up.
As a result, for 128 cores, it costs more than 30 seconds to down and up cpus during kexec. Think about 256 cores and more.
CPU PARK is a state that cpu power-on and staying in spin loop, polling for exit chances, such as writing exit address.
Reserving a block of memory, to fill with cpu park text section, exit address and park-magic-flag of each cpu. In implementation, reserved one page for one cpu core.
Cpus going to park state instead of down in machine_shutdown(). Cpus going out of park state in smp_init instead of brought up.
One of cpu park sections in pre-reserved memory blocks,: +--------------+ + exit address + +--------------+ + park magic + +--------------+ + park codes + + . + + . + + . + +--------------+
Signed-off-by: Sang Yan sangyan@huawei.com --- arch/arm64/Kconfig | 12 +++ arch/arm64/include/asm/kexec.h | 7 ++ arch/arm64/include/asm/smp.h | 16 +++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/cpu-park.S | 59 ++++++++++ arch/arm64/kernel/machine_kexec.c | 2 +- arch/arm64/kernel/process.c | 4 + arch/arm64/kernel/smp.c | 222 ++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/init.c | 64 +++++++++++ 9 files changed, 386 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kernel/cpu-park.S
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 37c5064..dec367a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -964,6 +964,18 @@ config CRASH_DUMP
For more details see Documentation/kdump/kdump.txt
+config ARM64_CPU_PARK + bool "Support CPU PARK on kexec" + depends on SMP + depends on KEXEC_CORE + help + This enables support for CPU PARK feature in + order to save time of cpu down to up. + CPU park is a state through kexec, spin loop + instead of cpu die before jumping to new kernel, + jumping out from loop to new kernel entry in + smp_init. + config XEN_DOM0 def_bool y depends on XEN diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 87e75d0..30ab1fa 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -31,6 +31,11 @@ /* 2M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_2M
+#ifdef CONFIG_ARM64_CPU_PARK +/* CPU park state flag: "park" */ +#define PARK_MAGIC 0x7061726b +#endif + #ifndef __ASSEMBLY__
/** @@ -96,6 +101,8 @@ static inline void crash_prepare_suspend(void) {} static inline void crash_post_resume(void) {} #endif
+void machine_kexec_mask_interrupts(void); + #endif /* __ASSEMBLY__ */
#endif diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 403c22f..f0898dc 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -153,6 +153,22 @@ extern bool smp_crash_stop_failed(void);
void ipi_set_nmi_prio(void __iomem *base, u8 prio);
+#ifdef CONFIG_ARM64_CPU_PARK +#define PARK_SECTION_SIZE 1024 +struct cpu_park_info { + /* Physical address of reserved park memory. */ + unsigned long start; + /* park reserve mem len should be PARK_SECTION_SIZE * NR_CPUS */ + unsigned long len; + /* Virtual address of reserved park memory. */ + unsigned long start_v; +}; +extern struct cpu_park_info park_info; +extern void enter_cpu_park(unsigned long text, unsigned long exit); +extern void do_cpu_park(unsigned long exit); +extern int kexec_smp_send_park(void); +#endif + #endif /* ifndef __ASSEMBLY__ */
#endif /* ifndef __ASM_SMP_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 6dd565a..d2861d7 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -56,6 +56,7 @@ arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ cpu-reset.o +arm64-obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o arm64-obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S new file mode 100644 index 0000000..fc8f61e --- /dev/null +++ b/arch/arm64/kernel/cpu-park.S @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * CPU park routines + * + * Copyright (C) 2020 Huawei Technologies., Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/kexec.h> +#include <asm/sysreg.h> +#include <asm/virt.h> + +.text +.pushsection .idmap.text, "awx" + +/* cpu park helper in idmap section */ +ENTRY(enter_cpu_park) + /* Clear sctlr_el1 flags. */ + mrs x12, sctlr_el1 + mov_q x13, SCTLR_ELx_FLAGS + bic x12, x12, x13 + pre_disable_mmu_workaround + msr sctlr_el1, x12 /* disable mmu */ + isb + + mov x18, x0 + mov x0, x1 /* secondary_entry addr */ + br x18 /* call do_cpu_park of each cpu */ +ENDPROC(enter_cpu_park) + +.popsection + +ENTRY(do_cpu_park) + ldr x18, =PARK_MAGIC /* magic number "park" */ + add x1, x0, #8 + str x18, [x1] /* set on-park flag */ + dc civac, x1 /* flush cache of "park" */ + dsb nsh + isb + +.Lloop: + wfe + isb + ldr x19, [x0] + cmp x19, #0 /* test secondary_entry */ + b.eq .Lloop + + ic iallu /* invalidate the local I-cache */ + dsb nsh + isb + + br x19 /* jump to secondary_entry */ +ENDPROC(do_cpu_park) + diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 7aef7ac..b41d6d4 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -221,7 +221,7 @@ void machine_kexec(struct kimage *kimage) BUG(); /* Should never get here. */ }
-static void machine_kexec_mask_interrupts(void) +void machine_kexec_mask_interrupts(void) { unsigned int i; struct irq_desc *desc; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 8342459..ddb1911 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -151,6 +151,10 @@ void arch_cpu_idle_dead(void) */ void machine_shutdown(void) { +#ifdef CONFIG_ARM64_CPU_PARK + if (kexec_smp_send_park() == 0) + return; +#endif disable_nonboot_cpus(); }
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 5235b9a..090e760 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -98,6 +98,167 @@ static inline int op_cpu_kill(unsigned int cpu) } #endif
+#ifdef CONFIG_ARM64_CPU_PARK +struct cpu_park_section { + unsigned long exit; /* exit address of park look */ + unsigned long magic; /* maigc represent park state */ + char text[0]; /* text section of park */ +}; + +static int mmap_cpu_park_mem(void) +{ + if (!park_info.start) + return -ENOMEM; + + if (park_info.start_v) + return 0; + + park_info.start_v = (unsigned long)__ioremap(park_info.start, + park_info.len, + PAGE_KERNEL_EXEC); + if (!park_info.start_v) { + pr_warn("map park memory failed."); + return -ENOMEM; + } + + return 0; +} + +static inline unsigned long cpu_park_section_v(unsigned int cpu) +{ + return park_info.start_v + PARK_SECTION_SIZE * (cpu - 1); +} + +static inline unsigned long cpu_park_section_p(unsigned int cpu) +{ + return park_info.start + PARK_SECTION_SIZE * (cpu - 1); +} + +/* + * Write the secondary_entry to exit section of park state. + * Then the secondary cpu will jump straight into the kernel + * by the secondary_entry. + */ +static int write_park_exit(unsigned int cpu) +{ + struct cpu_park_section *park_section; + unsigned long *park_exit; + unsigned long *park_text; + + if (mmap_cpu_park_mem() != 0) + return -EPERM; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + park_exit = &park_section->exit; + park_text = (unsigned long *)park_section->text; + pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx", + (unsigned long)park_text, *park_text, + (unsigned long)do_cpu_park, + *(unsigned long *)do_cpu_park); + + /* + * Test first 8 bytes to determine + * whether needs to write cpu park exit. + */ + if (*park_text == *(unsigned long *)do_cpu_park) { + writeq_relaxed(__pa_symbol(secondary_entry), park_exit); + __flush_dcache_area((__force void *)park_exit, + sizeof(unsigned long)); + flush_icache_range((unsigned long)park_exit, + (unsigned long)(park_exit + 1)); + sev(); + dsb(sy); + isb(); + + pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.", + cpu, *park_exit, (unsigned long)park_exit); + pr_info("Boot cpu %u from PARK state.", cpu); + return 0; + } + + return -EPERM; +} + +/* Install cpu park sections for the specific cpu. */ +static int install_cpu_park(unsigned int cpu) +{ + struct cpu_park_section *park_section; + unsigned long *park_exit; + unsigned long *park_magic; + unsigned long park_text_len; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx", + cpu, (unsigned long)park_section, + (unsigned long)(park_section->text)); + + park_exit = &park_section->exit; + park_magic = &park_section->magic; + park_text_len = PARK_SECTION_SIZE - sizeof(struct cpu_park_section); + + *park_exit = 0UL; + *park_magic = 0UL; + memcpy((void *)park_section->text, do_cpu_park, park_text_len); + __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); + + return 0; +} + +static int uninstall_cpu_park(unsigned int cpu) +{ + unsigned long park_section; + + if (mmap_cpu_park_mem() != 0) + return -EPERM; + + park_section = cpu_park_section_v(cpu); + memset((void *)park_section, 0, PARK_SECTION_SIZE); + __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); + + return 0; +} + +static int cpu_wait_park(unsigned int cpu) +{ + long timeout; + struct cpu_park_section *park_section; + + volatile unsigned long *park_magic; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + park_magic = &park_section->magic; + + timeout = USEC_PER_SEC; + while (*park_magic != PARK_MAGIC && timeout--) + udelay(1); + + if (timeout > 0) + pr_debug("cpu %u park done.", cpu); + else + pr_err("cpu %u park failed.", cpu); + + return *park_magic == PARK_MAGIC; +} + +static void cpu_park(unsigned int cpu) +{ + unsigned long park_section_p; + unsigned long park_exit_phy; + unsigned long do_park; + typeof(enter_cpu_park) *park; + + park_section_p = cpu_park_section_p(cpu); + park_exit_phy = park_section_p; + pr_debug("Go to park cpu %u exit address 0x%lx", cpu, park_exit_phy); + + do_park = park_section_p + sizeof(struct cpu_park_section); + park = (void *)__pa_symbol(enter_cpu_park); + + cpu_install_idmap(); + park(do_park, park_exit_phy); + unreachable(); +} +#endif
/* * Boot a secondary CPU, and assign it the specified idle task. @@ -105,6 +266,10 @@ static inline int op_cpu_kill(unsigned int cpu) */ static int boot_secondary(unsigned int cpu, struct task_struct *idle) { +#ifdef CONFIG_ARM64_CPU_PARK + if (write_park_exit(cpu) == 0) + return 0; +#endif if (cpu_ops[cpu]->cpu_boot) return cpu_ops[cpu]->cpu_boot(cpu);
@@ -153,6 +318,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) return ret; }
+#ifdef CONFIG_ARM64_CPU_PARK + uninstall_cpu_park(cpu); +#endif + secondary_data.task = NULL; secondary_data.stack = NULL; status = READ_ONCE(secondary_data.status); @@ -863,6 +1032,20 @@ static void ipi_cpu_stop(unsigned int cpu) local_daif_mask(); sdei_mask_local_cpu();
+#ifdef CONFIG_ARM64_CPU_PARK + /* + * Go to cpu park state. + * Otherwise go to cpu die. + */ + if (kexec_in_progress && park_info.start_v) { + machine_kexec_mask_interrupts(); + cpu_park(cpu); + + if (cpu_ops[cpu]->cpu_die) + cpu_ops[cpu]->cpu_die(cpu); + } +#endif + while (1) cpu_relax(); } @@ -1037,6 +1220,45 @@ void smp_send_stop(void) sdei_mask_local_cpu(); }
+#ifdef CONFIG_ARM64_CPU_PARK +int kexec_smp_send_park(void) +{ + unsigned long cpu; + + if (WARN_ON(!kexec_in_progress)) { + pr_crit("%s called not in kexec progress.", __func__); + return -EPERM; + } + + if (mmap_cpu_park_mem() != 0) { + pr_info("no cpuparkmem, goto normal way."); + return -EPERM; + } + + local_irq_disable(); + + if (num_online_cpus() > 1) { + cpumask_t mask; + + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + for_each_cpu(cpu, &mask) + install_cpu_park(cpu); + smp_cross_call(&mask, IPI_CPU_STOP); + + /* Wait for other CPUs to park */ + for_each_cpu(cpu, &mask) + cpu_wait_park(cpu); + pr_info("smp park other cpus done\n"); + } + + sdei_mask_local_cpu(); + + return 0; +} +#endif + #ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0773205..3a914ff 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -466,6 +466,57 @@ static int __init parse_memmap_opt(char *str) } early_param("memmap", parse_memmap_opt);
+#ifdef CONFIG_ARM64_CPU_PARK +struct cpu_park_info park_info = { + .start = 0, + .len = PARK_SECTION_SIZE * NR_CPUS, + .start_v = 0, +}; + +static int __init parse_park_mem(char *p) +{ + if (!p) + return 0; + + park_info.start = PAGE_ALIGN(memparse(p, NULL)); + if (park_info.start == 0) + pr_info("cpu park mem params[%s]", p); + + return 0; +} +early_param("cpuparkmem", parse_park_mem); + +static int __init reserve_park_mem(void) +{ + if (park_info.start == 0 || park_info.len == 0) + return 0; + + park_info.start = PAGE_ALIGN(park_info.start); + park_info.len = PAGE_ALIGN(park_info.len); + + if (!memblock_is_region_memory(park_info.start, park_info.len)) { + pr_warn("cannot reserve park mem: region is not memory!"); + goto out; + } + + if (memblock_is_region_reserved(park_info.start, park_info.len)) { + pr_warn("cannot reserve park mem: region overlaps reserved memory!"); + goto out; + } + + memblock_remove(park_info.start, park_info.len); + pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld MB)", + park_info.start, park_info.start + park_info.len, + park_info.len >> 20); + + return 0; +out: + park_info.start = 0; + park_info.len = 0; + return -EINVAL; +} +#endif + void __init arm64_memblock_init(void) { const s64 linear_region_size = -(s64)PAGE_OFFSET; @@ -582,6 +633,19 @@ void __init arm64_memblock_init(void) else arm64_dma_phys_limit = PHYS_MASK + 1;
+ /* + * Reserve park memory before crashkernel and quick kexec. + * Because park memory must be specified by address, but + * crashkernel and quickkexec may be specified by memory length, + * then find one sutiable memory region to reserve. + * + * So reserve park memory firstly is better, but it may cause + * crashkernel or quickkexec reserving failed. + */ +#ifdef CONFIG_ARM64_CPU_PARK + reserve_park_mem(); +#endif + reserve_crashkernel();
reserve_elfcorehdr();