From: Sang Yan <sangyan(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
Introducing a feature of CPU PARK in order to save time
of cpus down and up during kexec, which may cost 250ms of
per cpu's down and 30ms of up.
As a result, for 128 cores, it costs more than 30 seconds
to down and up cpus during kexec. Think about 256 cores and more.
CPU PARK is a state that cpu power-on and staying in spin loop, polling
for exit chances, such as writing exit address.
Reserving a block of memory, to fill with cpu park text section,
exit address and park-magic-flag of each cpu. In implementation,
reserved one page for one cpu core.
Cpus going to park state instead of down in machine_shutdown().
Cpus going out of park state in smp_init instead of brought up.
One of cpu park sections in pre-reserved memory blocks,:
+--------------+
+ exit address +
+--------------+
+ park magic +
+--------------+
+ park codes +
+ . +
+ . +
+ . +
+--------------+
Signed-off-by: Sang Yan <sangyan(a)huawei.com>
---
arch/arm64/Kconfig | 12 ++
arch/arm64/include/asm/kexec.h | 6 +
arch/arm64/include/asm/smp.h | 15 +++
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/cpu-park.S | 59 ++++++++++
arch/arm64/kernel/machine_kexec.c | 2 +-
arch/arm64/kernel/process.c | 4 +
arch/arm64/kernel/smp.c | 229 ++++++++++++++++++++++++++++++++++++++
arch/arm64/mm/init.c | 55 +++++++++
9 files changed, 382 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/kernel/cpu-park.S
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b9c5654..0885668 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -345,6 +345,18 @@ config KASAN_SHADOW_OFFSET
default 0xeffffff900000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS
default 0xffffffffffffffff
+config ARM64_CPU_PARK
+ bool "Support CPU PARK on kexec"
+ depends on SMP
+ depends on KEXEC_CORE
+ help
+ This enables support for CPU PARK feature in
+ order to save time of cpu down to up.
+ CPU park is a state through kexec, spin loop
+ instead of cpu die before jumping to new kernel,
+ jumping out from loop to new kernel entry in
+ smp_init.
+
source "arch/arm64/Kconfig.platforms"
menu "Kernel Features"
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 79909ae..a133889 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -36,6 +36,11 @@
#define CRASH_ADDR_HIGH_MAX MEMBLOCK_ALLOC_ACCESSIBLE
+#ifdef CONFIG_ARM64_CPU_PARK
+/* CPU park state flag: "park" */
+#define PARK_MAGIC 0x7061726b
+#endif
+
#ifndef __ASSEMBLY__
/**
@@ -104,6 +109,7 @@ static inline void crash_post_resume(void) {}
#ifdef CONFIG_KEXEC_CORE
extern void __init reserve_crashkernel(void);
#endif
+void machine_kexec_mask_interrupts(void);
#ifdef CONFIG_KEXEC_FILE
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2e7f529..8c5d2d6 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -145,6 +145,21 @@ bool cpus_are_stuck_in_kernel(void);
extern void crash_smp_send_stop(void);
extern bool smp_crash_stop_failed(void);
+#ifdef CONFIG_ARM64_CPU_PARK
+#define PARK_SECTION_SIZE 1024
+struct cpu_park_info {
+ /* Physical address of reserved park memory. */
+ unsigned long start;
+ /* park reserve mem len should be PARK_SECTION_SIZE * NR_CPUS */
+ unsigned long len;
+ /* Virtual address of reserved park memory. */
+ unsigned long start_v;
+};
+extern struct cpu_park_info park_info;
+extern void enter_cpu_park(unsigned long text, unsigned long exit);
+extern void do_cpu_park(unsigned long exit);
+extern int kexec_smp_send_park(void);
+#endif
#endif /* ifndef __ASSEMBLY__ */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2621d5c..60478d2 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \
cpu-reset.o
+obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o
arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S
new file mode 100644
index 0000000..10c685c
--- /dev/null
+++ b/arch/arm64/kernel/cpu-park.S
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CPU park routines
+ *
+ * Copyright (C) 2020 Huawei Technologies., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/kexec.h>
+#include <asm/sysreg.h>
+#include <asm/virt.h>
+
+.text
+.pushsection .idmap.text, "awx"
+
+/* cpu park helper in idmap section */
+SYM_CODE_START(enter_cpu_park)
+ /* Clear sctlr_el1 flags. */
+ mrs x12, sctlr_el1
+ mov_q x13, SCTLR_ELx_FLAGS
+ bic x12, x12, x13
+ pre_disable_mmu_workaround
+ msr sctlr_el1, x12 /* disable mmu */
+ isb
+
+ mov x18, x0
+ mov x0, x1 /* secondary_entry addr */
+ br x18 /* call do_cpu_park of each cpu */
+SYM_CODE_END(enter_cpu_park)
+
+.popsection
+
+SYM_CODE_START(do_cpu_park)
+ ldr x18, =PARK_MAGIC /* magic number "park" */
+ add x1, x0, #8
+ str x18, [x1] /* set on-park flag */
+ dc civac, x1 /* flush cache of "park" */
+ dsb nsh
+ isb
+
+.Lloop:
+ wfe
+ isb
+ ldr x19, [x0]
+ cmp x19, #0 /* test secondary_entry */
+ b.eq .Lloop
+
+ ic iallu /* invalidate the local I-cache */
+ dsb nsh
+ isb
+
+ br x19 /* jump to secondary_entry */
+SYM_CODE_END(do_cpu_park)
+
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index a0b144c..f47ce96 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -213,7 +213,7 @@ void machine_kexec(struct kimage *kimage)
BUG(); /* Should never get here. */
}
-static void machine_kexec_mask_interrupts(void)
+void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 73e3b32..10cffee 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -146,6 +146,10 @@ void arch_cpu_idle_dead(void)
*/
void machine_shutdown(void)
{
+#ifdef CONFIG_ARM64_CPU_PARK
+ if (kexec_smp_send_park() == 0)
+ return;
+#endif
smp_shutdown_nonboot_cpus(reboot_cpu);
}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 18e9727..bc475d5 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -32,6 +32,8 @@
#include <linux/irq_work.h>
#include <linux/kernel_stat.h>
#include <linux/kexec.h>
+#include <linux/console.h>
+
#include <linux/kvm_host.h>
#include <asm/alternative.h>
@@ -93,6 +95,167 @@ static inline int op_cpu_kill(unsigned int cpu)
}
#endif
+#ifdef CONFIG_ARM64_CPU_PARK
+struct cpu_park_section {
+ unsigned long exit; /* exit address of park look */
+ unsigned long magic; /* maigc represent park state */
+ char text[0]; /* text section of park */
+};
+
+static int mmap_cpu_park_mem(void)
+{
+ if (!park_info.start)
+ return -ENOMEM;
+
+ if (park_info.start_v)
+ return 0;
+
+ park_info.start_v = (unsigned long)__ioremap(park_info.start,
+ park_info.len,
+ PAGE_KERNEL_EXEC);
+ if (!park_info.start_v) {
+ pr_warn("map park memory failed.");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static inline unsigned long cpu_park_section_v(unsigned int cpu)
+{
+ return park_info.start_v + PARK_SECTION_SIZE * (cpu - 1);
+}
+
+static inline unsigned long cpu_park_section_p(unsigned int cpu)
+{
+ return park_info.start + PARK_SECTION_SIZE * (cpu - 1);
+}
+
+/*
+ * Write the secondary_entry to exit section of park state.
+ * Then the secondary cpu will jump straight into the kernel
+ * by the secondary_entry.
+ */
+static int write_park_exit(unsigned int cpu)
+{
+ struct cpu_park_section *park_section;
+ unsigned long *park_exit;
+ unsigned long *park_text;
+
+ if (mmap_cpu_park_mem() != 0)
+ return -EPERM;
+
+ park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+ park_exit = &park_section->exit;
+ park_text = (unsigned long *)park_section->text;
+ pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx",
+ (unsigned long)park_text, *park_text,
+ (unsigned long)do_cpu_park,
+ *(unsigned long *)do_cpu_park);
+
+ /*
+ * Test first 8 bytes to determine
+ * whether needs to write cpu park exit.
+ */
+ if (*park_text == *(unsigned long *)do_cpu_park) {
+ writeq_relaxed(__pa_symbol(secondary_entry), park_exit);
+ __flush_dcache_area((__force void *)park_exit,
+ sizeof(unsigned long));
+ flush_icache_range((unsigned long)park_exit,
+ (unsigned long)(park_exit + 1));
+ sev();
+ dsb(sy);
+ isb();
+
+ pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.",
+ cpu, *park_exit, (unsigned long)park_exit);
+ pr_info("Boot cpu %u from PARK state.", cpu);
+ return 0;
+ }
+
+ return -EPERM;
+}
+
+/* Install cpu park sections for the specific cpu. */
+static int install_cpu_park(unsigned int cpu)
+{
+ struct cpu_park_section *park_section;
+ unsigned long *park_exit;
+ unsigned long *park_magic;
+ unsigned long park_text_len;
+
+ park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+ pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx",
+ cpu, (unsigned long)park_section,
+ (unsigned long)(park_section->text));
+
+ park_exit = &park_section->exit;
+ park_magic = &park_section->magic;
+ park_text_len = PARK_SECTION_SIZE - sizeof(struct cpu_park_section);
+
+ *park_exit = 0UL;
+ *park_magic = 0UL;
+ memcpy((void *)park_section->text, do_cpu_park, park_text_len);
+ __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE);
+
+ return 0;
+}
+
+static int uninstall_cpu_park(unsigned int cpu)
+{
+ unsigned long park_section;
+
+ if (mmap_cpu_park_mem() != 0)
+ return -EPERM;
+
+ park_section = cpu_park_section_v(cpu);
+ memset((void *)park_section, 0, PARK_SECTION_SIZE);
+ __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE);
+
+ return 0;
+}
+
+static int cpu_wait_park(unsigned int cpu)
+{
+ long timeout;
+ struct cpu_park_section *park_section;
+
+ volatile unsigned long *park_magic;
+
+ park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+ park_magic = &park_section->magic;
+
+ timeout = USEC_PER_SEC;
+ while (*park_magic != PARK_MAGIC && timeout--)
+ udelay(1);
+
+ if (timeout > 0)
+ pr_debug("cpu %u park done.", cpu);
+ else
+ pr_err("cpu %u park failed.", cpu);
+
+ return *park_magic == PARK_MAGIC;
+}
+
+static void cpu_park(unsigned int cpu)
+{
+ unsigned long park_section_p;
+ unsigned long park_exit_phy;
+ unsigned long do_park;
+ typeof(enter_cpu_park) *park;
+
+ park_section_p = cpu_park_section_p(cpu);
+ park_exit_phy = park_section_p;
+ pr_debug("Go to park cpu %u exit address 0x%lx", cpu, park_exit_phy);
+
+ do_park = park_section_p + sizeof(struct cpu_park_section);
+ park = (void *)__pa_symbol(enter_cpu_park);
+
+ cpu_install_idmap();
+ park(do_park, park_exit_phy);
+ unreachable();
+}
+#endif
/*
* Boot a secondary CPU, and assign it the specified idle task.
@@ -102,6 +265,10 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle)
{
const struct cpu_operations *ops = get_cpu_ops(cpu);
+#ifdef CONFIG_ARM64_CPU_PARK
+ if (write_park_exit(cpu) == 0)
+ return 0;
+#endif
if (ops->cpu_boot)
return ops->cpu_boot(cpu);
@@ -131,6 +298,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
return ret;
}
+#ifdef CONFIG_ARM64_CPU_PARK
+ uninstall_cpu_park(cpu);
+#endif
/*
* CPU was successfully started, wait for it to come online or
* time out.
@@ -844,10 +1014,30 @@ void arch_irq_work_raise(void)
static void local_cpu_stop(void)
{
+ int cpu;
+ const struct cpu_operations *ops = NULL;
+
set_cpu_online(smp_processor_id(), false);
local_daif_mask();
sdei_mask_local_cpu();
+
+#ifdef CONFIG_ARM64_CPU_PARK
+ /*
+ * Go to cpu park state.
+ * Otherwise go to cpu die.
+ */
+ cpu = smp_processor_id();
+ if (kexec_in_progress && park_info.start_v) {
+ machine_kexec_mask_interrupts();
+ cpu_park(cpu);
+
+ ops = get_cpu_ops(cpu);
+ if (ops && ops->cpu_die)
+ ops->cpu_die(cpu);
+ }
+#endif
+
cpu_park_loop();
}
@@ -1053,6 +1243,45 @@ void smp_send_stop(void)
sdei_mask_local_cpu();
}
+#ifdef CONFIG_ARM64_CPU_PARK
+int kexec_smp_send_park(void)
+{
+ unsigned long cpu;
+
+ if (WARN_ON(!kexec_in_progress)) {
+ pr_crit("%s called not in kexec progress.", __func__);
+ return -EPERM;
+ }
+
+ if (mmap_cpu_park_mem() != 0) {
+ pr_info("no cpuparkmem, goto normal way.");
+ return -EPERM;
+ }
+
+ local_irq_disable();
+
+ if (num_online_cpus() > 1) {
+ cpumask_t mask;
+
+ cpumask_copy(&mask, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &mask);
+
+ for_each_cpu(cpu, &mask)
+ install_cpu_park(cpu);
+ smp_cross_call(&mask, IPI_CPU_STOP);
+
+ /* Wait for other CPUs to park */
+ for_each_cpu(cpu, &mask)
+ cpu_wait_park(cpu);
+ pr_info("smp park other cpus done\n");
+ }
+
+ sdei_mask_local_cpu();
+
+ return 0;
+}
+#endif
+
#ifdef CONFIG_KEXEC_CORE
void crash_smp_send_stop(void)
{
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 794f992..d01259c 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -236,6 +236,57 @@ static void __init fdt_enforce_memory_region(void)
memblock_add(usable_rgns[1].base, usable_rgns[1].size);
}
+#ifdef CONFIG_ARM64_CPU_PARK
+struct cpu_park_info park_info = {
+ .start = 0,
+ .len = PARK_SECTION_SIZE * NR_CPUS,
+ .start_v = 0,
+};
+
+static int __init parse_park_mem(char *p)
+{
+ if (!p)
+ return 0;
+
+ park_info.start = PAGE_ALIGN(memparse(p, NULL));
+ if (park_info.start == 0)
+ pr_info("cpu park mem params[%s]", p);
+
+ return 0;
+}
+early_param("cpuparkmem", parse_park_mem);
+
+static int __init reserve_park_mem(void)
+{
+ if (park_info.start == 0 || park_info.len == 0)
+ return 0;
+
+ park_info.start = PAGE_ALIGN(park_info.start);
+ park_info.len = PAGE_ALIGN(park_info.len);
+
+ if (!memblock_is_region_memory(park_info.start, park_info.len)) {
+ pr_warn("cannot reserve park mem: region is not memory!");
+ goto out;
+ }
+
+ if (memblock_is_region_reserved(park_info.start, park_info.len)) {
+ pr_warn("cannot reserve park mem: region overlaps reserved memory!");
+ goto out;
+ }
+
+ memblock_remove(park_info.start, park_info.len);
+ pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld MB)",
+ park_info.start, park_info.start + park_info.len,
+ park_info.len >> 20);
+
+ return 0;
+out:
+ park_info.start = 0;
+ park_info.len = 0;
+ return -EINVAL;
+}
+#endif
+
void __init arm64_memblock_init(void)
{
const s64 linear_region_size = BIT(vabits_actual - 1);
@@ -357,6 +408,10 @@ void __init arm64_memblock_init(void)
reserve_crashkernel();
+#ifdef CONFIG_ARM64_CPU_PARK
+ reserve_park_mem();
+#endif
+
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
--
2.9.5