From: Sang Yan <sangyan@huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
Introduce a CPU PARK feature to reduce the time spent taking CPUs
down and bringing them up during kexec: taking one CPU down may
cost 250ms, and bringing it back up another 30ms.
As a result, on a 128-core system, taking CPUs down and up during
kexec costs more than 30 seconds, and it only gets worse with 256
cores or more.
CPU PARK is a state in which a CPU stays powered on and spins in a
loop, polling for a chance to exit, such as its exit address being
written.
A block of memory is reserved and filled with the cpu park text
section plus an exit address and a park-magic flag for each CPU;
in this implementation, one park section is reserved per CPU core.
In machine_shutdown(), CPUs enter the park state instead of being
taken down; in smp_init(), they leave the park state instead of
going through the normal bring-up.
Layout of one cpu park section in the pre-reserved memory block:
+--------------+
+ exit address +
+--------------+
+ park magic   +
+--------------+
+ park codes   +
+      .       +
+      .       +
+      .       +
+--------------+
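For example (a sketch based on the cpuparkmem early_param added
below; the address is only illustrative and must point at usable
system memory), the park region is passed on the kernel command
line:

    cpuparkmem=0x40000000

parse_park_mem() records the address at boot, and reserve_park_mem()
then removes PARK_SECTION_SIZE * NR_CPUS bytes at that address from
memblock.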
Signed-off-by: Sang Yan <sangyan@huawei.com>
---
 arch/arm64/Kconfig                |  12 ++
 arch/arm64/include/asm/kexec.h    |   6 +
 arch/arm64/include/asm/smp.h      |  15 +++
 arch/arm64/kernel/Makefile        |   1 +
 arch/arm64/kernel/cpu-park.S      |  59 ++++++++++
 arch/arm64/kernel/machine_kexec.c |   2 +-
 arch/arm64/kernel/process.c       |   4 +
 arch/arm64/kernel/smp.c           | 229 ++++++++++++++++++++++++++++++++++++++
 arch/arm64/mm/init.c              |  55 +++++++++
 9 files changed, 382 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kernel/cpu-park.S
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b9c5654..0885668 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -345,6 +345,18 @@ config KASAN_SHADOW_OFFSET
 	default 0xeffffff900000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS
 	default 0xffffffffffffffff
 
+config ARM64_CPU_PARK
+	bool "Support CPU PARK on kexec"
+	depends on SMP
+	depends on KEXEC_CORE
+	help
+	  This enables support for the CPU PARK feature, which
+	  reduces the time spent taking cpus down and bringing
+	  them up across kexec. Instead of dying before jumping
+	  to the new kernel, parked cpus spin in a loop and jump
+	  out of the loop to the new kernel's entry point in
+	  smp_init().
+
 source "arch/arm64/Kconfig.platforms"
 
 menu "Kernel Features"
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 79909ae..a133889 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -36,6 +36,11 @@
 
 #define CRASH_ADDR_HIGH_MAX	MEMBLOCK_ALLOC_ACCESSIBLE
 
+#ifdef CONFIG_ARM64_CPU_PARK
+/* CPU park state flag: "park" */
+#define PARK_MAGIC 0x7061726b
+#endif
+
 #ifndef __ASSEMBLY__
 
 /**
@@ -104,6 +109,7 @@ static inline void crash_post_resume(void) {}
 #ifdef CONFIG_KEXEC_CORE
 extern void __init reserve_crashkernel(void);
 #endif
+void machine_kexec_mask_interrupts(void);
 
 #ifdef CONFIG_KEXEC_FILE
 #define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2e7f529..8c5d2d6 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -145,6 +145,21 @@ bool cpus_are_stuck_in_kernel(void);
 
 extern void crash_smp_send_stop(void);
 extern bool smp_crash_stop_failed(void);
+#ifdef CONFIG_ARM64_CPU_PARK
+#define PARK_SECTION_SIZE 1024
+struct cpu_park_info {
+	/* Physical address of reserved park memory. */
+	unsigned long start;
+	/* Length of reserved park memory: PARK_SECTION_SIZE * NR_CPUS. */
+	unsigned long len;
+	/* Virtual address of reserved park memory. */
+	unsigned long start_v;
+};
+extern struct cpu_park_info park_info;
+extern void enter_cpu_park(unsigned long text, unsigned long exit);
+extern void do_cpu_park(unsigned long exit);
+extern int kexec_smp_send_park(void);
+#endif
 
 #endif /* ifndef __ASSEMBLY__ */
 
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2621d5c..60478d2 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_RANDOMIZE_BASE)		+= kaslr.o
 obj-$(CONFIG_HIBERNATION)		+= hibernate.o hibernate-asm.o
 obj-$(CONFIG_KEXEC_CORE)		+= machine_kexec.o relocate_kernel.o	\
 					   cpu-reset.o
+obj-$(CONFIG_ARM64_CPU_PARK)		+= cpu-park.o
 obj-$(CONFIG_KEXEC_FILE)		+= machine_kexec_file.o kexec_image.o
 obj-$(CONFIG_ARM64_RELOC_TEST)		+= arm64-reloc-test.o
 arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S
new file mode 100644
index 0000000..10c685c
--- /dev/null
+++ b/arch/arm64/kernel/cpu-park.S
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CPU park routines
+ *
+ * Copyright (C) 2020 Huawei Technologies Co., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/kexec.h>
+#include <asm/sysreg.h>
+#include <asm/virt.h>
+
+.text
+.pushsection    .idmap.text, "awx"
+
+/* cpu park helper in idmap section */
+SYM_CODE_START(enter_cpu_park)
+	/* Clear sctlr_el1 flags. */
+	mrs	x12, sctlr_el1
+	mov_q	x13, SCTLR_ELx_FLAGS
+	bic	x12, x12, x13
+	pre_disable_mmu_workaround
+	msr	sctlr_el1, x12		/* disable mmu */
+	isb
+
+	mov	x18, x0
+	mov	x0, x1			/* secondary_entry addr */
+	br	x18			/* call do_cpu_park of each cpu */
+SYM_CODE_END(enter_cpu_park)
+
+.popsection
+
+SYM_CODE_START(do_cpu_park)
+	ldr	x18, =PARK_MAGIC	/* magic number "park" */
+	add	x1, x0, #8
+	str	x18, [x1]		/* set on-park flag */
+	dc	civac, x1		/* flush cache of "park" */
+	dsb     nsh
+	isb
+
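+	/*
+	 * Wait in WFE until write_park_exit() writes the exit
+	 * address and issues SEV, then re-check the address.
+	 */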
+.Lloop:
+	wfe
+	isb
+	ldr	x19, [x0]
+	cmp	x19, #0			/* test secondary_entry */
+	b.eq	.Lloop
+
+	ic	iallu			/* invalidate the local I-cache */
+	dsb	nsh
+	isb
+
+	br	x19			/* jump to secondary_entry */
+SYM_CODE_END(do_cpu_park)
+
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index a0b144c..f47ce96 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -213,7 +213,7 @@ void machine_kexec(struct kimage *kimage)
 	BUG(); /* Should never get here. */
 }
 
-static void machine_kexec_mask_interrupts(void)
+void machine_kexec_mask_interrupts(void)
 {
 	unsigned int i;
 	struct irq_desc *desc;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 73e3b32..10cffee 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -146,6 +146,10 @@ void arch_cpu_idle_dead(void)
  */
 void machine_shutdown(void)
 {
+#ifdef CONFIG_ARM64_CPU_PARK
+	if (kexec_smp_send_park() == 0)
+		return;
+#endif
 	smp_shutdown_nonboot_cpus(reboot_cpu);
 }
 
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 18e9727..bc475d5 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -32,6 +32,8 @@
 #include <linux/irq_work.h>
 #include <linux/kernel_stat.h>
 #include <linux/kexec.h>
+#include <linux/console.h>
+
 #include <linux/kvm_host.h>
 
 #include <asm/alternative.h>
@@ -93,6 +95,167 @@ static inline int op_cpu_kill(unsigned int cpu)
 }
 #endif
 
+#ifdef CONFIG_ARM64_CPU_PARK
+struct cpu_park_section {
+	unsigned long exit;	/* exit address of park loop */
+	unsigned long magic;	/* magic representing park state */
+	char text[0];		/* text section of park */
+};
+
+static int mmap_cpu_park_mem(void)
+{
+	if (!park_info.start)
+		return -ENOMEM;
+
+	if (park_info.start_v)
+		return 0;
+
+	park_info.start_v = (unsigned long)__ioremap(park_info.start,
+						     park_info.len,
+						     PAGE_KERNEL_EXEC);
+	if (!park_info.start_v) {
+		pr_warn("failed to map park memory.");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
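+/*
+ * Park sections are indexed from cpu 1; the boot cpu (cpu 0)
+ * is never parked and owns no section.
+ */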
+static inline unsigned long cpu_park_section_v(unsigned int cpu)
+{
+	return park_info.start_v + PARK_SECTION_SIZE * (cpu - 1);
+}
+
+static inline unsigned long cpu_park_section_p(unsigned int cpu)
+{
+	return park_info.start + PARK_SECTION_SIZE * (cpu - 1);
+}
+
+/*
+ * Write secondary_entry to the exit field of the cpu's park
+ * section. The parked cpu will then jump straight into the
+ * kernel at secondary_entry.
+ */
+static int write_park_exit(unsigned int cpu)
+{
+	struct cpu_park_section *park_section;
+	unsigned long *park_exit;
+	unsigned long *park_text;
+
+	if (mmap_cpu_park_mem() != 0)
+		return -EPERM;
+
+	park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+	park_exit = &park_section->exit;
+	park_text = (unsigned long *)park_section->text;
+	pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx",
+		 (unsigned long)park_text, *park_text,
+		 (unsigned long)do_cpu_park,
+		 *(unsigned long *)do_cpu_park);
+
+	/*
+	 * Compare the first 8 bytes with do_cpu_park to check
+	 * that the park text is installed before writing the exit.
+	 */
+	if (*park_text == *(unsigned long *)do_cpu_park) {
+		writeq_relaxed(__pa_symbol(secondary_entry), park_exit);
+		__flush_dcache_area((__force void *)park_exit,
+				    sizeof(unsigned long));
+		flush_icache_range((unsigned long)park_exit,
+				   (unsigned long)(park_exit + 1));
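+		/* Wake the parked cpu out of WFE in do_cpu_park(). */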
+		sev();
+		dsb(sy);
+		isb();
+
+		pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.",
+			cpu, *park_exit, (unsigned long)park_exit);
+		pr_info("Boot cpu %u from PARK state.", cpu);
+		return 0;
+	}
+
+	return -EPERM;
+}
+
+/* Install the cpu park section for the given cpu. */
+static int install_cpu_park(unsigned int cpu)
+{
+	struct cpu_park_section *park_section;
+	unsigned long *park_exit;
+	unsigned long *park_magic;
+	unsigned long park_text_len;
+
+	park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+	pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx",
+		 cpu, (unsigned long)park_section,
+		 (unsigned long)(park_section->text));
+
+	park_exit = &park_section->exit;
+	park_magic = &park_section->magic;
+	park_text_len = PARK_SECTION_SIZE - sizeof(struct cpu_park_section);
+
+	*park_exit = 0UL;
+	*park_magic = 0UL;
+	memcpy((void *)park_section->text, do_cpu_park, park_text_len);
+	__flush_dcache_area((void *)park_section, PARK_SECTION_SIZE);
+
+	return 0;
+}
+
+static int uninstall_cpu_park(unsigned int cpu)
+{
+	unsigned long park_section;
+
+	if (mmap_cpu_park_mem() != 0)
+		return -EPERM;
+
+	park_section = cpu_park_section_v(cpu);
+	memset((void *)park_section, 0, PARK_SECTION_SIZE);
+	__flush_dcache_area((void *)park_section, PARK_SECTION_SIZE);
+
+	return 0;
+}
+
+static int cpu_wait_park(unsigned int cpu)
+{
+	long timeout;
+	struct cpu_park_section *park_section;
+
+	volatile unsigned long *park_magic;
+
+	park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+	park_magic = &park_section->magic;
+
+	timeout = USEC_PER_SEC;
+	while (*park_magic != PARK_MAGIC && timeout--)
+		udelay(1);
+
+	if (timeout > 0)
+		pr_debug("cpu %u park done.", cpu);
+	else
+		pr_err("cpu %u park failed.", cpu);
+
+	return *park_magic == PARK_MAGIC;
+}
+
+static void cpu_park(unsigned int cpu)
+{
+	unsigned long park_section_p;
+	unsigned long park_exit_phy;
+	unsigned long do_park;
+	typeof(enter_cpu_park) *park;
+
+	park_section_p = cpu_park_section_p(cpu);
+	park_exit_phy = park_section_p;
+	pr_debug("Go to park cpu %u exit address 0x%lx", cpu, park_exit_phy);
+
+	do_park = park_section_p + sizeof(struct cpu_park_section);
+	park = (void *)__pa_symbol(enter_cpu_park);
+
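+	/*
+	 * enter_cpu_park() runs from the idmap section so it can
+	 * turn the MMU off before branching to the park text.
+	 */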
+	cpu_install_idmap();
+	park(do_park, park_exit_phy);
+	unreachable();
+}
+#endif
 
 /*
  * Boot a secondary CPU, and assign it the specified idle task.
@@ -102,6 +265,10 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
 	const struct cpu_operations *ops = get_cpu_ops(cpu);
 
+#ifdef CONFIG_ARM64_CPU_PARK
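+	/* A parked cpu is booted by writing its exit address. */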
+	if (write_park_exit(cpu) == 0)
+		return 0;
+#endif
 	if (ops->cpu_boot)
 		return ops->cpu_boot(cpu);
 
@@ -131,6 +298,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 		return ret;
 	}
 
+#ifdef CONFIG_ARM64_CPU_PARK
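+	/* The cpu is up; clear its park section so it is not reused. */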
+	uninstall_cpu_park(cpu);
+#endif
 	/*
 	 * CPU was successfully started, wait for it to come online or
 	 * time out.
@@ -844,10 +1014,30 @@ void arch_irq_work_raise(void)
 
 static void local_cpu_stop(void)
 {
+	int cpu;
+	const struct cpu_operations *ops = NULL;
+
 	set_cpu_online(smp_processor_id(), false);
 
 	local_daif_mask();
 	sdei_mask_local_cpu();
+
+#ifdef CONFIG_ARM64_CPU_PARK
+	/*
+	 * Go to the cpu park state while kexec is in progress.
+	 * Otherwise the cpu stops in the loop below as usual.
+	 */
+	cpu = smp_processor_id();
+	if (kexec_in_progress && park_info.start_v) {
+		machine_kexec_mask_interrupts();
+		cpu_park(cpu);
+
+		ops = get_cpu_ops(cpu);
+		if (ops && ops->cpu_die)
+			ops->cpu_die(cpu);
+	}
+#endif
+
 	cpu_park_loop();
 }
 
@@ -1053,6 +1243,45 @@ void smp_send_stop(void)
 	sdei_mask_local_cpu();
 }
 
+#ifdef CONFIG_ARM64_CPU_PARK
+int kexec_smp_send_park(void)
+{
+	unsigned long cpu;
+
+	if (WARN_ON(!kexec_in_progress)) {
+		pr_crit("%s called while kexec is not in progress.", __func__);
+		return -EPERM;
+	}
+
+	if (mmap_cpu_park_mem() != 0) {
+		pr_info("no cpuparkmem, falling back to normal cpu shutdown.");
+		return -EPERM;
+	}
+
+	local_irq_disable();
+
+	if (num_online_cpus() > 1) {
+		cpumask_t mask;
+
+		cpumask_copy(&mask, cpu_online_mask);
+		cpumask_clear_cpu(smp_processor_id(), &mask);
+
+		for_each_cpu(cpu, &mask)
+			install_cpu_park(cpu);
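+		/*
+		 * Reuse IPI_CPU_STOP; local_cpu_stop() diverts cpus
+		 * into the park loop while kexec is in progress.
+		 */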
+		smp_cross_call(&mask, IPI_CPU_STOP);
+
+		/* Wait for other CPUs to park */
+		for_each_cpu(cpu, &mask)
+			cpu_wait_park(cpu);
+		pr_info("smp park other cpus done\n");
+	}
+
+	sdei_mask_local_cpu();
+
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_KEXEC_CORE
 void crash_smp_send_stop(void)
 {
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 794f992..d01259c 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -236,6 +236,57 @@ static void __init fdt_enforce_memory_region(void)
 		memblock_add(usable_rgns[1].base, usable_rgns[1].size);
 }
 
+#ifdef CONFIG_ARM64_CPU_PARK
+struct cpu_park_info park_info = {
+	.start = 0,
+	.len = PARK_SECTION_SIZE * NR_CPUS,
+	.start_v = 0,
+};
+
+static int __init parse_park_mem(char *p)
+{
+	if (!p)
+		return 0;
+
+	park_info.start = PAGE_ALIGN(memparse(p, NULL));
+	if (park_info.start == 0)
+		pr_warn("invalid cpu park mem params [%s]", p);
+
+	return 0;
+}
+early_param("cpuparkmem", parse_park_mem);
+
+static int __init reserve_park_mem(void)
+{
+	if (park_info.start == 0 || park_info.len == 0)
+		return 0;
+
+	park_info.start = PAGE_ALIGN(park_info.start);
+	park_info.len = PAGE_ALIGN(park_info.len);
+
+	if (!memblock_is_region_memory(park_info.start, park_info.len)) {
+		pr_warn("cannot reserve park mem: region is not memory!");
+		goto out;
+	}
+
+	if (memblock_is_region_reserved(park_info.start, park_info.len)) {
+		pr_warn("cannot reserve park mem: region overlaps reserved memory!");
+		goto out;
+	}
+
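+	/*
+	 * Remove, rather than reserve, the region: the parked cpus
+	 * keep executing from it across kexec, so the kernel must
+	 * never map or allocate it.
+	 */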
+	memblock_remove(park_info.start, park_info.len);
+	pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%lu MB)",
+		park_info.start, park_info.start + park_info.len,
+		park_info.len >> 20);
+
+	return 0;
+out:
+	park_info.start = 0;
+	park_info.len = 0;
+	return -EINVAL;
+}
+#endif
+
 void __init arm64_memblock_init(void)
 {
 	const s64 linear_region_size = BIT(vabits_actual - 1);
@@ -357,6 +408,10 @@ void __init arm64_memblock_init(void)
 
 	reserve_crashkernel();
 
+#ifdef CONFIG_ARM64_CPU_PARK
+	reserve_park_mem();
+#endif
+
 	reserve_elfcorehdr();
 
 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
-- 
2.9.5