We introduce several features to support seamless kernel upgrade:

1. pin memory function for checkpoint and restore
2. pid reserve function for checkpoint and restore
3. cpu park to accelerate taking CPUs down and bringing them back up
4. quick kexec to accelerate relocating kernel images
5. legacy pmem support for arm64
-----------------------------------
Jingxian He (2):
  mm: add pin memory method for checkpoint add restore
  pid: add pid reserve method for checkpoint and recover

Sang Yan (4):
  kexec: Add quick kexec support for kernel
  arm64: Reserve memory for quick kexec
  arm64: smp: Add support for cpu park
  config: enable kernel hotupgrade features by default

ZhuLing (1):
  arm64: Add memmap parameter and register pmem
 arch/Kconfig                           |   10 +
 arch/arm64/Kconfig                     |   33 +
 arch/arm64/configs/openeuler_defconfig |    8 +
 arch/arm64/include/asm/kexec.h         |    6 +
 arch/arm64/include/asm/smp.h           |   15 +
 arch/arm64/kernel/Makefile             |    2 +
 arch/arm64/kernel/cpu-park.S           |   58 ++
 arch/arm64/kernel/machine_kexec.c      |    2 +-
 arch/arm64/kernel/pmem.c               |   35 +
 arch/arm64/kernel/process.c            |    4 +
 arch/arm64/kernel/setup.c              |   26 +
 arch/arm64/kernel/smp.c                |  231 +++++
 arch/arm64/mm/init.c                   |  258 ++++++
 drivers/char/Kconfig                   |    7 +
 drivers/char/Makefile                  |    1 +
 drivers/char/pin_memory.c              |  209 +++++
 drivers/nvdimm/Kconfig                 |    5 +
 drivers/nvdimm/Makefile                |    2 +-
 fs/proc/task_mmu.c                     |  138 +++
 include/linux/crash_core.h             |    5 +
 include/linux/ioport.h                 |    1 +
 include/linux/kexec.h                  |   24 +-
 include/linux/pin_mem.h                |   99 ++
 include/uapi/linux/kexec.h             |    1 +
 kernel/crash_core.c                    |   11 +
 kernel/kexec.c                         |   10 +
 kernel/kexec_core.c                    |   42 +-
 kernel/pid.c                           |   10 +
 mm/Kconfig                             |   18 +
 mm/Makefile                            |    1 +
 mm/huge_memory.c                       |   63 ++
 mm/memory.c                            |   65 ++
 mm/pin_mem.c                           | 1142 ++++++++++++++++++++++++
 33 files changed, 2527 insertions(+), 15 deletions(-)
 create mode 100644 arch/arm64/kernel/cpu-park.S
 create mode 100644 arch/arm64/kernel/pmem.c
 create mode 100644 drivers/char/pin_memory.c
 create mode 100644 include/linux/pin_mem.h
 create mode 100644 mm/pin_mem.c
From: Sang Yan <sangyan@huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
In normal kexec, relocating the kernel may cost 5 ~ 10 seconds, because all segments have to be copied from vmalloc'd memory to the kernel boot memory with the MMU disabled.

We introduce quick kexec to save this copy time, in the same way kdump (kexec on crash) does, by using a reserved memory region named "Quick kexec".

To enable it, memory has to be reserved and quick_kexec_res set up.

The quick kimage is constructed the same way as the crash kernel image, and all of its segments are simply copied into the reserved memory.

The kexec_load syscall also gains this support through the new KEXEC_QUICK flag.
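For illustration, a minimal userspace sketch of loading an image with the new flag through the raw kexec_load syscall; only the KEXEC_QUICK value comes from this patch, while entry, segs and nr_segs are placeholders that a real loader (e.g. kexec-tools) would build from the kernel Image and dtb:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/kexec.h>

#ifndef KEXEC_QUICK
#define KEXEC_QUICK 0x00000004	/* new flag added by this patch */
#endif

/*
 * Load the segments with KEXEC_QUICK so they are copied into the
 * reserved "Quick kexec" region at load time, instead of being
 * relocated with the MMU off at reboot time.
 */
static long quick_kexec_load(unsigned long entry,
			     struct kexec_segment *segs,
			     unsigned long nr_segs)
{
	return syscall(SYS_kexec_load, entry, nr_segs, segs,
		       KEXEC_ARCH_DEFAULT | KEXEC_QUICK);
}

Rebooting into the loaded image still goes through the normal kexec reboot path; only the load-time destination of the segments changes.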
Signed-off-by: Sang Yan <sangyan@huawei.com>
Reviewed-by: Kuohai Xu <xukuohai@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/Kconfig               | 10 +++++++++
 include/linux/ioport.h     |  1 +
 include/linux/kexec.h      | 24 ++++++++++++++++++----
 include/uapi/linux/kexec.h |  1 +
 kernel/kexec.c             | 10 +++++++++
 kernel/kexec_core.c        | 42 ++++++++++++++++++++++++++++++--------
 6 files changed, 75 insertions(+), 13 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig index f2d65d38f119..2bc2dea38ea4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -18,6 +18,16 @@ config KEXEC_CORE select CRASH_CORE bool
+config QUICK_KEXEC + bool "Support for quick kexec" + depends on KEXEC_CORE + help + It uses pre-reserved memory to accelerate kexec, just like + crash kexec, loads new kernel and initrd to reserved memory, + and boots new kernel on that memory. It will save the time + of relocating kernel. + + config KEXEC_ELF bool
diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5135d4b86cd6..84a716fd6029 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -139,6 +139,7 @@ enum { IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, IORES_DESC_RESERVED = 7, IORES_DESC_SOFT_RESERVED = 8, + IORES_DESC_QUICK_KEXEC = 9, };
/* diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6aca2e51fd8e..bbd8b54753bf 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -269,9 +269,10 @@ struct kimage { unsigned long control_page;
/* Flags to indicate special processing */ - unsigned int type : 1; + unsigned int type : 2; #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 +#define KEXEC_TYPE_QUICK 2 unsigned int preserve_context : 1; /* If set, we are using file mode kexec syscall */ unsigned int file_mode:1; @@ -338,12 +339,24 @@ extern int kexec_load_disabled; #endif
/* List of defined/legal kexec flags */ -#ifndef CONFIG_KEXEC_JUMP -#define KEXEC_FLAGS KEXEC_ON_CRASH +#define __KEXEC_FLAGS_CRASH KEXEC_ON_CRASH + +#ifdef CONFIG_KEXEC_JUMP +#define __KEXEC_FLAGS_JUMP KEXEC_PRESERVE_CONTEXT +#else +#define __KEXEC_FLAGS_JUMP 0 +#endif + +#ifdef CONFIG_QUICK_KEXEC +#define __KEXEC_FLAGS_QUICK KEXEC_QUICK #else -#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT) +#define __KEXEC_FLAGS_QUICK 0 #endif
+#define KEXEC_FLAGS \ + (__KEXEC_FLAGS_CRASH | __KEXEC_FLAGS_JUMP | __KEXEC_FLAGS_QUICK) + + /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) @@ -351,6 +364,9 @@ extern int kexec_load_disabled; /* Location of a reserved region to hold the crash kernel. */ extern note_buf_t __percpu *crash_notes; +#ifdef CONFIG_QUICK_KEXEC +extern struct resource quick_kexec_res; +#endif
/* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 05669c87a0af..d891d8009a17 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -12,6 +12,7 @@ /* kexec flags for different usage scenarios */ #define KEXEC_ON_CRASH 0x00000001 #define KEXEC_PRESERVE_CONTEXT 0x00000002 +#define KEXEC_QUICK 0x00000004 #define KEXEC_ARCH_MASK 0xffff0000
/* diff --git a/kernel/kexec.c b/kernel/kexec.c index c82c6c06f051..4acc909940f7 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -44,6 +44,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, int ret; struct kimage *image; bool kexec_on_panic = flags & KEXEC_ON_CRASH; +#ifdef CONFIG_QUICK_KEXEC + bool kexec_on_quick = flags & KEXEC_QUICK; +#endif
if (kexec_on_panic) { /* Verify we have a valid entry point */ @@ -69,6 +72,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, image->type = KEXEC_TYPE_CRASH; }
+#ifdef CONFIG_QUICK_KEXEC + if (kexec_on_quick) { + image->control_page = quick_kexec_res.start; + image->type = KEXEC_TYPE_QUICK; + } +#endif + ret = sanity_check_segment_list(image); if (ret) goto out_free_image; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 255ec90c6806..b9a6f4658f89 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -52,6 +52,17 @@ note_buf_t __percpu *crash_notes; /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false;
+/* Resource for quick kexec */ +#ifdef CONFIG_QUICK_KEXEC +struct resource quick_kexec_res = { + .name = "Quick kexec", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_QUICK_KEXEC +}; +#endif + int kexec_should_crash(struct task_struct *p) { /* @@ -395,8 +406,9 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, return pages; }
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image, - unsigned int order) +static struct page *kimage_alloc_special_control_pages(struct kimage *image, + unsigned int order, + unsigned long end) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages @@ -426,7 +438,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, size = (1 << order) << PAGE_SHIFT; hole_start = (image->control_page + (size - 1)) & ~(size - 1); hole_end = hole_start + size - 1; - while (hole_end <= crashk_res.end) { + while (hole_end <= end) { unsigned long i;
cond_resched(); @@ -461,7 +473,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, return pages; }
- struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) { @@ -472,8 +483,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image, pages = kimage_alloc_normal_control_pages(image, order); break; case KEXEC_TYPE_CRASH: - pages = kimage_alloc_crash_control_pages(image, order); + pages = kimage_alloc_special_control_pages(image, order, + crashk_res.end); + break; +#ifdef CONFIG_QUICK_KEXEC + case KEXEC_TYPE_QUICK: + pages = kimage_alloc_special_control_pages(image, order, + quick_kexec_res.end); break; +#endif }
return pages; @@ -829,11 +847,12 @@ static int kimage_load_normal_segment(struct kimage *image, return result; }
-static int kimage_load_crash_segment(struct kimage *image, +static int kimage_load_special_segment(struct kimage *image, struct kexec_segment *segment) { - /* For crash dumps kernels we simply copy the data from - * user space to it's destination. + /* + * For crash dumps kernels and quick kexec kernels + * we simply copy the data from user space to it's destination. * We do things a page at a time for the sake of kmap. */ unsigned long maddr; @@ -907,8 +926,13 @@ int kimage_load_segment(struct kimage *image, result = kimage_load_normal_segment(image, segment); break; case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_special_segment(image, segment); break; +#ifdef CONFIG_QUICK_KEXEC + case KEXEC_TYPE_QUICK: + result = kimage_load_special_segment(image, segment); + break; +#endif }
return result;
From: Sang Yan <sangyan@huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
Reserve memory for quick kexec on arm64 via the "quickkexec=" command-line parameter (for example, quickkexec=512M reserves 512 MB for the quick kexec image).
Signed-off-by: Sang Yan <sangyan@huawei.com>
Reviewed-by: Chen Wandun <chenwandun@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/kernel/setup.c |  6 ++++++
 arch/arm64/mm/init.c      | 42 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index f89dbb3f9eea..b0fe635e943e 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -266,6 +266,12 @@ static void __init request_standard_resources(void) if (crashk_res.end && crashk_res.start >= res->start && crashk_res.end <= res->end) request_resource(res, &crashk_res); +#endif +#ifdef CONFIG_QUICK_KEXEC + if (quick_kexec_res.end && + quick_kexec_res.start >= res->start && + quick_kexec_res.end <= res->end) + request_resource(res, &quick_kexec_res); #endif } } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d5bcc6dffdee..a3edb18a22f1 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -128,6 +128,45 @@ static void __init reserve_elfcorehdr(void) } #endif /* CONFIG_CRASH_DUMP */
+#ifdef CONFIG_QUICK_KEXEC +static int __init parse_quick_kexec(char *p) +{ + if (!p) + return 0; + + quick_kexec_res.end = PAGE_ALIGN(memparse(p, NULL)); + + return 0; +} +early_param("quickkexec", parse_quick_kexec); + +static void __init reserve_quick_kexec(void) +{ + unsigned long long mem_start, mem_len; + + mem_len = quick_kexec_res.end; + if (mem_len == 0) + return; + + /* Current arm64 boot protocol requires 2MB alignment */ + mem_start = memblock_find_in_range(0, arm64_dma_phys_limit, + mem_len, SZ_2M); + if (mem_start == 0) { + pr_warn("cannot allocate quick kexec mem (size:0x%llx)\n", + mem_len); + quick_kexec_res.end = 0; + return; + } + + memblock_reserve(mem_start, mem_len); + pr_info("quick kexec mem reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + mem_start, mem_start + mem_len, mem_len >> 20); + + quick_kexec_res.start = mem_start; + quick_kexec_res.end = mem_start + mem_len - 1; +} +#endif + /* * Return the maximum physical address for a zone with a given address size * limit. It currently assumes that for memory starting above 4G, 32-bit @@ -415,6 +454,9 @@ void __init bootmem_init(void) */ reserve_crashkernel();
+#ifdef CONFIG_QUICK_KEXEC + reserve_quick_kexec(); +#endif memblock_dump_all(); }
From: Sang Yan <sangyan@huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
Introduce a CPU PARK feature to save the time spent taking CPUs down and bringing them back up during kexec: taking one CPU down may cost about 250 ms, and bringing it up about 30 ms.

For 128 cores that adds up to more than 30 seconds (roughly 128 * (250 ms + 30 ms) ~ 36 s) spent cycling CPUs during kexec, and it only gets worse with 256 cores or more.

CPU PARK is a state in which a CPU stays powered on in a spin loop, polling for an exit condition such as an exit address being written.
A block of memory is reserved and filled, for each CPU, with the cpu park text section, an exit address and a park magic flag; each CPU core gets its own park section within the reserved block.

In machine_shutdown(), CPUs go into the park state instead of being taken offline; in smp_init(), they come out of the park state instead of going through the normal bring-up path.
Layout of one cpu park section in the pre-reserved memory block:

    +--------------+
    + exit address +
    +--------------+
    +  park magic  +
    +--------------+
    +  park codes  +
    +      .       +
    +      .       +
    +      .       +
    +--------------+
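For illustration only, the behaviour of a parked CPU (implemented in assembly by do_cpu_park() with the MMU off and explicit cache maintenance) corresponds roughly to the following C sketch; struct cpu_park_section and PARK_MAGIC are taken from this patch, while the function name here is made up:

#define PARK_MAGIC 0x7061726b		/* ASCII "park" */

struct cpu_park_section {
	unsigned long exit;		/* exit address, filled in by the new kernel */
	unsigned long magic;		/* set to PARK_MAGIC once the CPU is parked */
	char text[];			/* copy of the do_cpu_park code */
};

/* What each parked CPU does inside its own park section. */
static void parked_cpu_wait(struct cpu_park_section *sec)
{
	sec->magic = PARK_MAGIC;	/* tell kexec_smp_send_park() we are parked */

	while (sec->exit == 0)		/* poll until write_park_exit() in the new */
		;			/* kernel stores the secondary_entry address */

	((void (*)(void))sec->exit)();	/* jump straight into the new kernel */
}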
Signed-off-by: Sang Yan <sangyan@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/Kconfig                |  12 ++
 arch/arm64/include/asm/kexec.h    |   6 +
 arch/arm64/include/asm/smp.h      |  15 ++
 arch/arm64/kernel/Makefile        |   1 +
 arch/arm64/kernel/cpu-park.S      |  58 ++++++++
 arch/arm64/kernel/machine_kexec.c |   2 +-
 arch/arm64/kernel/process.c       |   4 +
 arch/arm64/kernel/smp.c           | 231 ++++++++++++++++++++++++++
 arch/arm64/mm/init.c              |  64 +++++++++
 9 files changed, 392 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kernel/cpu-park.S
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 109d8a18f268..a86cdfab8b39 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1229,6 +1229,18 @@ config CRASH_DUMP
For more details see Documentation/admin-guide/kdump/kdump.rst
+config ARM64_CPU_PARK + bool "Support CPU PARK on kexec" + depends on SMP + depends on KEXEC_CORE + help + This enables support for CPU PARK feature in + order to save time of cpu down to up. + CPU park is a state through kexec, spin loop + instead of cpu die before jumping to new kernel, + jumping out from loop to new kernel entry in + smp_init. + config XEN_DOM0 def_bool y depends on XEN diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index ea67a4d6dd6e..20bee23b6503 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -32,6 +32,11 @@
#define CRASH_ADDR_HIGH_MAX MEMBLOCK_ALLOC_ACCESSIBLE
+#ifdef CONFIG_ARM64_CPU_PARK +/* CPU park state flag: "park" */ +#define PARK_MAGIC 0x7061726b +#endif + #ifndef __ASSEMBLY__
/** @@ -100,6 +105,7 @@ static inline void crash_post_resume(void) {} #ifdef CONFIG_KEXEC_CORE extern void __init reserve_crashkernel(void); #endif +void machine_kexec_mask_interrupts(void);
#ifdef CONFIG_KEXEC_FILE #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 2e7f529ec5a6..8c5d2d650b8a 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -145,6 +145,21 @@ bool cpus_are_stuck_in_kernel(void);
extern void crash_smp_send_stop(void); extern bool smp_crash_stop_failed(void); +#ifdef CONFIG_ARM64_CPU_PARK +#define PARK_SECTION_SIZE 1024 +struct cpu_park_info { + /* Physical address of reserved park memory. */ + unsigned long start; + /* park reserve mem len should be PARK_SECTION_SIZE * NR_CPUS */ + unsigned long len; + /* Virtual address of reserved park memory. */ + unsigned long start_v; +}; +extern struct cpu_park_info park_info; +extern void enter_cpu_park(unsigned long text, unsigned long exit); +extern void do_cpu_park(unsigned long exit); +extern int kexec_smp_send_park(void); +#endif
#endif /* ifndef __ASSEMBLY__ */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index e1a50865def8..169d90f11cf5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ cpu-reset.o +obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S new file mode 100644 index 000000000000..07290dabe10c --- /dev/null +++ b/arch/arm64/kernel/cpu-park.S @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * CPU park routines + * + * Copyright (C) 2020 Huawei Technologies., Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/kexec.h> +#include <asm/sysreg.h> +#include <asm/virt.h> + +.text +.pushsection .idmap.text, "awx" + +/* cpu park helper in idmap section */ +SYM_CODE_START(enter_cpu_park) + /* Clear sctlr_el1 flags. */ + mrs x12, sctlr_el1 + mov_q x13, SCTLR_ELx_FLAGS + bic x12, x12, x13 + pre_disable_mmu_workaround + msr sctlr_el1, x12 /* disable mmu */ + isb + + mov x18, x0 + mov x0, x1 /* secondary_entry addr */ + br x18 /* call do_cpu_park of each cpu */ +SYM_CODE_END(enter_cpu_park) + +.popsection + +SYM_CODE_START(do_cpu_park) + ldr x18, =PARK_MAGIC /* magic number "park" */ + add x1, x0, #8 + str x18, [x1] /* set on-park flag */ + dc civac, x1 /* flush cache of "park" */ + dsb nsh + isb + +.Lloop: + wfe + isb + ldr x19, [x0] + cmp x19, #0 /* test secondary_entry */ + b.eq .Lloop + + ic iallu /* invalidate the local I-cache */ + dsb nsh + isb + + br x19 /* jump to secondary_entry */ +SYM_CODE_END(do_cpu_park) diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index f176def34fd9..53def49c2ea3 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -214,7 +214,7 @@ void machine_kexec(struct kimage *kimage) BUG(); /* Should never get here. */ }
-static void machine_kexec_mask_interrupts(void) +void machine_kexec_mask_interrupts(void) { unsigned int i; struct irq_desc *desc; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index b74eede19483..02ece450a54a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -151,6 +151,10 @@ void arch_cpu_idle_dead(void) */ void machine_shutdown(void) { +#ifdef CONFIG_ARM64_CPU_PARK + if (kexec_smp_send_park() == 0) + return; +#endif smp_shutdown_nonboot_cpus(reboot_cpu); }
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 70c7634433e1..fb6007dab18c 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -32,6 +32,8 @@ #include <linux/irq_work.h> #include <linux/kernel_stat.h> #include <linux/kexec.h> +#include <linux/console.h> + #include <linux/kvm_host.h> #include <linux/perf/arm_pmu.h>
@@ -95,6 +97,167 @@ static inline int op_cpu_kill(unsigned int cpu) } #endif
+#ifdef CONFIG_ARM64_CPU_PARK +struct cpu_park_section { + unsigned long exit; /* exit address of park look */ + unsigned long magic; /* maigc represent park state */ + char text[0]; /* text section of park */ +}; + +static int mmap_cpu_park_mem(void) +{ + if (!park_info.start) + return -ENOMEM; + + if (park_info.start_v) + return 0; + + park_info.start_v = (unsigned long)__ioremap(park_info.start, + park_info.len, + PAGE_KERNEL_EXEC); + if (!park_info.start_v) { + pr_warn("map park memory failed."); + return -ENOMEM; + } + + return 0; +} + +static inline unsigned long cpu_park_section_v(unsigned int cpu) +{ + return park_info.start_v + PARK_SECTION_SIZE * (cpu - 1); +} + +static inline unsigned long cpu_park_section_p(unsigned int cpu) +{ + return park_info.start + PARK_SECTION_SIZE * (cpu - 1); +} + +/* + * Write the secondary_entry to exit section of park state. + * Then the secondary cpu will jump straight into the kernel + * by the secondary_entry. + */ +static int write_park_exit(unsigned int cpu) +{ + struct cpu_park_section *park_section; + unsigned long *park_exit; + unsigned long *park_text; + + if (mmap_cpu_park_mem() != 0) + return -EPERM; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + park_exit = &park_section->exit; + park_text = (unsigned long *)park_section->text; + pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx", + (unsigned long)park_text, *park_text, + (unsigned long)do_cpu_park, + *(unsigned long *)do_cpu_park); + + /* + * Test first 8 bytes to determine + * whether needs to write cpu park exit. + */ + if (*park_text == *(unsigned long *)do_cpu_park) { + writeq_relaxed(__pa_symbol(secondary_entry), park_exit); + __flush_dcache_area((__force void *)park_exit, + sizeof(unsigned long)); + flush_icache_range((unsigned long)park_exit, + (unsigned long)(park_exit + 1)); + sev(); + dsb(sy); + isb(); + + pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.", + cpu, *park_exit, (unsigned long)park_exit); + pr_info("Boot cpu %u from PARK state.", cpu); + return 0; + } + + return -EPERM; +} + +/* Install cpu park sections for the specific cpu. 
*/ +static int install_cpu_park(unsigned int cpu) +{ + struct cpu_park_section *park_section; + unsigned long *park_exit; + unsigned long *park_magic; + unsigned long park_text_len; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx", + cpu, (unsigned long)park_section, + (unsigned long)(park_section->text)); + + park_exit = &park_section->exit; + park_magic = &park_section->magic; + park_text_len = PARK_SECTION_SIZE - sizeof(struct cpu_park_section); + + *park_exit = 0UL; + *park_magic = 0UL; + memcpy((void *)park_section->text, do_cpu_park, park_text_len); + __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); + + return 0; +} + +static int uninstall_cpu_park(unsigned int cpu) +{ + unsigned long park_section; + + if (mmap_cpu_park_mem() != 0) + return -EPERM; + + park_section = cpu_park_section_v(cpu); + memset((void *)park_section, 0, PARK_SECTION_SIZE); + __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); + + return 0; +} + +static int cpu_wait_park(unsigned int cpu) +{ + long timeout; + struct cpu_park_section *park_section; + + volatile unsigned long *park_magic; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + park_magic = &park_section->magic; + + timeout = USEC_PER_SEC; + while (*park_magic != PARK_MAGIC && timeout--) + udelay(1); + + if (timeout > 0) + pr_debug("cpu %u park done.", cpu); + else + pr_err("cpu %u park failed.", cpu); + + return *park_magic == PARK_MAGIC; +} + +static void cpu_park(unsigned int cpu) +{ + unsigned long park_section_p; + unsigned long park_exit_phy; + unsigned long do_park; + typeof(enter_cpu_park) *park; + + park_section_p = cpu_park_section_p(cpu); + park_exit_phy = park_section_p; + pr_debug("Go to park cpu %u exit address 0x%lx", cpu, park_exit_phy); + + do_park = park_section_p + sizeof(struct cpu_park_section); + park = (void *)__pa_symbol(enter_cpu_park); + + cpu_install_idmap(); + park(do_park, park_exit_phy); + unreachable(); +} +#endif
/* * Boot a secondary CPU, and assign it the specified idle task. @@ -104,6 +267,10 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle) { const struct cpu_operations *ops = get_cpu_ops(cpu);
+#ifdef CONFIG_ARM64_CPU_PARK + if (write_park_exit(cpu) == 0) + return 0; +#endif if (ops->cpu_boot) return ops->cpu_boot(cpu);
@@ -139,6 +306,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) */ wait_for_completion_timeout(&cpu_running, msecs_to_jiffies(5000)); +#ifdef CONFIG_ARM64_CPU_PARK + uninstall_cpu_park(cpu); +#endif if (cpu_online(cpu)) return 0;
@@ -854,10 +1024,32 @@ void arch_irq_work_raise(void)
static void local_cpu_stop(void) { +#ifdef CONFIG_ARM64_CPU_PARK + int cpu; + const struct cpu_operations *ops = NULL; +#endif + set_cpu_online(smp_processor_id(), false);
local_daif_mask(); sdei_mask_local_cpu(); + +#ifdef CONFIG_ARM64_CPU_PARK + /* + * Go to cpu park state. + * Otherwise go to cpu die. + */ + cpu = smp_processor_id(); + if (kexec_in_progress && park_info.start_v) { + machine_kexec_mask_interrupts(); + cpu_park(cpu); + + ops = get_cpu_ops(cpu); + if (ops && ops->cpu_die) + ops->cpu_die(cpu); + } +#endif + cpu_park_loop(); }
@@ -1070,6 +1262,45 @@ void smp_send_stop(void) sdei_mask_local_cpu(); }
+#ifdef CONFIG_ARM64_CPU_PARK +int kexec_smp_send_park(void) +{ + unsigned long cpu; + + if (WARN_ON(!kexec_in_progress)) { + pr_crit("%s called not in kexec progress.", __func__); + return -EPERM; + } + + if (mmap_cpu_park_mem() != 0) { + pr_info("no cpuparkmem, goto normal way."); + return -EPERM; + } + + local_irq_disable(); + + if (num_online_cpus() > 1) { + cpumask_t mask; + + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + for_each_cpu(cpu, &mask) + install_cpu_park(cpu); + smp_cross_call(&mask, IPI_CPU_STOP); + + /* Wait for other CPUs to park */ + for_each_cpu(cpu, &mask) + cpu_wait_park(cpu); + pr_info("smp park other cpus done\n"); + } + + sdei_mask_local_cpu(); + + return 0; +} +#endif + #ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index a3edb18a22f1..5b0c785e2889 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -297,6 +297,57 @@ static void __init fdt_enforce_memory_region(void) memblock_add(usable_rgns[1].base, usable_rgns[1].size); }
+#ifdef CONFIG_ARM64_CPU_PARK +struct cpu_park_info park_info = { + .start = 0, + .len = PARK_SECTION_SIZE * NR_CPUS, + .start_v = 0, +}; + +static int __init parse_park_mem(char *p) +{ + if (!p) + return 0; + + park_info.start = PAGE_ALIGN(memparse(p, NULL)); + if (park_info.start == 0) + pr_info("cpu park mem params[%s]", p); + + return 0; +} +early_param("cpuparkmem", parse_park_mem); + +static int __init reserve_park_mem(void) +{ + if (park_info.start == 0 || park_info.len == 0) + return 0; + + park_info.start = PAGE_ALIGN(park_info.start); + park_info.len = PAGE_ALIGN(park_info.len); + + if (!memblock_is_region_memory(park_info.start, park_info.len)) { + pr_warn("cannot reserve park mem: region is not memory!"); + goto out; + } + + if (memblock_is_region_reserved(park_info.start, park_info.len)) { + pr_warn("cannot reserve park mem: region overlaps reserved memory!"); + goto out; + } + + memblock_remove(park_info.start, park_info.len); + pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld MB)", + park_info.start, park_info.start + park_info.len, + park_info.len >> 20); + + return 0; +out: + park_info.start = 0; + park_info.len = 0; + return -EINVAL; +} +#endif + void __init arm64_memblock_init(void) { const s64 linear_region_size = BIT(vabits_actual - 1); @@ -448,6 +499,19 @@ void __init bootmem_init(void) */ dma_contiguous_reserve(arm64_dma_phys_limit);
+ /* + * Reserve park memory before crashkernel and quick kexec. + * Because park memory must be specified by address, but + * crashkernel and quickkexec may be specified by memory length, + * then find one sutiable memory region to reserve. + * + * So reserve park memory firstly is better, but it may cause + * crashkernel or quickkexec reserving failed. + */ +#ifdef CONFIG_ARM64_CPU_PARK + reserve_park_mem(); +#endif + /* * request_standard_resources() depends on crashkernel's memory being * reserved, so do it here.
From: ZhuLing <zhuling8@huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: NA
------------------------------
Register pmem on arm64: use the memmap parameter (memmap=nn[KMG]!ss[KMG]) to reserve memory, and the e820 helper (drivers/nvdimm/e820.c) to register it as persistent memory. When the kernel restarts or is updated, the data in PMEM is not lost and can be loaded faster. This is a generic feature.

drivers/nvdimm/e820.c scans "iomem_resource" and takes advantage of the nvdimm resource discovery mechanism by registering resources named "Persistent Memory (legacy)"; this logic does not depend on the architecture.

We will push the feature to the upstream Linux kernel community and discuss renaming the file there, because the name e820.c gives the mistaken impression that the code depends on x86.
To use this feature:
1. Reserve memory: add a memmap entry to the kernel command line in grub.cfg, memmap=nn[KMG]!ss[KMG], e.g. memmap=100K!0x1a0000000.
2. Load the driver: modprobe nd_e820 (nd_e820.ko).
3. Check for the pmem device under /dev, e.g. /dev/pmem0.
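As a quick sanity check, a hypothetical userspace sketch (assuming the device shows up as /dev/pmem0 as in step 3): write a marker to the device, sync it, and read it back; across a kexec-based kernel update the marker should still be there.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[16] = { 0 };
	int fd = open("/dev/pmem0", O_RDWR);	/* legacy pmem block device */

	if (fd < 0) {
		perror("open /dev/pmem0");
		return 1;
	}

	/* Write a marker and make sure it reaches the reserved memory. */
	pwrite(fd, "hotupgrade", 11, 0);
	fsync(fd);

	/* Read it back; after a kernel update this should still match. */
	pread(fd, buf, 11, 0);
	printf("marker: %s\n", buf);

	close(fd);
	return 0;
}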
Signed-off-by: ZhuLing <zhuling8@huawei.com>
Signed-off-by: Sang Yan <sangyan@huawei.com>
Reviewed-by: Chen Wandun <chenwandun@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/Kconfig         | 21 +++++++++
 arch/arm64/kernel/Makefile |  1 +
 arch/arm64/kernel/pmem.c   | 35 ++++++++++++++
 arch/arm64/kernel/setup.c  | 10 ++++
 arch/arm64/mm/init.c       | 95 ++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/Kconfig     |  5 ++
 drivers/nvdimm/Makefile    |  2 +-
 7 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kernel/pmem.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a86cdfab8b39..a2380374ef59 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1302,6 +1302,27 @@ config RODATA_FULL_DEFAULT_ENABLED This requires the linear region to be mapped down to pages, which may adversely affect performance in some cases.
+config ARM64_PMEM_RESERVE + bool "Reserve memory for persistent storage" + default n + help + Use memmap=nn[KMG]!ss[KMG](memmap=100K!0x1a0000000) reserve + memory for persistent storage. + + Say y here to enable this feature. + +config ARM64_PMEM_LEGACY_DEVICE + bool "Create persistent storage" + depends on BLK_DEV + depends on LIBNVDIMM + select ARM64_PMEM_RESERVE + help + Use reserved memory for persistent storage when the kernel + restart or update. the data in PMEM will not be lost and + can be loaded faster. + + Say y if unsure. + config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" help diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 169d90f11cf5..f6153250b631 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-$(CONFIG_MPAM) += mpam/ +obj-$(CONFIG_ARM64_PMEM_LEGACY_DEVICE) += pmem.o
obj-y += vdso/ probes/ obj-$(CONFIG_COMPAT_VDSO) += vdso32/ diff --git a/arch/arm64/kernel/pmem.c b/arch/arm64/kernel/pmem.c new file mode 100644 index 000000000000..16eaf706f671 --- /dev/null +++ b/arch/arm64/kernel/pmem.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(c) 2021 Huawei Technologies Co., Ltd + * + * Derived from x86 and arm64 implement PMEM. + */ +#include <linux/platform_device.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/module.h> + +static int found(struct resource *res, void *data) +{ + return 1; +} + +static int __init register_e820_pmem(void) +{ + struct platform_device *pdev; + int rc; + + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, NULL, found); + if (rc <= 0) + return 0; + + /* + * See drivers/nvdimm/e820.c for the implementation, this is + * simply here to trigger the module to load on demand. + */ + pdev = platform_device_alloc("e820_pmem", -1); + + return platform_device_add(pdev); +} +device_initcall(register_e820_pmem); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index b0fe635e943e..1c3da53c5d57 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -67,6 +67,10 @@ static int __init arm64_enable_cpu0_hotplug(char *str) __setup("arm64_cpu0_hotplug", arm64_enable_cpu0_hotplug); #endif
+#ifdef CONFIG_ARM64_PMEM_RESERVE +extern struct resource pmem_res; +#endif + phys_addr_t __fdt_pointer __initdata;
/* @@ -274,6 +278,12 @@ static void __init request_standard_resources(void) request_resource(res, &quick_kexec_res); #endif } + +#ifdef CONFIG_ARM64_PMEM_RESERVE + if (pmem_res.end && pmem_res.start) + request_resource(&iomem_resource, &pmem_res); +#endif + }
static int __init reserve_memblock_reserved_regions(void) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5b0c785e2889..dd33a2a9404c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -52,6 +52,7 @@ */ s64 memstart_addr __ro_after_init = -1; EXPORT_SYMBOL(memstart_addr); +phys_addr_t start_at, mem_size;
/* * If the corresponding config options are enabled, we create both ZONE_DMA @@ -62,6 +63,18 @@ EXPORT_SYMBOL(memstart_addr); */ phys_addr_t arm64_dma_phys_limit __ro_after_init;
+static unsigned long long pmem_size, pmem_start; + +#ifdef CONFIG_ARM64_PMEM_RESERVE +struct resource pmem_res = { + .name = "Persistent Memory (legacy)", + .start = 0, + .end = 0, + .flags = IORESOURCE_MEM, + .desc = IORES_DESC_PERSISTENT_MEMORY_LEGACY +}; +#endif + #ifndef CONFIG_KEXEC_CORE static void __init reserve_crashkernel(void) { @@ -348,6 +361,83 @@ static int __init reserve_park_mem(void) } #endif
+static bool __init is_mem_valid(unsigned long long mem_size, unsigned long long mem_start) +{ + if (!memblock_is_region_memory(mem_start, mem_size)) { + pr_warn("cannot reserve mem: region is not memory!\n"); + return false; + } + + if (memblock_is_region_reserved(mem_start, mem_size)) { + pr_warn("cannot reserve mem: region overlaps reserved memory!\n"); + return false; + } + + if (!IS_ALIGNED(mem_start, SZ_2M)) { + pr_warn("cannot reserve mem: base address is not 2MB aligned!\n"); + return false; + } + + return true; +} + +static int __init parse_memmap_one(char *p) +{ + char *oldp; + + if (!p) + return -EINVAL; + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + if (!mem_size) + return -EINVAL; + + mem_size = PAGE_ALIGN(mem_size); + + if (*p == '!') { + start_at = memparse(p+1, &p); + + pmem_start = start_at; + pmem_size = mem_size; + } else + pr_info("Unrecognized memmap option, please check the parameter.\n"); + + return *p == '\0' ? 0 : -EINVAL; +} + +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + parse_memmap_one(str); + str = k; + } + + return 0; +} +early_param("memmap", parse_memmap_opt); + +#ifdef CONFIG_ARM64_PMEM_RESERVE +static void __init reserve_pmem(void) +{ + if (!is_mem_valid(mem_size, start_at)) + return; + + memblock_remove(pmem_start, pmem_size); + pr_info("pmem reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + pmem_start, pmem_start + pmem_size, pmem_size >> 20); + pmem_res.start = pmem_start; + pmem_res.end = pmem_start + pmem_size - 1; +} +#endif + void __init arm64_memblock_init(void) { const s64 linear_region_size = BIT(vabits_actual - 1); @@ -521,6 +611,11 @@ void __init bootmem_init(void) #ifdef CONFIG_QUICK_KEXEC reserve_quick_kexec(); #endif + +#ifdef CONFIG_ARM64_PMEM_RESERVE + reserve_pmem(); +#endif + memblock_dump_all(); }
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index b7d1eb38b27d..ce4de75262b9 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -132,3 +132,8 @@ config NVDIMM_TEST_BUILD infrastructure.
endif + +config PMEM_LEGACY + tristate "Pmem_legacy" + select X86_PMEM_LEGACY if X86 + select ARM64_PMEM_LEGACY_DEVICE if ARM64 diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 29203f3d3069..6f8dc9242a81 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o -obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o +obj-$(CONFIG_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_OF_PMEM) += of_pmem.o obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
From: Jingxian He <hejingxian@huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
We can use checkpoint and restore in userspace (CRIU) to dump and restore tasks when updating the kernel. Currently, CRIU has to dump all of a task's memory data to files. When the memory is large (more than 1 GB), dumping the data takes a long time (more than 1 minute).

By pinning the memory data of tasks and collecting the corresponding physical page mapping information during checkpoint, we can remap those physical pages into the restored tasks after the kernel upgrade. This pin-memory method can restore the task data within one second.

The pin memory area information is saved in a reserved memblock region, which stays usable across the kernel update.
The pin memory driver provides the following ioctl commands for CRIU (a usage sketch follows the list):
1) SET_PIN_MEM_AREA: set a pin memory area that can later be remapped into the restored task.
2) CLEAR_PIN_MEM_AREA: clear the pin memory area info, allowing the user to reset the pinned data.
3) REMAP_PIN_MEM_AREA: remap the pinned pages into the restored task.
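A hypothetical sketch of how a CRIU-like tool could drive the /dev/pinmem misc device added by this patch; the ioctl numbers and structures are copied from drivers/char/pin_memory.c, while the pid and address range are placeholders. Both calls are shown together only to illustrate the call shape; in practice SET_PIN_MEM_AREA runs before the upgrade and REMAP_PIN_MEM_AREA in the restored system.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <unistd.h>

#define MAX_PIN_MEM_AREA_NUM	16
#define PIN_MEM_MAGIC		0x59

struct _pin_mem_area {
	unsigned long virt_start;
	unsigned long virt_end;
};

struct pin_mem_area_set {
	unsigned int pid;
	unsigned int area_num;
	struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM];
};

#define SET_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, 1, struct pin_mem_area_set)
#define REMAP_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, 3, int)

int main(void)
{
	int pid = 1234;			/* placeholder: task being checkpointed */
	struct pin_mem_area_set pmas = {
		.pid = 1234,
		.area_num = 1,
		/* placeholder range inside the task's address space */
		.mem_area = { { .virt_start = 0x400000, .virt_end = 0x800000 } },
	};
	int fd = open("/dev/pinmem", O_RDWR);

	if (fd < 0) {
		perror("open /dev/pinmem");
		return 1;
	}

	/* Checkpoint side: pin the pages and record their physical mapping. */
	if (ioctl(fd, SET_PIN_MEM_AREA, &pmas) < 0)
		perror("SET_PIN_MEM_AREA");

	/* Restore side, after the kernel update: remap the recorded pages. */
	if (ioctl(fd, REMAP_PIN_MEM_AREA, &pid) < 0)
		perror("REMAP_PIN_MEM_AREA");

	close(fd);
	return 0;
}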
Signed-off-by: Jingxian He <hejingxian@huawei.com>
Reviewed-by: Chen Wandun <chenwandun@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/kernel/setup.c  |   10 +
 arch/arm64/mm/init.c       |   57 ++
 drivers/char/Kconfig       |    7 +
 drivers/char/Makefile      |    1 +
 drivers/char/pin_memory.c  |  209 +++++
 fs/proc/task_mmu.c         |  138 +++++
 include/linux/crash_core.h |    5 +
 include/linux/pin_mem.h    |   93 +++
 kernel/crash_core.c        |   11 +
 mm/Kconfig                 |    8 +
 mm/Makefile                |    1 +
 mm/huge_memory.c           |   63 +++
 mm/memory.c                |   65 +++
 mm/pin_mem.c               | 1091 ++++++++++++++++++++++++++++++++++++
 14 files changed, 1759 insertions(+)
 create mode 100644 drivers/char/pin_memory.c
 create mode 100644 include/linux/pin_mem.h
 create mode 100644 mm/pin_mem.c
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 1c3da53c5d57..8c29faf00521 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -30,6 +30,9 @@ #include <linux/psci.h> #include <linux/sched/task.h> #include <linux/mm.h> +#ifdef CONFIG_PIN_MEMORY +#include <linux/pin_mem.h> +#endif
#include <asm/acpi.h> #include <asm/fixmap.h> @@ -271,12 +274,19 @@ static void __init request_standard_resources(void) crashk_res.end <= res->end) request_resource(res, &crashk_res); #endif + #ifdef CONFIG_QUICK_KEXEC if (quick_kexec_res.end && quick_kexec_res.start >= res->start && quick_kexec_res.end <= res->end) request_resource(res, &quick_kexec_res); #endif + +#ifdef CONFIG_PIN_MEMORY + if (pin_memory_resource.end && pin_memory_resource.start >= res->start && + pin_memory_resource.end <= res->end) + request_resource(res, &pin_memory_resource); +#endif }
#ifdef CONFIG_ARM64_PMEM_RESERVE diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index dd33a2a9404c..3800fb73db9c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -30,6 +30,9 @@ #include <linux/crash_dump.h> #include <linux/hugetlb.h> #include <linux/acpi_iort.h> +#ifdef CONFIG_PIN_MEMORY +#include <linux/pin_mem.h> +#endif
#include <asm/boot.h> #include <asm/fixmap.h> @@ -54,6 +57,52 @@ s64 memstart_addr __ro_after_init = -1; EXPORT_SYMBOL(memstart_addr); phys_addr_t start_at, mem_size;
+#ifdef CONFIG_PIN_MEMORY +struct resource pin_memory_resource = { + .name = "Pin memory", + .start = 0, + .end = 0, + .flags = IORESOURCE_MEM, + .desc = IORES_DESC_RESERVED +}; + +static void __init reserve_pin_memory_res(void) +{ + unsigned long long mem_start, mem_len; + int ret; + + ret = parse_pin_memory(boot_command_line, memblock_phys_mem_size(), + &mem_len, &mem_start); + if (ret || !mem_len) + return; + + mem_len = PAGE_ALIGN(mem_len); + + if (!memblock_is_region_memory(mem_start, mem_len)) { + pr_warn("cannot reserve for pin memory: region is not memory!\n"); + return; + } + + if (memblock_is_region_reserved(mem_start, mem_len)) { + pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n"); + return; + } + + if (!IS_ALIGNED(mem_start, SZ_2M)) { + pr_warn("cannot reserve for pin memory: base address is not 2MB aligned\n"); + return; + } + + memblock_reserve(mem_start, mem_len); + pin_memory_resource.start = mem_start; + pin_memory_resource.end = mem_start + mem_len - 1; +} +#else +static void __init reserve_pin_memory_res(void) +{ +} +#endif /* CONFIG_PIN_MEMORY */ + /* * If the corresponding config options are enabled, we create both ZONE_DMA * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory @@ -616,6 +665,8 @@ void __init bootmem_init(void) reserve_pmem(); #endif
+ reserve_pin_memory_res(); + memblock_dump_all(); }
@@ -705,6 +756,12 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ memblock_free_all();
+#ifdef CONFIG_PIN_MEMORY + /* pre alloc the pages for pin memory */ + init_reserve_page_map((unsigned long)pin_memory_resource.start, + (unsigned long)(pin_memory_resource.end - pin_memory_resource.start)); +#endif + mem_init_print_info(NULL);
/* diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index d229a2d0c017..4f451477281b 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -471,6 +471,13 @@ config ADI and SSM (Silicon Secured Memory). Intended consumers of this driver include crash and makedumpfile.
+config PIN_MEMORY_DEV + tristate "/dev/pinmem character device" + depends on PIN_MEMORY + default m + help + pin memory driver + endmenu
config RANDOM_TRUST_CPU diff --git a/drivers/char/Makefile b/drivers/char/Makefile index ffce287ef415..71d76fd62692 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -47,3 +47,4 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_XILLYBUS) += xillybus/ obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o obj-$(CONFIG_ADI) += adi.o +obj-$(CONFIG_PIN_MEMORY_DEV) += pin_memory.o diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c new file mode 100644 index 000000000000..9ddbaa64b392 --- /dev/null +++ b/drivers/char/pin_memory.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright @ Huawei Technologies Co., Ltd. 2020-2020. ALL rights reserved. + * Description: Euler pin memory driver + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm_types.h> +#include <linux/processor.h> +#include <uapi/asm-generic/ioctl.h> +#include <uapi/asm-generic/mman-common.h> +#include <uapi/asm/setup.h> +#include <linux/pin_mem.h> +#include <linux/sched/mm.h> + +#define MAX_PIN_MEM_AREA_NUM 16 +struct _pin_mem_area { + unsigned long virt_start; + unsigned long virt_end; +}; + +struct pin_mem_area_set { + unsigned int pid; + unsigned int area_num; + struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; +}; + +#define PIN_MEM_MAGIC 0x59 +#define _SET_PIN_MEM_AREA 1 +#define _CLEAR_PIN_MEM_AREA 2 +#define _REMAP_PIN_MEM_AREA 3 +#define _FINISH_PIN_MEM_DUMP 4 +#define _INIT_PAGEMAP_READ 5 +#define _PIN_MEM_IOC_MAX_NR 5 +#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) +#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) +#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) +#define FINISH_PIN_MEM_DUMP _IOW(PIN_MEM_MAGIC, _FINISH_PIN_MEM_DUMP, int) +#define INIT_PAGEMAP_READ _IOW(PIN_MEM_MAGIC, _INIT_PAGEMAP_READ, int) +static int set_pin_mem(struct pin_mem_area_set *pmas) +{ + int i; + int ret = 0; + struct _pin_mem_area *pma; + struct mm_struct *mm; + struct task_struct *task; + struct pid *pid_s; + + pid_s = find_get_pid(pmas->pid); + if (!pid_s) { + pr_warn("Get pid struct fail:%d.\n", pmas->pid); + return -EFAULT; + } + rcu_read_lock(); + task = pid_task(pid_s, PIDTYPE_PID); + if (!task) { + pr_warn("Get task struct fail:%d.\n", pmas->pid); + goto fail; + } + mm = get_task_mm(task); + for (i = 0; i < pmas->area_num; i++) { + pma = &(pmas->mem_area[i]); + ret = pin_mem_area(task, mm, pma->virt_start, pma->virt_end); + if (ret) { + mmput(mm); + goto fail; + } + } + mmput(mm); + rcu_read_unlock(); + put_pid(pid_s); + return ret; + +fail: + rcu_read_unlock(); + put_pid(pid_s); + return -EFAULT; +} + +static int set_pin_mem_area(unsigned long arg) +{ + struct pin_mem_area_set pmas; + void __user *buf = (void __user *)arg; + + if (copy_from_user(&pmas, buf, sizeof(pmas))) + return -EINVAL; + if (pmas.area_num > MAX_PIN_MEM_AREA_NUM) { + pr_warn("Input area_num is too large.\n"); + return -EINVAL; + } + + return set_pin_mem(&pmas); +} + +static int pin_mem_remap(unsigned long arg) +{ + int pid; + struct task_struct *task; + struct mm_struct *mm; + vm_fault_t ret; + void __user *buf = (void __user *)arg; + struct pid *pid_s; + + if (copy_from_user(&pid, buf, sizeof(int))) + return -EINVAL; + + pid_s = find_get_pid(pid); + if (!pid_s) { + pr_warn("Get pid 
struct fail:%d.\n", pid); + return -EINVAL; + } + rcu_read_lock(); + task = pid_task(pid_s, PIDTYPE_PID); + if (!task) { + pr_warn("Get task struct fail:%d.\n", pid); + goto fault; + } + mm = get_task_mm(task); + ret = do_mem_remap(pid, mm); + if (ret) { + pr_warn("Handle pin memory remap fail.\n"); + mmput(mm); + goto fault; + } + mmput(mm); + rcu_read_unlock(); + put_pid(pid_s); + return 0; + +fault: + rcu_read_unlock(); + put_pid(pid_s); + return -EFAULT; +} + +static long pin_memory_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long ret = 0; + + if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC) + return -EINVAL; + if (_IOC_NR(cmd) > _PIN_MEM_IOC_MAX_NR) + return -EINVAL; + + switch (cmd) { + case SET_PIN_MEM_AREA: + ret = set_pin_mem_area(arg); + break; + case CLEAR_PIN_MEM_AREA: + clear_pin_memory_record(); + break; + case REMAP_PIN_MEM_AREA: + ret = pin_mem_remap(arg); + break; + case FINISH_PIN_MEM_DUMP: + ret = finish_pin_mem_dump(); + break; + case INIT_PAGEMAP_READ: + ret = init_pagemap_read(); + break; + default: + return -EINVAL; + } + return ret; +} + +static const struct file_operations pin_memory_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = pin_memory_ioctl, + .compat_ioctl = pin_memory_ioctl, +}; + +static struct miscdevice pin_memory_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "pinmem", + .fops = &pin_memory_fops, +}; + +static int pin_memory_init(void) +{ + int err = misc_register(&pin_memory_miscdev); + + if (!err) + pr_info("pin_memory init\n"); + else + pr_warn("pin_memory init failed!\n"); + return err; +} + +static void pin_memory_exit(void) +{ + misc_deregister(&pin_memory_miscdev); + pr_info("pin_memory ko exists!\n"); +} + +module_init(pin_memory_init); +module_exit(pin_memory_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Euler"); +MODULE_DESCRIPTION("pin memory"); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4dc080939bdd..f46c7efd6147 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1665,6 +1665,144 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, return ret; }
+#ifdef CONFIG_PIN_MEMORY +static int get_pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct pagemapread *pm = walk->private; + spinlock_t *ptl; + pte_t *pte, *orig_pte; + int err = 0; + pagemap_entry_t pme; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; + struct page *page = NULL; + + if (pmd_present(pmd)) { + page = pmd_page(pmd); + flags |= PM_PRESENT; + frame = pmd_pfn(pmd) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + else if (is_swap_pmd(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned long offset; + + offset = swp_offset(entry) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + + flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + page = migration_entry_to_page(entry); + } +#endif + pme = make_pme(frame, flags); + err = add_to_pagemap(addr, &pme, pm); + spin_unlock(ptl); + return err; + } + + if (pmd_trans_unstable(pmdp)) + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); + for (; addr < end; pte++, addr += PAGE_SIZE) { + pme = pte_to_pagemap_entry(pm, vma, addr, *pte); + err = add_to_pagemap(addr, &pme, pm); + if (err) + break; + } + pte_unmap_unlock(orig_pte, ptl); + return err; +} + +static const struct mm_walk_ops pin_pagemap_ops = { + .pmd_entry = get_pagemap_pmd_range, + .pte_hole = pagemap_pte_hole, + .hugetlb_entry = pagemap_hugetlb_range, +}; + +void *create_pagemap_walk(void) +{ + struct pagemapread *pm; + struct mm_walk *pagemap_walk; + + pagemap_walk = kzalloc(sizeof(struct mm_walk), GFP_KERNEL); + if (!pagemap_walk) + return NULL; + pm = kmalloc(sizeof(struct pagemapread), GFP_KERNEL); + if (!pm) + goto out_free_walk; + + pm->show_pfn = true; + pm->len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT) + 1; + pm->buffer = kmalloc_array(pm->len, PM_ENTRY_BYTES, GFP_KERNEL); + if (!pm->buffer) + goto out_free; + + pagemap_walk->ops = &pin_pagemap_ops; + pagemap_walk->private = pm; + return (void *)pagemap_walk; +out_free: + kfree(pm); +out_free_walk: + kfree(pagemap_walk); + return NULL; +} + +void free_pagemap_walk(void *mem_walk) +{ + struct pagemapread *pm; + struct mm_walk *pagemap_walk = (struct mm_walk *)mem_walk; + + if (!pagemap_walk) + return; + if (pagemap_walk->private) { + pm = (struct pagemapread *)pagemap_walk->private; + kfree(pm->buffer); + kfree(pm); + pagemap_walk->private = NULL; + } + kfree(pagemap_walk); +} + +int pagemap_get(struct mm_struct *mm, void *mem_walk, + unsigned long start_vaddr, unsigned long end_vaddr, + unsigned long *pte_entry, unsigned int *count) +{ + int i, ret; + struct pagemapread *pm; + unsigned long end; + struct mm_walk *pagemap_walk = (struct mm_walk *)mem_walk; + + if (!pte_entry || !mm || !pagemap_walk) + return -EFAULT; + + pm = (struct pagemapread *)pagemap_walk->private; + pagemap_walk->mm = mm; + pm->pos = 0; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + if (end > end_vaddr) + end = end_vaddr; + ret = walk_page_range(mm, start_vaddr, end, pagemap_walk->ops, pm); + *count = pm->pos; + for (i = 0; i < pm->pos; i++) + pte_entry[i] = pm->buffer[i].pme; + return ret; +} +#endif + static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; diff --git 
a/include/linux/crash_core.h b/include/linux/crash_core.h index fc0ef33a76f7..30f0df3cfbfb 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -87,4 +87,9 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base);
+#ifdef CONFIG_PIN_MEMORY +int __init parse_pin_memory(char *cmdline, unsigned long long system_ram, + unsigned long long *pin_size, unsigned long long *pin_base); +#endif + #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h new file mode 100644 index 000000000000..b01cd05ace06 --- /dev/null +++ b/include/linux/pin_mem.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. + * Provide the pin memory method for check point and restore task. + */ +#ifndef _LINUX_PIN_MEMORY_H +#define _LINUX_PIN_MEMORY_H + +#ifdef CONFIG_PIN_MEMORY +#include <linux/errno.h> +#include <linux/mm_types.h> +#include <linux/err.h> +#ifdef CONFIG_ARM64 +#include <linux/ioport.h> +#endif + +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) + +#define COLLECT_PAGES_FINISH 0 +#define COLLECT_PAGES_NEED_CONTINUE 1 +#define COLLECT_PAGES_FAIL -1 + +#define COMPOUND_PAD_MASK 0xffffffff +#define COMPOUND_PAD_START 0x88 +#define COMPOUND_PAD_DELTA 0x40 +#define LIST_POISON4 0xdead000000000400 +#define PAGE_FLAGS_CHECK_RESERVED (1UL << PG_reserved) +#define SHA256_DIGEST_SIZE 32 +#define next_pme(pme) ((unsigned long *)((pme) + 1) + (pme)->nr_pages) +#define PIN_MEM_DUMP_MAGIC 0xfeab000000001acd +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) +#define PM_PRESENT BIT_ULL(63) +#define PM_SWAP BIT_ULL(62) +#define IS_PTE_PRESENT(entry) (((entry) & PM_PFRAME_MASK) && ((entry) & PM_PRESENT)) +#define NEXT_PIN_ADDR(next, end_addr) (((next) + HPAGE_PMD_SIZE) > (end_addr) ? \ + (end_addr) : ((next) + HPAGE_PMD_SIZE)) + +struct page_map_entry { + unsigned long virt_addr; + unsigned int nr_pages; + unsigned int is_huge_page; + unsigned long redirect_start; + unsigned long phy_addr_array[0]; +}; + +struct page_map_info { + int pid; + int pid_reserved; + unsigned int entry_num; + int disable_free_page; + struct page_map_entry *pme; +}; + +struct pin_mem_dump_info { + char sha_digest[SHA256_DIGEST_SIZE]; + unsigned long magic; + unsigned int pin_pid_num; + struct page_map_info pmi_array[0]; +}; + +struct redirect_info { + unsigned int redirect_pages; + unsigned int redirect_index[0]; +}; + +extern struct page_map_info *get_page_map_info_by_pid(int pid); +extern struct page_map_info *create_page_map_info_by_pid(int pid); +extern vm_fault_t do_mem_remap(int pid, struct mm_struct *mm); +extern vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, struct page *page); +extern void clear_pin_memory_record(void); +extern int pin_mem_area(struct task_struct *task, struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr); +extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, struct page *page); +extern int finish_pin_mem_dump(void); + +extern void *create_pagemap_walk(void); +extern void free_pagemap_walk(void *mem_walk); +extern int pagemap_get(struct mm_struct *mm, void *mem_walk, + unsigned long start_vaddr, unsigned long end_vaddr, + unsigned long *pte_entry, unsigned int *count); + +extern int init_pagemap_read(void); +/* reserve space for pin memory*/ +#ifdef CONFIG_ARM64 +extern struct resource pin_memory_resource; +#endif +extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size); + +#endif /* CONFIG_PIN_MEMORY */ +#endif /* _LINUX_PIN_MEMORY_H */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b8e76831100e..88d93da963e8 100644 --- 
a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -462,6 +462,17 @@ void __init reserve_crashkernel(void) } #endif /* CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL */
+#ifdef CONFIG_PIN_MEMORY +int __init parse_pin_memory(char *cmdline, + unsigned long long system_ram, + unsigned long long *pin_size, + unsigned long long *pin_base) +{ + return __parse_crashkernel(cmdline, system_ram, pin_size, pin_base, + "pinmemory=", NULL); +} +#endif + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { diff --git a/mm/Kconfig b/mm/Kconfig index f08be27b9cf0..15eebc24d703 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -898,4 +898,12 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool
+config PIN_MEMORY + bool "Support for pin memory" + depends on MMU && ARM64 + help + Say y here to enable the pin memory feature for checkpoint + and restore. We can pin the memory data of tasks and collect + the corresponding physical pages mapping info in checkpoint, + and remap the physical pages to restore tasks in restore. endmenu diff --git a/mm/Makefile b/mm/Makefile index 2b1991759835..b341ef0d3406 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -122,3 +122,4 @@ obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o +obj-$(CONFIG_PIN_MEMORY) += pin_mem.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 44c59572a95c..6407b9324cf1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3015,3 +3015,66 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) update_mmu_cache_pmd(vma, address, pvmw->pmd); } #endif + +#ifdef CONFIG_PIN_MEMORY +vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, struct page *page) +{ + gfp_t gfp; + pgtable_t pgtable; + spinlock_t *ptl; + pmd_t entry; + vm_fault_t ret = 0; + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) + return VM_FAULT_OOM; + gfp = alloc_hugepage_direct_gfpmask(vma); + + prep_transhuge_page(page); + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); + return VM_FAULT_FALLBACK; + } + cgroup_throttle_swaprate(page, gfp); + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } + __SetPageUptodate(page); + ptl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(!pmd_none(*pmd))) { + goto unlock_release; + } else { + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + page_add_new_anon_rmap(page, vma, address, true); + lru_cache_add_inactive_or_unevictable(page, vma); + pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable); + set_pmd_at(vma->vm_mm, address, pmd, entry); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + } + + return 0; + +unlock_release: + spin_unlock(ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + put_page(page); + return ret; +} +#endif diff --git a/mm/memory.c b/mm/memory.c index 4e8dc5c25cf1..0be6fd3198a8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5318,3 +5318,68 @@ void ptlock_free(struct page *page) kmem_cache_free(page_ptl_cachep, page->ptl); } #endif + +#ifdef CONFIG_PIN_MEMORY +vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, struct page *page) +{ + pte_t entry; + spinlock_t *ptl; + pte_t *pte; + vm_fault_t ret = 0; + + if (pte_alloc(vma->vm_mm, pmd)) + return VM_FAULT_OOM; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmd))) + return 0; + + /* Allocate our own private page. 
*/ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + goto oom_free_page; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, + &ptl); + if (!pte_none(*pte)) { + ret = VM_FAULT_FALLBACK; + goto release; + } + + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address, false); + lru_cache_add_inactive_or_unevictable(page, vma); + + set_pte_at(vma->vm_mm, address, pte, entry); + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + +unlock: + pte_unmap_unlock(pte, ptl); + return ret; + +release: + put_page(page); + goto unlock; +oom_free_page: + put_page(page); +oom: + return VM_FAULT_OOM; +} +#endif diff --git a/mm/pin_mem.c b/mm/pin_mem.c new file mode 100644 index 000000000000..df618ba1f5bb --- /dev/null +++ b/mm/pin_mem.c @@ -0,0 +1,1091 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. + * Provide the pin memory method for check point and restore task. + */ +#ifdef CONFIG_PIN_MEMORY +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/sched/cputime.h> +#include <linux/tick.h> +#include <linux/mm.h> +#include <linux/pin_mem.h> +#include <linux/idr.h> +#include <linux/page-isolation.h> +#include <linux/sched/mm.h> +#include <linux/ctype.h> +#include <linux/highmem.h> +#include <crypto/sha2.h> + +#define MAX_PIN_PID_NUM 128 +#define DEFAULT_REDIRECT_SPACE_SIZE 0x100000 + +static DEFINE_SPINLOCK(page_map_entry_lock); +static DEFINE_MUTEX(pin_mem_mutex); +static struct pin_mem_dump_info *pin_mem_dump_start; +static unsigned int pin_pid_num; +static unsigned int *pin_pid_num_addr; +static struct page_map_entry *__page_map_entry_start; +static unsigned long page_map_entry_end; +static struct page_map_info *user_space_reserve_start; +static struct page_map_entry *page_map_entry_start; + +unsigned int max_pin_pid_num __read_mostly; +unsigned long redirect_space_size __read_mostly; +static unsigned long redirect_space_start; +static void *pin_mem_pagewalk; +static unsigned long *pagemap_buffer; +static int reserve_user_map_pages_fail; + +static int __init setup_max_pin_pid_num(char *str) +{ + int ret; + + if (!str) + return 0; + + ret = kstrtouint(str, 10, &max_pin_pid_num); + if (ret) { + pr_warn("Unable to parse max pin pid num.\n"); + } else { + if (max_pin_pid_num > MAX_PIN_PID_NUM) { + max_pin_pid_num = 0; + pr_warn("Input max_pin_pid_num is too large.\n"); + } + } + return ret; +} +early_param("max_pin_pid_num", setup_max_pin_pid_num); + +static int __init setup_redirect_space_size(char *str) +{ + if (!str) + return 0; + + redirect_space_size = memparse(str, NULL); + if (!redirect_space_size) { + pr_warn("Unable to parse redirect space size, use the default value.\n"); + redirect_space_size = DEFAULT_REDIRECT_SPACE_SIZE; + } + return 0; +} +early_param("redirect_space_size", setup_redirect_space_size); + +static struct page_map_info *create_page_map_info(int pid) +{ + struct page_map_info *new; + + if (!user_space_reserve_start) + return NULL; + + if (pin_pid_num >= 
max_pin_pid_num) { + pr_warn("Pin pid num too large than max_pin_pid_num, fail create: %d!", pid); + return NULL; + } + new = (struct page_map_info *)(user_space_reserve_start + pin_pid_num); + new->pid = pid; + new->pme = NULL; + new->entry_num = 0; + new->pid_reserved = false; + new->disable_free_page = false; + (*pin_pid_num_addr)++; + pin_pid_num++; + return new; +} + +struct page_map_info *create_page_map_info_by_pid(int pid) +{ + unsigned long flags; + struct page_map_info *ret; + + spin_lock_irqsave(&page_map_entry_lock, flags); + ret = create_page_map_info(pid); + spin_unlock_irqrestore(&page_map_entry_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(create_page_map_info_by_pid); + +static struct page_map_info *get_page_map_info(int pid) +{ + int i; + + if (!user_space_reserve_start) + return NULL; + + for (i = 0; i < pin_pid_num; i++) { + if (user_space_reserve_start[i].pid == pid) + return &(user_space_reserve_start[i]); + } + return NULL; +} + +struct page_map_info *get_page_map_info_by_pid(int pid) +{ + unsigned long flags; + struct page_map_info *ret; + + spin_lock_irqsave(&page_map_entry_lock, flags); + ret = get_page_map_info(pid); + spin_unlock_irqrestore(&page_map_entry_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(get_page_map_info_by_pid); + +static struct page *find_head_page(struct page *page) +{ + struct page *p = page; + + while (!PageBuddy(p)) { + if (PageLRU(p)) + return NULL; + p--; + } + return p; +} + +static void spilt_page_area_left(struct zone *zone, struct free_area *area, struct page *page, + unsigned long size, int order) +{ + unsigned long cur_size = 1 << order; + unsigned long total_size = 0; + + while (size && cur_size > size) { + cur_size >>= 1; + order--; + area--; + if (cur_size <= size) { + list_add(&page[total_size].lru, &area->free_list[MIGRATE_MOVABLE]); + atomic_set(&(page[total_size]._mapcount), PAGE_BUDDY_MAPCOUNT_VALUE); + set_page_private(&page[total_size], order); + set_pageblock_migratetype(&page[total_size], MIGRATE_MOVABLE); + area->nr_free++; + total_size += cur_size; + size -= cur_size; + } + } +} + +static void spilt_page_area_right(struct zone *zone, struct free_area *area, struct page *page, + unsigned long size, int order) +{ + unsigned long cur_size = 1 << order; + struct page *right_page, *head_page; + + right_page = page + size; + while (size && cur_size > size) { + cur_size >>= 1; + order--; + area--; + if (cur_size <= size) { + head_page = right_page - cur_size; + list_add(&head_page->lru, &area->free_list[MIGRATE_MOVABLE]); + atomic_set(&(head_page->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE); + set_page_private(head_page, order); + set_pageblock_migratetype(head_page, MIGRATE_MOVABLE); + area->nr_free++; + size -= cur_size; + right_page = head_page; + } + } +} + +void reserve_page_from_buddy(unsigned long nr_pages, struct page *page) +{ + unsigned int current_order; + struct page *page_end; + struct free_area *area; + struct zone *zone; + struct page *head_page; + + head_page = find_head_page(page); + if (!head_page) { + pr_warn("Find page head fail."); + return; + } + + current_order = head_page->private; + page_end = head_page + (1 << current_order); + zone = page_zone(head_page); + area = &(zone->free_area[current_order]); + list_del(&head_page->lru); + atomic_set(&head_page->_mapcount, -1); + set_page_private(head_page, 0); + area->nr_free--; + + if (head_page != page) + spilt_page_area_left(zone, area, head_page, + (unsigned long)(page - head_page), current_order); + page = page + nr_pages; + if (page < page_end) { + 
spilt_page_area_right(zone, area, page, + (unsigned long)(page_end - page), current_order); + } else if (page > page_end) { + pr_warn("Find page end smaller than page."); + } +} + +static inline void reserve_user_normal_pages(struct page *page) +{ + atomic_inc(&page->_refcount); + reserve_page_from_buddy(1, page); +} + +static void init_huge_pmd_pages(struct page *head_page) +{ + int i = 0; + struct page *page = head_page; + unsigned long compound_pad = COMPOUND_PAD_START; + + __set_bit(PG_head, &page->flags); + __set_bit(PG_active, &page->flags); + atomic_set(&page->_refcount, 1); + page++; + i++; + page->compound_head = (unsigned long)head_page + 1; + page->compound_dtor = HUGETLB_PAGE_DTOR + 1; + page->compound_order = HPAGE_PMD_ORDER; + page++; + i++; + page->compound_head = (unsigned long)head_page + 1; + i++; + + INIT_LIST_HEAD(&(page->deferred_list)); + for (; i < HPAGE_PMD_NR; i++) { + page = head_page + i; + page->compound_head = (unsigned long)head_page + 1; + compound_pad += COMPOUND_PAD_DELTA; + } +} + +static inline void reserve_user_huge_pmd_pages(struct page *page) +{ + atomic_inc(&page->_refcount); + reserve_page_from_buddy((1 << HPAGE_PMD_ORDER), page); + init_huge_pmd_pages(page); +} + +void free_user_map_pages(unsigned int pid_index, unsigned int entry_index, unsigned int page_index) +{ + unsigned int i, j, index, order; + struct page_map_info *pmi; + struct page_map_entry *pme; + struct page *page; + unsigned long phy_addr; + + for (index = 0; index < pid_index; index++) { + pmi = &(user_space_reserve_start[index]); + pme = pmi->pme; + for (i = 0; i < pmi->entry_num; i++) { + for (j = 0; j < pme->nr_pages; j++) { + order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; + phy_addr = pme->phy_addr_array[j]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[j] = 0; + } + } + } + pme = (struct page_map_entry *)next_pme(pme); + } + } + + pmi = &(user_space_reserve_start[index]); + pme = pmi->pme; + for (i = 0; i < entry_index; i++) { + for (j = 0; j < pme->nr_pages; j++) { + order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; + phy_addr = pme->phy_addr_array[j]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[j] = 0; + } + } + } + pme = (struct page_map_entry *)next_pme(pme); + } + + for (j = 0; j < page_index; j++) { + order = pme->is_huge_page ? 
HPAGE_PMD_ORDER : 0; + phy_addr = pme->phy_addr_array[j]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[j] = 0; + } + } + } +} + +bool check_redirect_end_valid(struct redirect_info *redirect_start, + unsigned long max_redirect_page_num) +{ + unsigned long redirect_end; + + redirect_end = ((unsigned long)(redirect_start + 1) + + max_redirect_page_num * sizeof(unsigned int)); + if (redirect_end > redirect_space_start + redirect_space_size) + return false; + return true; +} + +static void reserve_user_space_map_pages(void) +{ + struct page_map_info *pmi; + struct page_map_entry *pme; + unsigned int i, j, index; + struct page *page; + unsigned long flags; + unsigned long phy_addr; + unsigned long redirect_pages = 0; + struct redirect_info *redirect_start = (struct redirect_info *)redirect_space_start; + + if (!user_space_reserve_start || !redirect_start) + return; + spin_lock_irqsave(&page_map_entry_lock, flags); + for (index = 0; index < pin_pid_num; index++) { + pmi = &(user_space_reserve_start[index]); + pme = pmi->pme; + for (i = 0; i < pmi->entry_num; i++) { + redirect_pages = 0; + if (!check_redirect_end_valid(redirect_start, pme->nr_pages)) + redirect_start = NULL; + + for (j = 0; j < pme->nr_pages; j++) { + phy_addr = pme->phy_addr_array[j]; + if (!phy_addr) + continue; + page = phys_to_page(phy_addr); + if (atomic_read(&page->_refcount)) { + if ((page->flags & PAGE_FLAGS_CHECK_RESERVED) + && !pme->redirect_start) + pme->redirect_start = + (unsigned long)redirect_start; + + if (redirect_start && + (page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + redirect_start->redirect_index[redirect_pages] = j; + redirect_pages++; + continue; + } else { + reserve_user_map_pages_fail = 1; + pr_warn("Page %pK refcount %d large than zero, no need reserve.\n", + page, atomic_read(&page->_refcount)); + goto free_pages; + } + } + + if (!pme->is_huge_page) + reserve_user_normal_pages(page); + else + reserve_user_huge_pmd_pages(page); + } + pme = (struct page_map_entry *)next_pme(pme); + if (redirect_pages && redirect_start) { + redirect_start->redirect_pages = redirect_pages; + redirect_start = (struct redirect_info *)( + (unsigned long)(redirect_start + 1) + + redirect_start->redirect_pages * sizeof(unsigned int)); + } + } + } + spin_unlock_irqrestore(&page_map_entry_lock, flags); + return; + +free_pages: + free_user_map_pages(index, i, j); + spin_unlock_irqrestore(&page_map_entry_lock, flags); +} + + +int calculate_pin_mem_digest(struct pin_mem_dump_info *pmdi, char *digest) +{ + int i; + struct sha256_state sctx; + + if (!digest) + digest = pmdi->sha_digest; + sha256_init(&sctx); + sha256_update(&sctx, (unsigned char *)(&(pmdi->magic)), + sizeof(struct pin_mem_dump_info) - SHA256_DIGEST_SIZE); + for (i = 0; i < pmdi->pin_pid_num; i++) { + sha256_update(&sctx, (unsigned char *)(&(pmdi->pmi_array[i])), + sizeof(struct page_map_info)); + } + sha256_final(&sctx, digest); + return 0; +} + +static int check_sha_digest(struct pin_mem_dump_info *pmdi) +{ + int ret = 0; + char digest[SHA256_DIGEST_SIZE] = {0}; + + ret = calculate_pin_mem_digest(pmdi, digest); + if (ret) { + pr_warn("calculate pin mem digest fail:%d\n", ret); + return ret; + } + if (memcmp(pmdi->sha_digest, digest, SHA256_DIGEST_SIZE)) { + pr_warn("pin mem dump info sha256 digest match error!\n"); + return -EFAULT; + } + return ret; +} + +/* + * The whole page map entry collect process must be Sequentially. 
+ * The user_space_reserve_start points to the first page map info for + * the first dump task. And the page_map_entry_start points to + * the first page map entry of the first dump vma. + */ +static void init_page_map_info(struct pin_mem_dump_info *pmdi, unsigned long map_len) +{ + if (pin_mem_dump_start || !max_pin_pid_num) { + pr_warn("pin page map already init or max_pin_pid_num not set.\n"); + return; + } + if (map_len < sizeof(struct pin_mem_dump_info) + + max_pin_pid_num * sizeof(struct page_map_info) + redirect_space_size) { + pr_warn("pin memory reserved memblock too small.\n"); + return; + } + if ((pmdi->magic != PIN_MEM_DUMP_MAGIC) || (pmdi->pin_pid_num > max_pin_pid_num) || + check_sha_digest(pmdi)) + memset(pmdi, 0, sizeof(struct pin_mem_dump_info)); + + pin_mem_dump_start = pmdi; + pin_pid_num = pmdi->pin_pid_num; + pr_info("pin_pid_num: %d\n", pin_pid_num); + pin_pid_num_addr = &(pmdi->pin_pid_num); + user_space_reserve_start = + (struct page_map_info *)pmdi->pmi_array; + page_map_entry_start = + (struct page_map_entry *)(user_space_reserve_start + max_pin_pid_num); + __page_map_entry_start = page_map_entry_start; + page_map_entry_end = (unsigned long)pmdi + map_len - redirect_space_size; + redirect_space_start = page_map_entry_end; + + if (pin_pid_num > 0) + reserve_user_space_map_pages(); +} + +int finish_pin_mem_dump(void) +{ + int ret; + + if (!pin_mem_dump_start) + return -EFAULT; + pin_mem_dump_start->magic = PIN_MEM_DUMP_MAGIC; + memset(pin_mem_dump_start->sha_digest, 0, SHA256_DIGEST_SIZE); + ret = calculate_pin_mem_digest(pin_mem_dump_start, NULL); + if (ret) { + pr_warn("calculate pin mem digest fail:%d\n", ret); + return ret; + } + return ret; +} +EXPORT_SYMBOL_GPL(finish_pin_mem_dump); + +int collect_pmd_huge_pages(struct task_struct *task, + unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme) +{ + int ret, i, res; + int index = 0; + unsigned long start = start_addr; + struct page *temp_page; + unsigned long *pte_entry = pagemap_buffer; + unsigned int count; + struct mm_struct *mm = task->mm; + + while (start < end_addr) { + temp_page = NULL; + count = 0; + ret = pagemap_get(mm, pin_mem_pagewalk, + start, start + HPAGE_PMD_SIZE, pte_entry, &count); + if (ret || !count) { + pr_warn("Get huge page fail: %d.", ret); + return COLLECT_PAGES_FAIL; + } + + /* For huge page, get one map entry per time. */ + if ((pte_entry[0] & PM_SWAP) && (count == 1)) { + res = get_user_pages_remote(task->mm, start, 1, + FOLL_TOUCH | FOLL_GET, &temp_page, NULL, NULL); + if (!res) { + pr_warn("Swap in huge page fail.\n"); + return COLLECT_PAGES_FAIL; + } + pme->phy_addr_array[index] = page_to_phys(temp_page); + start += HPAGE_PMD_SIZE; + index++; + continue; + } + + if (IS_PTE_PRESENT(pte_entry[0])) { + temp_page = pfn_to_page(pte_entry[0] & PM_PFRAME_MASK); + if (PageHead(temp_page)) { + atomic_inc(&((temp_page)->_refcount)); + start += HPAGE_PMD_SIZE; + pme->phy_addr_array[index] = page_to_phys(temp_page); + index++; + } else { + /* If the page is not compound head, goto collect normal pages. 
*/ + pme->nr_pages = index; + return COLLECT_PAGES_NEED_CONTINUE; + } + } else { + for (i = 1; i < count; i++) { + if (pte_entry[i] & PM_PFRAME_MASK) { + pme->nr_pages = index; + return COLLECT_PAGES_NEED_CONTINUE; + } + } + start += HPAGE_PMD_SIZE; + pme->phy_addr_array[index] = 0; + index++; + } + } + pme->nr_pages = index; + return COLLECT_PAGES_FINISH; +} + +int collect_normal_pages(struct task_struct *task, + unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme) +{ + int ret, res; + unsigned long next; + unsigned long i, nr_pages; + struct page *tmp_page; + unsigned long *phy_addr_array = pme->phy_addr_array; + unsigned int count; + unsigned long *pte_entry = pagemap_buffer; + struct mm_struct *mm = task->mm; + + next = (start_addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; + next = (next > end_addr) ? end_addr : next; + pme->nr_pages = 0; + while (start_addr < next) { + count = 0; + nr_pages = (PAGE_ALIGN(next) - start_addr) / PAGE_SIZE; + ret = pagemap_get(mm, pin_mem_pagewalk, + start_addr, next, pte_entry, &count); + if (ret || !count) { + pr_warn("Get user page fail: %d, count: %u.\n", + ret, count); + return COLLECT_PAGES_FAIL; + } + + if (IS_PTE_PRESENT(pte_entry[0])) { + tmp_page = pfn_to_page(pte_entry[0] & PM_PFRAME_MASK); + /* If the page is compound head, goto collect huge pages. */ + if (PageHead(tmp_page)) + return COLLECT_PAGES_NEED_CONTINUE; + if (PageTail(tmp_page)) { + start_addr = next; + pme->virt_addr = start_addr; + next = NEXT_PIN_ADDR(next, end_addr); + continue; + } + } + + for (i = 0; i < count; i++) { + if (pte_entry[i] & PM_SWAP) { + res = get_user_pages_remote(task->mm, start_addr + i * PAGE_SIZE, + 1, FOLL_TOUCH | FOLL_GET, &tmp_page, NULL, NULL); + if (!res) { + pr_warn("Swap in page fail.\n"); + return COLLECT_PAGES_FAIL; + } + phy_addr_array[i] = page_to_phys(tmp_page); + continue; + } + if (!IS_PTE_PRESENT(pte_entry[i])) { + phy_addr_array[i] = 0; + continue; + } + tmp_page = pfn_to_page(pte_entry[i] & PM_PFRAME_MASK); + atomic_inc(&(tmp_page->_refcount)); + phy_addr_array[i] = ((pte_entry[i] & PM_PFRAME_MASK) << PAGE_SHIFT); + } + pme->nr_pages += count; + phy_addr_array += count; + start_addr = next; + next = NEXT_PIN_ADDR(next, end_addr); + } + return COLLECT_PAGES_FINISH; +} + +void free_pin_pages(struct page_map_entry *pme) +{ + unsigned long i; + struct page *tmp_page; + + if (!pme) + return; + for (i = 0; i < pme->nr_pages; i++) { + if (pme->phy_addr_array[i]) { + tmp_page = phys_to_page(pme->phy_addr_array[i]); + atomic_dec(&(tmp_page->_refcount)); + pme->phy_addr_array[i] = 0; + } + } +} + +int init_pagemap_read(void) +{ + int ret = -ENOMEM; + + if (pin_mem_pagewalk) + return 0; + + mutex_lock(&pin_mem_mutex); + pin_mem_pagewalk = create_pagemap_walk(); + if (!pin_mem_pagewalk) + goto out; + pagemap_buffer = kmalloc(((PMD_SIZE >> PAGE_SHIFT) + 1) * + sizeof(unsigned long), GFP_KERNEL); + if (!pagemap_buffer) + goto free; + + ret = 0; +out: + mutex_unlock(&pin_mem_mutex); + return ret; +free: + free_pagemap_walk(pin_mem_pagewalk); + pin_mem_pagewalk = NULL; + goto out; +} +EXPORT_SYMBOL_GPL(init_pagemap_read); + +/* Users make sure that the pin memory belongs to anonymous vma. 
*/ +int pin_mem_area(struct task_struct *task, struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr) +{ + int pid, ret; + int is_huge_page = false; + unsigned int page_size; + unsigned long nr_pages, flags; + struct page_map_entry *pme = NULL; + struct page_map_info *pmi; + struct vm_area_struct *vma; + unsigned long i; + struct page *tmp_page; + + if (!page_map_entry_start + || !task || !mm + || start_addr >= end_addr || !pin_mem_pagewalk) + return -EFAULT; + + pid = task->pid; + spin_lock_irqsave(&page_map_entry_lock, flags); + nr_pages = ((end_addr - start_addr) / PAGE_SIZE); + if ((unsigned long)page_map_entry_start + + nr_pages * sizeof(unsigned long) + + sizeof(struct page_map_entry) >= page_map_entry_end) { + pr_warn("Page map entry use up!\n"); + ret = -ENOMEM; + goto finish; + } + + vma = find_extend_vma(mm, start_addr); + if (!vma) { + pr_warn("Find no match vma!\n"); + ret = -EFAULT; + goto finish; + } + if (start_addr == (start_addr & HPAGE_PMD_MASK) && + transparent_hugepage_enabled(vma)) { + page_size = HPAGE_PMD_SIZE; + is_huge_page = true; + } else { + page_size = PAGE_SIZE; + } + + pme = page_map_entry_start; + pme->virt_addr = start_addr; + pme->redirect_start = 0; + pme->is_huge_page = is_huge_page; + memset(pme->phy_addr_array, 0, nr_pages * sizeof(unsigned long)); + + down_read(&mm->mmap_lock); + if (!is_huge_page) { + ret = collect_normal_pages(task, start_addr, end_addr, pme); + if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) { + if (ret == COLLECT_PAGES_FINISH) { + ret = 0; + up_read(&mm->mmap_lock); + goto finish; + } + pme->is_huge_page = true; + page_size = HPAGE_PMD_SIZE; + ret = collect_pmd_huge_pages(task, pme->virt_addr, end_addr, pme); + } + } else { + ret = collect_pmd_huge_pages(task, start_addr, end_addr, pme); + if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) { + if (ret == COLLECT_PAGES_FINISH) { + ret = 0; + up_read(&mm->mmap_lock); + goto finish; + } + pme->is_huge_page = false; + page_size = PAGE_SIZE; + ret = collect_normal_pages(task, pme->virt_addr, end_addr, pme); + } + } + up_read(&mm->mmap_lock); + if (ret == COLLECT_PAGES_FAIL) { + ret = -EFAULT; + goto finish; + } + + /* check for zero pages */ + for (i = 0; i < pme->nr_pages; i++) { + tmp_page = phys_to_page(pme->phy_addr_array[i]); + if (!pme->is_huge_page) { + if (page_to_pfn(tmp_page) == my_zero_pfn(pme->virt_addr + i * PAGE_SIZE)) + pme->phy_addr_array[i] = 0; + } else if (is_huge_zero_page(tmp_page)) + pme->phy_addr_array[i] = 0; + } + + page_map_entry_start = (struct page_map_entry *)(next_pme(pme)); + pmi = get_page_map_info(pid); + if (!pmi) + pmi = create_page_map_info(pid); + if (!pmi) { + pr_warn("Create page map info fail for pid: %d!\n", pid); + ret = -EFAULT; + goto finish; + } + if (!pmi->pme) + pmi->pme = pme; + pmi->entry_num++; + spin_unlock_irqrestore(&page_map_entry_lock, flags); + + if (ret == COLLECT_PAGES_NEED_CONTINUE) + ret = pin_mem_area(task, mm, pme->virt_addr + pme->nr_pages * page_size, end_addr); + return ret; + +finish: + if (ret) + free_pin_pages(pme); + spin_unlock_irqrestore(&page_map_entry_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(pin_mem_area); + +vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma, + struct page_map_entry *pme) +{ + int ret; + unsigned int j, i; + pgd_t *pgd; + p4d_t *p4d; + pmd_t *pmd; + pud_t *pud; + struct page *page, *new; + unsigned long address; + unsigned long phy_addr; + unsigned int redirect_pages = 0; + struct redirect_info *redirect_start; + + redirect_start = (struct 
redirect_info *)pme->redirect_start; + for (j = 0; j < pme->nr_pages; j++) { + address = pme->virt_addr + j * PAGE_SIZE; + phy_addr = pme->phy_addr_array[j]; + if (!phy_addr) + continue; + + page = phys_to_page(phy_addr); + if (page_to_pfn(page) == my_zero_pfn(address)) { + pme->phy_addr_array[j] = 0; + continue; + } + pme->phy_addr_array[j] = 0; + + if (redirect_start && (redirect_pages < redirect_start->redirect_pages) && + (j == redirect_start->redirect_index[redirect_pages])) { + new = alloc_zeroed_user_highpage_movable(vma, address); + if (!new) { + pr_warn("Redirect alloc page fail\n"); + continue; + } + copy_page(page_to_virt(new), phys_to_virt(phy_addr)); + page = new; + redirect_pages++; + } + + page->mapping = NULL; + pgd = pgd_offset(mm, address); + ret = VM_FAULT_OOM; + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + goto free; + pud = pud_alloc(mm, p4d, address); + if (!pud) + goto free; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + goto free; + ret = do_anon_page_remap(vma, address, pmd, page); + if (ret) + goto free; + } + return 0; + +free: + for (i = j; i < pme->nr_pages; i++) { + phy_addr = pme->phy_addr_array[i]; + if (phy_addr) { + __free_page(phys_to_page(phy_addr)); + pme->phy_addr_array[i] = 0; + } + } + return ret; +} + +static inline gfp_t get_hugepage_gfpmask(struct vm_area_struct *vma) +{ + const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? 
__GFP_DIRECT_RECLAIM : + 0); + return GFP_TRANSHUGE_LIGHT; +} + +vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma, + struct page_map_entry *pme) +{ + int ret; + unsigned int j, i; + pgd_t *pgd; + p4d_t *p4d; + pmd_t *pmd; + pud_t *pud; + gfp_t gfp; + struct page *page, *new; + unsigned long address; + unsigned long phy_addr; + unsigned int redirect_pages = 0; + struct redirect_info *redirect_start; + + redirect_start = (struct redirect_info *)pme->redirect_start; + for (j = 0; j < pme->nr_pages; j++) { + address = pme->virt_addr + j * HPAGE_PMD_SIZE; + phy_addr = pme->phy_addr_array[j]; + if (!phy_addr) + continue; + + page = phys_to_page(phy_addr); + if (is_huge_zero_page(page)) { + pme->phy_addr_array[j] = 0; + continue; + } + pme->phy_addr_array[j] = 0; + + if (redirect_start && (redirect_pages < redirect_start->redirect_pages) && + (j == redirect_start->redirect_index[redirect_pages])) { + gfp = get_hugepage_gfpmask(vma); + new = alloc_hugepage_vma(gfp, vma, address, HPAGE_PMD_ORDER); + if (!new) { + pr_warn("Redirect alloc huge page fail\n"); + continue; + } + memcpy(page_to_virt(new), phys_to_virt(phy_addr), HPAGE_PMD_SIZE); + page = new; + redirect_pages++; + } + + pgd = pgd_offset(mm, address); + ret = VM_FAULT_OOM; + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + goto free; + pud = pud_alloc(mm, p4d, address); + if (!pud) + goto free; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + goto free; + ret = do_anon_huge_page_remap(vma, address, pmd, page); + if (ret) + goto free; + } + return 0; + +free: + for (i = j; i < pme->nr_pages; i++) { + phy_addr = pme->phy_addr_array[i]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, HPAGE_PMD_ORDER); + pme->phy_addr_array[i] = 0; + } + } + } + return ret; +} + +static void free_unmap_pages(struct page_map_info *pmi, + struct page_map_entry *pme, + unsigned int index) +{ + unsigned int i, j; + unsigned long phy_addr; + unsigned int order; + struct page *page; + + pme = (struct page_map_entry *)(next_pme(pme)); + for (i = index; i < pmi->entry_num; i++) { + for (j = 0; j < pme->nr_pages; j++) { + phy_addr = pme->phy_addr_array[i]; + if (phy_addr) { + page = phys_to_page(phy_addr); + order = pme->is_huge_page ? 
HPAGE_PMD_ORDER : 0; + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[i] = 0; + } + } + } + pme = (struct page_map_entry *)(next_pme(pme)); + } +} + +vm_fault_t do_mem_remap(int pid, struct mm_struct *mm) +{ + unsigned int i = 0; + vm_fault_t ret = 0; + struct vm_area_struct *vma; + struct page_map_info *pmi; + struct page_map_entry *pme; + unsigned long flags; + + if (reserve_user_map_pages_fail || !mm) + return -EFAULT; + + spin_lock_irqsave(&page_map_entry_lock, flags); + pmi = get_page_map_info(pid); + if (pmi) + pmi->disable_free_page = true; + spin_unlock_irqrestore(&page_map_entry_lock, flags); + if (!pmi) + return -EFAULT; + + down_write(&mm->mmap_lock); + pme = pmi->pme; + vma = mm->mmap; + while ((i < pmi->entry_num) && (vma != NULL)) { + if (pme->virt_addr >= vma->vm_start && pme->virt_addr < vma->vm_end) { + i++; + if (!vma_is_anonymous(vma)) { + pme = (struct page_map_entry *)(next_pme(pme)); + continue; + } + if (!pme->is_huge_page) { + ret = remap_normal_pages(mm, vma, pme); + if (ret < 0) + goto free; + } else { + ret = remap_huge_pmd_pages(mm, vma, pme); + if (ret < 0) + goto free; + } + pme = (struct page_map_entry *)(next_pme(pme)); + } else { + vma = vma->vm_next; + } + } + up_write(&mm->mmap_lock); + return 0; + +free: + free_unmap_pages(pmi, pme, i); + up_write(&mm->mmap_lock); + return ret; +} +EXPORT_SYMBOL_GPL(do_mem_remap); + +#if defined(CONFIG_ARM64) +void init_reserve_page_map(unsigned long map_addr, unsigned long map_size) +{ + void *addr; + + if (!map_addr || !map_size) + return; + addr = phys_to_virt(map_addr); + init_page_map_info((struct pin_mem_dump_info *)addr, map_size); +} +#else +void init_reserve_page_map(unsigned long map_addr, unsigned long map_size) +{ +} +#endif + +static void free_all_reserved_pages(void) +{ + unsigned int i, j, index, order; + struct page_map_info *pmi; + struct page_map_entry *pme; + struct page *page; + unsigned long phy_addr; + + if (!user_space_reserve_start || reserve_user_map_pages_fail) + return; + + for (index = 0; index < pin_pid_num; index++) { + pmi = &(user_space_reserve_start[index]); + if (pmi->disable_free_page) + continue; + pme = pmi->pme; + for (i = 0; i < pmi->entry_num; i++) { + for (j = 0; j < pme->nr_pages; j++) { + order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; + phy_addr = pme->phy_addr_array[j]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[j] = 0; + } + } + } + pme = (struct page_map_entry *)next_pme(pme); + } + } +} + +/* Clear all pin memory record. */ +void clear_pin_memory_record(void) +{ + unsigned long flags; + + spin_lock_irqsave(&page_map_entry_lock, flags); + free_all_reserved_pages(); + if (pin_pid_num_addr) { + *pin_pid_num_addr = 0; + pin_pid_num = 0; + page_map_entry_start = __page_map_entry_start; + } + spin_unlock_irqrestore(&page_map_entry_lock, flags); +} +EXPORT_SYMBOL_GPL(clear_pin_memory_record); + +#endif /* CONFIG_PIN_MEMORY */
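For reference, a minimal sketch of how a caller such as the pin_memory character device (drivers/char/pin_memory.c in the diffstat) might drive the interface exported above during checkpoint and restore. Only init_pagemap_read(), pin_mem_area(), finish_pin_mem_dump() and do_mem_remap() come from this series; the helper names, headers and error handling below are illustrative assumptions, not code from the patch.

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_types.h>
#include <linux/pin_mem.h>

/* Checkpoint side (illustrative): record the pages backing [start, end) of tsk. */
static int pin_one_range(struct task_struct *tsk, unsigned long start,
			 unsigned long end)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int ret;

	if (!mm)
		return -ESRCH;

	ret = init_pagemap_read();			/* one-time pagemap walker setup */
	if (!ret)
		ret = pin_mem_area(tsk, mm, start, end);	/* takes mmap_lock itself */
	if (!ret)
		ret = finish_pin_mem_dump();		/* recompute the dump info digest */

	mmput(mm);
	return ret;
}

/* Restore side (illustrative): replay the recorded entries into the new mm. */
static vm_fault_t remap_for_pid(int pid, struct mm_struct *mm)
{
	return do_mem_remap(pid, mm);
}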
From: Jingxian He hejingxian@huawei.com
hulk inclusion category: feature bugzilla: 48159 CVE: N/A
------------------------------
We record the pids of dump tasks in the reserved memory and reserve those pids before the init task starts. In the recovery process, we free the reserved pids and reallocate them for the restored tasks.
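This hook pairs with a user-space restorer that recreates each dump task under its original pid. The sketch below is not part of the patch; it only shows the standard clone3()/set_tid path (Linux 5.5+, needs CAP_SYS_ADMIN or CAP_CHECKPOINT_RESTORE) that reaches the free_reserved_pid() call added to alloc_pid() below. The helper name restore_with_old_pid() is an assumption for illustration.

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args (uapi) */
#include <sys/syscall.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

/* Recreate a checkpointed task under its old pid (single pid namespace level). */
static pid_t restore_with_old_pid(pid_t old_pid)
{
	pid_t set_tid[1] = { old_pid };
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	args.exit_signal = SIGCHLD;
	args.set_tid = (uint64_t)(uintptr_t)set_tid;
	args.set_tid_size = 1;

	/* Returns old_pid in the parent on success, 0 in the child. */
	return (pid_t)syscall(SYS_clone3, &args, sizeof(args));
}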
Signed-off-by: Jingxian He hejingxian@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/pin_mem.h | 6 +++++ kernel/pid.c | 10 ++++++++ mm/Kconfig | 10 ++++++++ mm/pin_mem.c | 51 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+)
diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h index b01cd05ace06..6c54482a42a1 100644 --- a/include/linux/pin_mem.h +++ b/include/linux/pin_mem.h @@ -89,5 +89,11 @@ extern struct resource pin_memory_resource; #endif extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size);
+#ifdef CONFIG_PID_RESERVE +extern bool is_need_reserve_pids(void); +extern void free_reserved_pid(struct idr *idr, int pid); +extern void reserve_pids(struct idr *idr, int pid_max); +#endif + #endif /* CONFIG_PIN_MEMORY */ #endif /* _LINUX_PIN_MEMORY_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 0b90596f9f12..be2ec1d26896 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -44,6 +44,9 @@ #include <linux/idr.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> +#ifdef CONFIG_PID_RESERVE +#include <linux/pin_mem.h> +#endif
struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -209,6 +212,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock);
if (tid) { +#ifdef CONFIG_PID_RESERVE + free_reserved_pid(&tmp->idr, tid); +#endif nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* @@ -622,6 +628,10 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); +#ifdef CONFIG_PID_RESERVE + if (is_need_reserve_pids()) + reserve_pids(&init_pid_ns.idr, pid_max); +#endif }
static struct file *__pidfd_fget(struct task_struct *task, int fd) diff --git a/mm/Kconfig b/mm/Kconfig index 15eebc24d703..8d1f074b615b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -906,4 +906,14 @@ config PIN_MEMORY and restore. We can pin the memory data of tasks and collect the corresponding physical pages mapping info in checkpoint, and remap the physical pages to restore tasks in restore. + +config PID_RESERVE + bool "Support for reserve pid" + depends on PIN_MEMORY + help + Say y here to enable the pid reserve feature for checkpoint + and restore. + We record the pids of dump tasks in the reserved memory, + and reserve those pids before the init task starts. In the restore + process, free the reserved pids and reallocate them for use. endmenu diff --git a/mm/pin_mem.c b/mm/pin_mem.c index df618ba1f5bb..3fd3638a89e3 100644 --- a/mm/pin_mem.c +++ b/mm/pin_mem.c @@ -1088,4 +1088,55 @@ void clear_pin_memory_record(void) } EXPORT_SYMBOL_GPL(clear_pin_memory_record);
+#ifdef CONFIG_PID_RESERVE +struct idr *reserve_idr; + +/* test if there exist pin memory tasks */ +bool is_need_reserve_pids(void) +{ + return (pin_pid_num > 0); +} + +void free_reserved_pid(struct idr *idr, int pid) +{ + unsigned int index; + struct page_map_info *pmi; + + if (!max_pin_pid_num || idr != reserve_idr) + return; + + for (index = 0; index < pin_pid_num; index++) { + pmi = &(user_space_reserve_start[index]); + if (pmi->pid == pid && pmi->pid_reserved) { + idr_remove(idr, pid); + return; + } + } +} + +/* reserve pids for check point tasks which pinned memory */ +void reserve_pids(struct idr *idr, int pid_max) +{ + int alloc_pid; + unsigned int index; + struct page_map_info *pmi; + + if (!max_pin_pid_num) + return; + reserve_idr = idr; + for (index = 0; index < pin_pid_num; index++) { + pmi = &(user_space_reserve_start[index]); + pmi->pid_reserved = true; + alloc_pid = idr_alloc(idr, NULL, pmi->pid, pid_max, GFP_ATOMIC); + if (alloc_pid != pmi->pid) { + if (alloc_pid > 0) + idr_remove(idr, alloc_pid); + pr_warn("Reserve pid (%d) fail, real pid is %d.\n", alloc_pid, pmi->pid); + pmi->pid_reserved = false; + continue; + } + } +} +#endif /* CONFIG_PID_RESERVE */ + #endif /* CONFIG_PIN_MEMORY */
From: Sang Yan sangyan@huawei.com
hulk inclusion category: feature bugzilla: 48159 CVE: N/A
------------------------------
Enable kernel hot upgrade features by default:
1. add pin mem method for checkpoint and restore: CONFIG_PIN_MEMORY=y CONFIG_PIN_MEMORY_DEV=m
2. add pid reserve method for checkpoint and restore: CONFIG_PID_RESERVE=y
3. add cpu park method: CONFIG_ARM64_CPU_PARK=y
4. add quick kexec support for kernel: CONFIG_QUICK_KEXEC=y
5. add legacy pmem support for arm64: CONFIG_ARM64_PMEM_RESERVE=y CONFIG_ARM64_PMEM_LEGACY_DEVICE=y CONFIG_PMEM_LEGACY=m
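As a rough illustration of what these defaults switch on, the fragment below gathers the build-time hooks from the earlier patches behind their config options. The wrapper function is hypothetical; in the series the equivalent calls live in the arm64 setup code and in pid_idr_init().

#include <linux/init.h>
#include <linux/idr.h>
#include <linux/pin_mem.h>

/* Hypothetical helper, not part of the series. */
static void __init hotupgrade_setup(unsigned long map_addr, unsigned long map_size,
				    struct idr *pid_idr, int pid_limit)
{
#ifdef CONFIG_PIN_MEMORY
	init_reserve_page_map(map_addr, map_size);	/* CONFIG_PIN_MEMORY=y */
#endif
#ifdef CONFIG_PID_RESERVE
	if (is_need_reserve_pids())			/* CONFIG_PID_RESERVE=y */
		reserve_pids(pid_idr, pid_limit);
#endif
}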
Signed-off-by: Sang Yan sangyan@huawei.com Signed-off-by: Jingxian He hejingxian@huawei.com Signed-off-by: Zhu Ling zhuling8@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index cb7ea0f4b282..fd62a5d7d069 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -408,10 +408,13 @@ CONFIG_PARAVIRT_TIME_ACCOUNTING=y CONFIG_KEXEC=y # CONFIG_KEXEC_FILE is not set CONFIG_CRASH_DUMP=y +CONFIG_ARM64_CPU_PARK=y # CONFIG_XEN is not set CONFIG_FORCE_MAX_ZONEORDER=14 CONFIG_UNMAP_KERNEL_AT_EL0=y CONFIG_RODATA_FULL_DEFAULT_ENABLED=y +CONFIG_ARM64_PMEM_RESERVE=y +CONFIG_ARM64_PMEM_LEGACY_DEVICE=y # CONFIG_ARM64_SW_TTBR0_PAN is not set CONFIG_ARM64_TAGGED_ADDR_ABI=y CONFIG_ARM64_ILP32=y @@ -711,6 +714,7 @@ CONFIG_CRYPTO_AES_ARM64_BS=m # CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y +CONFIG_QUICK_KEXEC=y CONFIG_SET_FS=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y @@ -1041,6 +1045,8 @@ CONFIG_FRAME_VECTOR=y # CONFIG_GUP_BENCHMARK is not set # CONFIG_READ_ONLY_THP_FOR_FS is not set CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_PIN_MEMORY=y +CONFIG_PID_RESERVE=y # end of Memory Management options
CONFIG_NET=y @@ -3285,6 +3291,7 @@ CONFIG_TCG_TIS_ST33ZP24=y CONFIG_TCG_TIS_ST33ZP24_I2C=y CONFIG_TCG_TIS_ST33ZP24_SPI=y # CONFIG_XILLYBUS is not set +CONFIG_PIN_MEMORY_DEV=m # end of Character devices
# CONFIG_RANDOM_TRUST_CPU is not set @@ -6007,6 +6014,7 @@ CONFIG_ND_BTT=m CONFIG_BTT=y CONFIG_OF_PMEM=m CONFIG_NVDIMM_KEYS=y +CONFIG_PMEM_LEGACY=m CONFIG_DAX_DRIVER=y CONFIG_DAX=y CONFIG_DEV_DAX=m