From: Jingxian He hejingxian@huawei.com Date: Mon, 1 Mar 2021 17:35:32 +0800 Subject: [PATCH openEuler-21.03 1/2] mm: add pin memory method for checkpoint add restore
hulk inclusion category: feature bugzilla: 48159 CVE: N/A
We can use the checkpoint and restore in userspace (criu) method to dump and restore tasks when updating the kernel. Currently, criu needs to dump all memory data of tasks to files. When the memory size is very large (larger than 1G), dumping the data takes a very long time (more than 1 min).
By pinning the memory data of tasks and collecting the corresponding physical page mapping info during the checkpoint process, we can remap the physical pages to the restored tasks after upgrading the kernel. This pin memory method can restore the task data within one second.
The pin memory area info is saved in a reserved memblock, which remains usable across the kernel update process.
The pin memory driver provides the following ioctl commands for criu: 1) SET_PIN_MEM_AREA: Set the pin memory area, which can be remapped to the restore task. 2) CLEAR_PIN_MEM_AREA: Clear the pin memory area info, which enables the user to reset the pin data. 3) REMAP_PIN_MEM_AREA: Remap the pages of the pin memory to the restore task.
Signed-off-by: Jingxian He hejingxian@huawei.com Reviewed-by: Wenliang He hewenliang4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com --- arch/arm64/configs/openeuler_defconfig | 2 + arch/arm64/kernel/setup.c | 9 + arch/arm64/mm/init.c | 60 +++ drivers/char/Kconfig | 6 + drivers/char/Makefile | 1 + drivers/char/pin_memory.c | 208 ++++++++ include/linux/crash_core.h | 5 + include/linux/pin_mem.h | 78 +++ kernel/crash_core.c | 11 + mm/Kconfig | 8 + mm/Makefile | 1 + mm/huge_memory.c | 61 +++ mm/memory.c | 59 ++ mm/pin_mem.c | 950 +++++++++++++++++++++++++++++++++ 14 files changed, 1459 insertions(+) create mode 100644 drivers/char/pin_memory.c create mode 100644 include/linux/pin_mem.h create mode 100644 mm/pin_mem.c
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index c5271e7..76fda68 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1036,6 +1036,7 @@ CONFIG_FRAME_VECTOR=y # CONFIG_GUP_BENCHMARK is not set # CONFIG_READ_ONLY_THP_FOR_FS is not set CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_PIN_MEMORY=y # end of Memory Management options CONFIG_NET=y @@ -3282,6 +3283,7 @@ CONFIG_TCG_TIS_ST33ZP24_SPI=y # CONFIG_RANDOM_TRUST_CPU is not set # CONFIG_RANDOM_TRUST_BOOTLOADER is not set +CONFIG_PIN_MEMORY_DEV=m # # I2C support diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index c1f1fb9..5e282d3 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -50,6 +50,9 @@ #include <asm/efi.h> #include <asm/xen/hypervisor.h> #include <asm/mmu_context.h> +#ifdef CONFIG_PIN_MEMORY +#include <linux/pin_mem.h> +#endif static int num_standard_resources; static struct resource *standard_resources; @@ -260,6 +263,12 @@ static void __init request_standard_resources(void) quick_kexec_res.end <= res->end) request_resource(res, &quick_kexec_res); #endif +#ifdef CONFIG_PIN_MEMORY + if (pin_memory_resource.end && /* end stays 0 unless a pin memory range was reserved */ + pin_memory_resource.start >= res->start && + pin_memory_resource.end <= res->end) /* range must lie entirely inside res */ + request_resource(res, &pin_memory_resource); +#endif } } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f3e5a66..8ab5aac 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -42,6 +42,9 @@ #include <linux/sizes.h> #include <asm/tlb.h> #include <asm/alternative.h> +#ifdef CONFIG_PIN_MEMORY +#include <linux/pin_mem.h> +#endif #define ARM64_ZONE_DMA_BITS 30 @@ -78,6 +81,55 @@ static void __init reserve_crashkernel(void) */ #define MAX_USABLE_RANGES 2 +#ifdef CONFIG_PIN_MEMORY +struct resource pin_memory_resource = { /* start/end remain 0 until reserve_pin_memory_res() reserves a range */ + .name = "Pin memory", + .start = 0, + .end = 0, + .flags = IORESOURCE_MEM, + .desc = IORES_DESC_RESERVED +}; + +static void __init
reserve_pin_memory_res(void) +{ + unsigned long long mem_start, mem_len; + int ret; + + ret = parse_pin_memory(boot_command_line, memblock_phys_mem_size(), + &mem_len, &mem_start); + if (ret || !mem_len) /* nothing to reserve: parsing failed or the size is zero */ + return; + + mem_len = PAGE_ALIGN(mem_len); + + if (!memblock_is_region_memory(mem_start, mem_len)) { + pr_warn("cannot reserve for pin memory: region is not memory!\n"); + return; + } + + if (memblock_is_region_reserved(mem_start, mem_len)) { + pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n"); + return; + } + + if (!IS_ALIGNED(mem_start, SZ_2M)) { + pr_warn("cannot reserve for pin memory: base address is not 2MB aligned\n"); + return; + } + + memblock_reserve(mem_start, mem_len); + pr_debug("pin memory resource reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + mem_start, mem_start + mem_len, mem_len >> 20); + + pin_memory_resource.start = mem_start; + pin_memory_resource.end = mem_start + mem_len - 1; /* end is inclusive */ +} +#else +static void __init reserve_pin_memory_res(void) +{ +} +#endif /* CONFIG_PIN_MEMORY */ + #ifdef CONFIG_CRASH_DUMP static int __init early_init_dt_scan_elfcorehdr(unsigned long node, const char *uname, int depth, void *data) @@ -455,6 +507,8 @@ void __init arm64_memblock_init(void) reserve_park_mem(); #endif + reserve_pin_memory_res(); + reserve_elfcorehdr(); high_memory = __va(memblock_end_of_DRAM() - 1) + 1; @@ -583,6 +637,12 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ memblock_free_all(); +#ifdef CONFIG_PIN_MEMORY + /* hand the reserved pin memory range (start address and byte length) to the pin memory core */ + init_reserve_page_map((unsigned long)pin_memory_resource.start, + (unsigned long)(pin_memory_resource.end - pin_memory_resource.start + 1)); +#endif + mem_init_print_info(NULL); /* diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index d229a2d..fbb94b8 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -496,3 +496,9 @@ config RANDOM_TRUST_BOOTLOADER booloader is trustworthy so it will be added to the
kernel's entropy pool. Otherwise, say N here so it will be regarded as device input that only mixes the entropy pool. + +config PIN_MEMORY_DEV + tristate "/dev/pinmem character device" + default m + help + pin memory driver diff --git a/drivers/char/Makefile b/drivers/char/Makefile index ffce287..71d76fd 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -47,3 +47,4 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_XILLYBUS) += xillybus/ obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o obj-$(CONFIG_ADI) += adi.o +obj-$(CONFIG_PIN_MEMORY_DEV) += pin_memory.o diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c new file mode 100644 index 0000000..f46e056 --- /dev/null +++ b/drivers/char/pin_memory.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * Pin memory driver for checkpoint and restore. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm_types.h> +#include <linux/processor.h> +#include <uapi/asm-generic/ioctl.h> +#include <uapi/asm-generic/mman-common.h> +#include <uapi/asm/setup.h> +#include <linux/pin_mem.h> +#include <linux/sched/mm.h> + +#define MAX_PIN_MEM_AREA_NUM 16 /* capacity of pin_mem_area_set.mem_area[] */ +struct _pin_mem_area { + unsigned long virt_start; + unsigned long virt_end; +}; + +struct pin_mem_area_set { /* argument of SET_PIN_MEM_AREA, copied in from user space */ + unsigned int pid; + unsigned int area_num; /* number of mem_area[] entries in use */ + struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; +}; + +#define PIN_MEM_MAGIC 0x59 +#define _SET_PIN_MEM_AREA 1 +#define _CLEAR_PIN_MEM_AREA 2 +#define _REMAP_PIN_MEM_AREA 3 +#define _FINISH_PIN_MEM_DUMP 4 +#define _PIN_MEM_IOC_MAX_NR 4 /* highest command number accepted by the ioctl handler */ +#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) +#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC,
_CLEAR_PIN_MEM_AREA, int) +#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) +#define FINISH_PIN_MEM_DUMP _IOW(PIN_MEM_MAGIC, _FINISH_PIN_MEM_DUMP, int) +static int set_pin_mem(struct pin_mem_area_set *pmas) /* pins every area listed in pmas for the task pmas->pid */ +{ + int i; + int ret = 0; + struct _pin_mem_area *pma; + struct mm_struct *mm; + struct task_struct *task; + struct pid *pid_s; + + pid_s = find_get_pid(pmas->pid); + if (!pid_s) { + pr_warn("Get pid struct fail:%d.\n", pmas->pid); + return -EFAULT; + } + rcu_read_lock(); + task = pid_task(pid_s, PIDTYPE_PID); + mm = task ? get_task_mm(task) : NULL; /* get_task_mm() yields NULL for a task without an mm */ + if (!mm) { /* no task or no mm: bail out before any mmput(mm) can see NULL */ + pr_warn("Get task struct fail:%d.\n", pmas->pid); + goto fail; + } + for (i = 0; i < pmas->area_num; i++) { + pma = &(pmas->mem_area[i]); + ret = pin_mem_area(task, mm, pma->virt_start, pma->virt_end); + if (ret) { + mmput(mm); + goto fail; + } + } + mmput(mm); + rcu_read_unlock(); + put_pid(pid_s); + return ret; + +fail: + rcu_read_unlock(); + put_pid(pid_s); + return -EFAULT; +} + +static int set_pin_mem_area(unsigned long arg) /* SET_PIN_MEM_AREA: copy the request from user space and validate area_num */ +{ + struct pin_mem_area_set pmas; + void __user *buf = (void __user *)arg; + + if (!access_ok(buf, sizeof(pmas))) + return -EFAULT; + if (copy_from_user(&pmas, buf, sizeof(pmas))) + return -EINVAL; + if (pmas.area_num > MAX_PIN_MEM_AREA_NUM) { /* mem_area[] holds at most MAX_PIN_MEM_AREA_NUM entries */ + pr_warn("Input area_num is too large.\n"); + return -EINVAL; + } + + return set_pin_mem(&pmas); +} + +static int pin_mem_remap(unsigned long arg) /* REMAP_PIN_MEM_AREA: arg points to the int pid of the task to remap into */ +{ + int pid; + struct task_struct *task; + struct mm_struct *mm; + vm_fault_t ret; + void __user *buf = (void __user *)arg; + struct pid *pid_s; + + if (!access_ok(buf, sizeof(int))) + return -EINVAL; + if (copy_from_user(&pid, buf, sizeof(int))) + return -EINVAL; + + pid_s = find_get_pid(pid); + if (!pid_s) { + pr_warn("Get pid struct fail:%d.\n", pid); + return -EINVAL; + } + rcu_read_lock(); + task = pid_task(pid_s, PIDTYPE_PID); + mm = task ? get_task_mm(task) : NULL; /* get_task_mm() yields NULL for a task without an mm */ + if (!mm) { /* no task or no mm: bail out before do_mem_remap()/mmput() can see NULL */ + pr_warn("Get task struct fail:%d.\n", pid); + goto fault; + } + ret = do_mem_remap(pid, mm); + if (ret)
{ + pr_warn("Handle pin memory remap fail.\n"); + mmput(mm); + goto fault; + } + mmput(mm); + rcu_read_unlock(); + put_pid(pid_s); + return 0; + +fault: + rcu_read_unlock(); + put_pid(pid_s); + return -EFAULT; +} + +static long pin_memory_ioctl(struct file *file, unsigned int cmd, unsigned long arg) /* dispatches the four pin memory commands */ +{ + long ret = 0; + + if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC) /* not one of this driver's commands */ + return -EINVAL; + if (_IOC_NR(cmd) > _PIN_MEM_IOC_MAX_NR) /* command number above the highest one defined */ + return -EINVAL; + + switch (cmd) { + case SET_PIN_MEM_AREA: + ret = set_pin_mem_area(arg); + break; + case CLEAR_PIN_MEM_AREA: + clear_pin_memory_record(); /* no result is assigned, so ret stays 0 */ + break; + case REMAP_PIN_MEM_AREA: + ret = pin_mem_remap(arg); + break; + case FINISH_PIN_MEM_DUMP: + ret = finish_pin_mem_dump(); + break; + default: + return -EINVAL; + } + return ret; +} + +static const struct file_operations pin_memory_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = pin_memory_ioctl, + .compat_ioctl = pin_memory_ioctl, /* the same handler is used for compat callers */ +}; + +static struct miscdevice pin_memory_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "pinmem", + .fops = &pin_memory_fops, +}; + +static int pin_memory_init(void) /* module entry: registers the misc device */ +{ + int err = misc_register(&pin_memory_miscdev); + + if (!err) + pr_info("pin_memory init\n"); + else + pr_warn("pin_memory init failed!\n"); + return err; +} + +static void pin_memory_exit(void) /* module exit: removes the misc device */ +{ + misc_deregister(&pin_memory_miscdev); + pr_info("pin_memory ko exists!\n"); +} + +module_init(pin_memory_init); +module_exit(pin_memory_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Euler"); +MODULE_DESCRIPTION("pin memory"); diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index fc0ef33..30f0df3 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -87,4 +87,9 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); +#ifdef CONFIG_PIN_MEMORY +int __init parse_pin_memory(char *cmdline,
unsigned long long system_ram, + unsigned long long *pin_size, unsigned long long *pin_base); +#endif + #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h new file mode 100644 index 0000000..bc8b03e --- /dev/null +++ b/include/linux/pin_mem.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * Provide the pin memory method for checkpoint and restore task. + */ +#ifndef _LINUX_PIN_MEMORY_H +#define _LINUX_PIN_MEMORY_H + +#ifdef CONFIG_PIN_MEMORY +#include <linux/errno.h> +#include <linux/mm_types.h> +#include <linux/err.h> +#ifdef CONFIG_ARM64 +#include <linux/ioport.h> +#endif + +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) + +#define COLLECT_PAGES_FINISH 0 +#define COLLECT_PAGES_NEED_CONTINUE 1 +#define COLLECT_PAGES_FAIL -1 + +#define COMPOUND_PAD_MASK 0xffffffff +#define COMPOUND_PAD_START 0x88 +#define COMPOUND_PAD_DELTA 0x40 +#define LIST_POISON4 0xdead000000000400 +#define PAGE_FLAGS_CHECK_RESERVED (1UL << PG_reserved) +#define SHA256_DIGEST_SIZE 32 +#define next_pme(pme) ((unsigned long *)(pme + 1) + pme->nr_pages) /* address just past the nr_pages addresses that follow the pme header */ +#define PIN_MEM_DUMP_MAGIC 0xfeab000000001acd +struct page_map_entry { /* variable-length record: header followed by nr_pages physical addresses */ + unsigned long virt_addr; + unsigned int nr_pages; /* number of valid phy_addr_array[] slots */ + unsigned int is_huge_page; + unsigned long redirect_start; + unsigned long phy_addr_array[0]; +}; + +struct page_map_info { /* one record per pinned pid */ + int pid; + int pid_reserved; + unsigned int entry_num; + int disable_free_page; + struct page_map_entry *pme; +}; + +struct pin_mem_dump_info { /* header placed at the start of the reserved pin memory area */ + char sha_digest[SHA256_DIGEST_SIZE]; + unsigned long magic; + unsigned int pin_pid_num; + struct page_map_info pmi_array[0]; +}; + +struct redirect_info { + unsigned int redirect_pages; /* number of valid redirect_index[] slots */ + unsigned int redirect_index[0]; +}; + +extern struct page_map_info *get_page_map_info(int pid); +extern struct page_map_info *create_page_map_info(int pid); +extern vm_fault_t do_mem_remap(int pid, struct mm_struct *mm); +extern vm_fault_t
do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, struct page *page); +extern void clear_pin_memory_record(void); +extern int pin_mem_area(struct task_struct *task, struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr); +extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, struct page *page); +extern int finish_pin_mem_dump(void); + +/* resource describing the reserved pin memory range (arm64 only) */ +#ifdef CONFIG_ARM64 +extern struct resource pin_memory_resource; +#endif +extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size); + +#endif /* CONFIG_PIN_MEMORY */ +#endif /* _LINUX_PIN_MEMORY_H */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index bfed474..2407de3 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -450,6 +450,17 @@ void __init reserve_crashkernel(void) } #endif /* CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL */ +#ifdef CONFIG_PIN_MEMORY +int __init parse_pin_memory(char *cmdline, + unsigned long long system_ram, + unsigned long long *pin_size, + unsigned long long *pin_base) +{ + return __parse_crashkernel(cmdline, system_ram, pin_size, pin_base, + "pinmemory=", NULL); /* reuses the crashkernel parser with the "pinmemory=" parameter name */ +} +#endif + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { diff --git a/mm/Kconfig b/mm/Kconfig index 390165f..930dc13 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -859,4 +859,12 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +config PIN_MEMORY + bool "Support for pin memory" + depends on CHECKPOINT_RESTORE + help + Say y here to enable the pin memory feature for checkpoint + and restore. We can pin the memory data of tasks and collect + the corresponding physical pages mapping info in checkpoint, + and remap the physical pages to restore tasks in restore.
endmenu diff --git a/mm/Makefile b/mm/Makefile index d73aed0..4963827 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -120,3 +120,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_PIN_MEMORY) += pin_mem.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0bc4a2c..8a11d30 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2996,3 +2996,64 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) update_mmu_cache_pmd(vma, address, pvmw->pmd); } #endif + +#ifdef CONFIG_PIN_MEMORY +vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, struct page *page) /* maps the caller-supplied huge page at address through a huge pmd entry */ +{ + gfp_t gfp; + pgtable_t pgtable; + spinlock_t *ptl; + pmd_t entry; + vm_fault_t ret = 0; + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) + return VM_FAULT_OOM; + gfp = alloc_hugepage_direct_gfpmask(vma); + prep_transhuge_page(page); + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { /* charge failed: drop the page reference and report fallback */ + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); + return VM_FAULT_FALLBACK; + } + cgroup_throttle_swaprate(page, gfp); + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } + __SetPageUptodate(page); + ptl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(!pmd_none(*pmd))) { /* pmd already populated: page and pgtable are released, ret is still 0 here */ + goto unlock_release; + } else { + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + page_add_new_anon_rmap(page, vma, address, true); + lru_cache_add_inactive_or_unevictable(page, vma); + pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable); + set_pmd_at(vma->vm_mm, address, pmd, entry); + add_mm_counter(vma->vm_mm, MM_ANONPAGES,
HPAGE_PMD_NR); + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + } + + return 0; +unlock_release: + spin_unlock(ptl); +release: /* failure exit: free the page table if one was allocated and drop the page reference */ + if (pgtable) + pte_free(vma->vm_mm, pgtable); + put_page(page); + return ret; +} +#endif diff --git a/mm/memory.c b/mm/memory.c index 50632c4..7b7f1a7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5248,3 +5248,62 @@ void ptlock_free(struct page *page) kmem_cache_free(page_ptl_cachep, page->ptl); } #endif + +#ifdef CONFIG_PIN_MEMORY +vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, struct page *page) /* maps the caller-supplied page at address through a normal pte */ +{ + pte_t entry; + spinlock_t *ptl; + pte_t *pte; + vm_fault_t ret = 0; + + if (pte_alloc(vma->vm_mm, pmd)) + return VM_FAULT_OOM; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmd))) + return 0; + + /* The page is supplied by the caller; only the anon_vma has to be prepared here. */ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + goto oom_free_page; + cgroup_throttle_swaprate(page, GFP_KERNEL); + + __SetPageUptodate(page); + + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, + &ptl); + if (!pte_none(*pte)) { /* a pte is already installed at this address */ + ret = VM_FAULT_FALLBACK; + goto release; + } + + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address, false); + lru_cache_add_inactive_or_unevictable(page, vma); + + set_pte_at(vma->vm_mm, address, pte, entry); + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); +unlock: + pte_unmap_unlock(pte, ptl); + return ret; +release: /* drop the page reference, then leave through the common unlock path */ + put_page(page); + goto unlock; +oom_free_page: + put_page(page); +oom: + return VM_FAULT_OOM; +} +#endif diff --git a/mm/pin_mem.c b/mm/pin_mem.c new
file mode 100644 index 0000000..0a143b6 --- /dev/null +++ b/mm/pin_mem.c @@ -0,0 +1,950 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * Provide the pin memory method for checkpoint and restore task. + */ +#ifdef CONFIG_PIN_MEMORY +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/sched/cputime.h> +#include <linux/tick.h> +#include <linux/mm.h> +#include <linux/pin_mem.h> +#include <linux/idr.h> +#include <linux/page-isolation.h> +#include <linux/sched/mm.h> +#include <linux/ctype.h> +#include <linux/highmem.h> +#include <crypto/sha.h> + +#define MAX_PIN_PID_NUM 128 /* upper bound accepted for the max_pin_pid_num boot parameter */ +static DEFINE_SPINLOCK(page_map_entry_lock); + +struct pin_mem_dump_info *pin_mem_dump_start; +unsigned int pin_pid_num; +static unsigned int *pin_pid_num_addr; +static unsigned long __page_map_entry_start; +static unsigned long page_map_entry_end; +static struct page_map_info *user_space_reserve_start; +static struct page_map_entry *page_map_entry_start; +unsigned int max_pin_pid_num __read_mostly; +unsigned long redirect_space_size; +unsigned long redirect_space_start; +#define DEFAULT_REDIRECT_SPACE_SIZE 0x100000 + +static int __init setup_max_pin_pid_num(char *str) /* parses the "max_pin_pid_num" early parameter as a decimal number */ +{ + int ret = 0; + + if (!str) + goto out; + + ret = kstrtouint(str, 10, &max_pin_pid_num); +out: + if (ret) { + pr_warn("Unable to parse max pin pid num.\n"); + } else { + if (max_pin_pid_num > MAX_PIN_PID_NUM) { + max_pin_pid_num = 0; /* an over-limit value is discarded rather than clamped */ + pr_warn("Input max_pin_pid_num is too large.\n"); + } + } + return ret; +} +early_param("max_pin_pid_num", setup_max_pin_pid_num); + +static int __init setup_redirect_space_size(char *str) /* parses the "redirect_space_size" early parameter with memparse() */ +{ + if (!str) + goto out; + + redirect_space_size = memparse(str, NULL); +out: + if (!redirect_space_size) { /* missing or zero value: fall back to DEFAULT_REDIRECT_SPACE_SIZE */ + pr_warn("Unable to parse redirect space size, use the default value.\n"); + redirect_space_size = DEFAULT_REDIRECT_SPACE_SIZE; + } + return 0; +} +early_param("redirect_space_size",
setup_redirect_space_size); + +struct page_map_info *create_page_map_info(int pid) /* claims the next free page_map_info slot for pid; NULL when unavailable */ +{ + struct page_map_info *new; + + if (!user_space_reserve_start) + return NULL; + + if (pin_pid_num >= max_pin_pid_num) { + pr_warn("Pin pid num too large than max_pin_pid_num, fail create: %d!", pid); + return NULL; + } + new = (struct page_map_info *)(user_space_reserve_start + pin_pid_num); + new->pid = pid; + new->pme = NULL; + new->entry_num = 0; + new->pid_reserved = false; + new->disable_free_page = false; + (*pin_pid_num_addr)++; /* counter reached through pin_pid_num_addr is bumped together with pin_pid_num */ + pin_pid_num++; + return new; +} +EXPORT_SYMBOL_GPL(create_page_map_info); + +struct page_map_info *get_page_map_info(int pid) /* linear search of the slots in use for pid; NULL when not found */ +{ + int i; + + if (!user_space_reserve_start) + return NULL; + + for (i = 0; i < pin_pid_num; i++) { + if (user_space_reserve_start[i].pid == pid) + return &(user_space_reserve_start[i]); + } + return NULL; +} +EXPORT_SYMBOL_GPL(get_page_map_info); + +static struct page *find_head_page(struct page *page) /* walks backwards from page to the first PageBuddy page; NULL if an LRU page is met first */ +{ + struct page *p = page; + + while (!PageBuddy(p)) { + if (PageLRU(p)) + return NULL; + p--; + } + return p; +} + +static void spilt_page_area_left(struct zone *zone, struct free_area *area, struct page *page, + unsigned long size, int order) /* puts the first size pages starting at page back on the free lists in power-of-two pieces */ +{ + unsigned long cur_size = 1 << order; + unsigned long total_size = 0; + + while (size && cur_size > size) { + cur_size >>= 1; + order--; + area--; + if (cur_size <= size) { + list_add(&page[total_size].lru, &area->free_list[MIGRATE_MOVABLE]); + atomic_set(&(page[total_size]._mapcount), PAGE_BUDDY_MAPCOUNT_VALUE); + set_page_private(&page[total_size], order); + set_pageblock_migratetype(&page[total_size], MIGRATE_MOVABLE); + area->nr_free++; + total_size += cur_size; + size -= cur_size; + } + } +} + +static void spilt_page_area_right(struct zone *zone, struct free_area *area, struct page *page, + unsigned long size, int order) /* same as the left variant, but pieces are carved from the end of the range backwards */ +{ + unsigned long cur_size = 1 << order; + struct page *right_page, *head_page; + + right_page = page + size; + while (size && cur_size > size) { +
cur_size >>= 1; + order--; + area--; + if (cur_size <= size) { + head_page = right_page - cur_size; + list_add(&head_page->lru, &area->free_list[MIGRATE_MOVABLE]); + atomic_set(&(head_page->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE); + set_page_private(head_page, order); + set_pageblock_migratetype(head_page, MIGRATE_MOVABLE); + area->nr_free++; + size -= cur_size; + right_page = head_page; + } + } +} + +void reserve_page_from_buddy(unsigned long nr_pages, struct page *page) /* removes the free block containing page from its free list and returns the unused parts */ +{ + unsigned int current_order; + struct page *page_end; + struct free_area *area; + struct zone *zone; + struct page *head_page; + + head_page = find_head_page(page); + if (!head_page) { + pr_warn("Find page head fail."); + return; + } + current_order = head_page->private; /* presumably the buddy order of the free block - TODO confirm */ + page_end = head_page + (1 << current_order); + zone = page_zone(head_page); + area = &(zone->free_area[current_order]); + list_del(&head_page->lru); + atomic_set(&head_page->_mapcount, -1); + set_page_private(head_page, 0); + area->nr_free--; + if (head_page != page) /* pages in front of the requested range go back to the free lists */ + spilt_page_area_left(zone, area, head_page, + (unsigned long)(page - head_page), current_order); + page = page + nr_pages; + if (page < page_end) { /* pages behind the requested range go back to the free lists */ + spilt_page_area_right(zone, area, page, + (unsigned long)(page_end - page), current_order); + } else if (page > page_end) { + pr_warn("Find page end smaller than page."); + } +} + +static inline void reserve_user_normal_pages(struct page *page) /* takes a reference on page and pulls that single page out of the buddy lists */ +{ + atomic_inc(&page->_refcount); + reserve_page_from_buddy(1, page); +} + +static void init_huge_pmd_pages(struct page *head_page) /* sets up head and tail page fields for a PMD-order compound page */ +{ + int i = 0; + struct page *page = head_page; + + __set_bit(PG_head, &page->flags); + __set_bit(PG_active, &page->flags); + atomic_set(&page->_refcount, 1); + page++; + i++; + page->compound_head = (unsigned long)head_page + 1; + page->compound_dtor = HUGETLB_PAGE_DTOR + 1; + page->compound_order = HPAGE_PMD_ORDER; + page++; + i++; + page->compound_head = (unsigned long)head_page + 1; + i++; + INIT_LIST_HEAD(&(page->deferred_list)); + for
(; i < HPAGE_PMD_NR; i++) { /* remaining tail pages get the head pointer with its low bit set */ + page = head_page + i; + page->compound_head = (unsigned long)head_page + 1; + } +} + +static inline void reserve_user_huge_pmd_pages(struct page *page) /* takes a reference, pulls a PMD-order range out of the buddy lists and makes it a compound page */ +{ + atomic_inc(&page->_refcount); + reserve_page_from_buddy((1 << HPAGE_PMD_ORDER), page); + init_huge_pmd_pages(page); +} + +int reserve_user_map_pages_fail; + +void free_user_map_pages(unsigned int pid_index, unsigned int entry_index, unsigned int page_index) /* frees recorded pages up to, but not including, position (pid_index, entry_index, page_index) */ +{ + unsigned int i, j, index, order; + struct page_map_info *pmi; + struct page_map_entry *pme; + struct page *page; + unsigned long phy_addr; + + for (index = 0; index < pid_index; index++) { /* pids before pid_index: every entry and every page is visited */ + pmi = &(user_space_reserve_start[index]); + pme = pmi->pme; + for (i = 0; i < pmi->entry_num; i++) { + for (j = 0; j < pme->nr_pages; j++) { + order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; + phy_addr = pme->phy_addr_array[j]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { /* pages flagged PG_reserved are left alone */ + __free_pages(page, order); + pme->phy_addr_array[j] = 0; + } + } + } + pme = (struct page_map_entry *)next_pme(pme); + } + } + pmi = &(user_space_reserve_start[index]); /* index == pid_index here: only its first entry_index entries are visited in full */ + pme = pmi->pme; + for (i = 0; i < entry_index; i++) { + for (j = 0; j < pme->nr_pages; j++) { + order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; + phy_addr = pme->phy_addr_array[j]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[j] = 0; + } + } + } + pme = (struct page_map_entry *)next_pme(pme); + } + for (j = 0; j < page_index; j++) { /* entry at entry_index: only its first page_index pages are visited */ + order = pme->is_huge_page ?
HPAGE_PMD_ORDER : 0; + phy_addr = pme->phy_addr_array[j]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[j] = 0; + } + } + } +} + +bool check_redirect_end_valid(struct redirect_info *redirect_start, + unsigned long max_redirect_page_num) /* true when a redirect_info with max_redirect_page_num indexes, placed at redirect_start, fits in the redirect space */ +{ + unsigned long redirect_end; + + redirect_end = ((unsigned long)(redirect_start + 1) + + max_redirect_page_num * sizeof(unsigned int)); /* header plus one unsigned int per possible index */ + if (redirect_end > redirect_space_start + redirect_space_size) + return false; + return true; /* fits: the caller may keep using redirect_start */ +} + +static void reserve_user_space_map_pages(void) /* walks every recorded page of every pinned pid and reserves it or records it as redirected */ +{ + struct page_map_info *pmi; + struct page_map_entry *pme; + unsigned int i, j, index; + struct page *page; + unsigned long flags; + unsigned long phy_addr; + unsigned long redirect_pages = 0; + struct redirect_info *redirect_start = (struct redirect_info *)redirect_space_start; + + if (!user_space_reserve_start || !redirect_start) + return; + spin_lock_irqsave(&page_map_entry_lock, flags); + for (index = 0; index < pin_pid_num; index++) { + pmi = &(user_space_reserve_start[index]); + pme = pmi->pme; + for (i = 0; i < pmi->entry_num; i++) { + redirect_pages = 0; + if (!check_redirect_end_valid(redirect_start, pme->nr_pages)) /* no room left for this entry's redirect table */ + redirect_start = NULL; + for (j = 0; j < pme->nr_pages; j++) { + phy_addr = pme->phy_addr_array[j]; + if (!phy_addr) + continue; + page = phys_to_page(phy_addr); + if (atomic_read(&page->_refcount)) { + if ((page->flags & PAGE_FLAGS_CHECK_RESERVED) + && !pme->redirect_start) + pme->redirect_start = + (unsigned long)redirect_start; + if (redirect_start && + (page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + redirect_start->redirect_index[redirect_pages] = j; + redirect_pages++; + continue; + } else { + reserve_user_map_pages_fail = 1; + pr_warn("Page %pK refcount %d large than zero, no need reserve.\n", + page, atomic_read(&page->_refcount)); + goto free_pages; + } + } + if (!pme->is_huge_page) +
reserve_user_normal_pages(page); + else + reserve_user_huge_pmd_pages(page); + } + pme = (struct page_map_entry *)next_pme(pme); + if (redirect_pages && redirect_start) { + redirect_start->redirect_pages = redirect_pages; + redirect_start = (struct redirect_info *)( + (unsigned long)(redirect_start + 1) + + redirect_start->redirect_pages * sizeof(unsigned int)); /* next table starts right after the indexes just written */ + } + } + } + spin_unlock_irqrestore(&page_map_entry_lock, flags); /* pairs with the spin_lock_irqsave() taken on entry */ + return; +free_pages: + free_user_map_pages(index, i, j); + spin_unlock_irqrestore(&page_map_entry_lock, flags); /* error exit must restore the saved IRQ state as well */ +} + + +int calculate_pin_mem_digest(struct pin_mem_dump_info *pmdi, char *digest) /* SHA-256 over the dump header (digest field excluded) and the pmi_array records in use */ +{ + int i; + struct sha256_state sctx; + + if (!digest) /* NULL means: store the result in the header itself */ + digest = pmdi->sha_digest; + sha256_init(&sctx); + sha256_update(&sctx, (unsigned char *)(&(pmdi->magic)), + sizeof(struct pin_mem_dump_info) - SHA256_DIGEST_SIZE); + for (i = 0; i < pmdi->pin_pid_num; i++) { + sha256_update(&sctx, (unsigned char *)(&(pmdi->pmi_array[i])), + sizeof(struct page_map_info)); + } + sha256_final(&sctx, digest); + return 0; +} + +static int check_sha_digest(struct pin_mem_dump_info *pmdi) /* recomputes the digest and compares it with the stored one; 0 on match */ +{ + int ret = 0; + char digest[SHA256_DIGEST_SIZE] = {0}; + + ret = calculate_pin_mem_digest(pmdi, digest); + if (ret) { + pr_warn("calculate pin mem digest fail:%d\n", ret); + return ret; + } + if (memcmp(pmdi->sha_digest, digest, SHA256_DIGEST_SIZE)) { + pr_warn("pin mem dump info sha256 digest match error!\n"); + return -EFAULT; + } + return ret; +} + +/* + * The whole page map entry collect process must be Sequentially. + * The user_space_reserve_start points to the first page map info for + * the first dump task. And the page_map_entry_start points to + * the first page map entry of the first dump vma.
+ */ +static void __init init_page_map_info(struct pin_mem_dump_info *pmdi, unsigned long map_len) +{ + if (pin_mem_dump_start || !max_pin_pid_num) { + pr_warn("pin page map already init or max_pin_pid_num not set.\n"); + return; + } + if (map_len < sizeof(struct pin_mem_dump_info) + + max_pin_pid_num * sizeof(struct page_map_info) + redirect_space_size) { + pr_warn("pin memory reserved memblock too small.\n"); + return; + } + if ((pmdi->magic != PIN_MEM_DUMP_MAGIC) || (pmdi->pin_pid_num > max_pin_pid_num) || + check_sha_digest(pmdi)) /* header not valid: start from an empty record */ + memset(pmdi, 0, sizeof(struct pin_mem_dump_info)); + pin_mem_dump_start = pmdi; + pin_pid_num = pmdi->pin_pid_num; + pr_info("pin_pid_num: %d\n", pin_pid_num); + pin_pid_num_addr = &(pmdi->pin_pid_num); + user_space_reserve_start = + (struct page_map_info *)pmdi->pmi_array; + page_map_entry_start = + (struct page_map_entry *)(user_space_reserve_start + max_pin_pid_num); + page_map_entry_end = (unsigned long)pmdi + map_len - redirect_space_size; /* the last redirect_space_size bytes are the redirect space */ + redirect_space_start = page_map_entry_end; + if (pin_pid_num > 0) + reserve_user_space_map_pages(); +} + +int finish_pin_mem_dump(void) /* stamps the magic and stores the SHA-256 digest in the dump header */ +{ + int ret; + + pin_mem_dump_start->magic = PIN_MEM_DUMP_MAGIC; /* NOTE(review): pin_mem_dump_start is not checked for NULL before this store */ + memset(pin_mem_dump_start->sha_digest, 0, SHA256_DIGEST_SIZE); + ret = calculate_pin_mem_digest(pin_mem_dump_start, NULL); + if (ret) { + pr_warn("calculate pin mem digest fail:%d\n", ret); + return ret; + } + return ret; +} + +int collect_pmd_huge_pages(struct task_struct *task, + unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme) /* records one physical address per huge page in [start_addr, end_addr) */ +{ + long res; + int index = 0; + unsigned long start = start_addr; + struct page *temp_page; + + while (start < end_addr) { + temp_page = NULL; + res = get_user_pages_remote(task->mm, start, 1, + FOLL_TOUCH | FOLL_GET, &temp_page, NULL, NULL); + if (res <= 0) { /* 0 pages or a negative errno: temp_page must not be used */ + pr_warn("Get huge page for addr(%lx) fail.", start); + return COLLECT_PAGES_FAIL; + } + if (PageHead(temp_page)) { + start += HPAGE_PMD_SIZE; + pme->phy_addr_array[index] =
page_to_phys(temp_page); + index++; + } else { + pme->nr_pages = index; + atomic_dec(&((temp_page)->_refcount)); + return COLLECT_PAGES_NEED_CONTINUE; + } + } + pme->nr_pages = index; + return COLLECT_PAGES_FINISH; +} + +int collect_normal_pages(struct task_struct *task, + unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme) +{ + int res; + unsigned long next; + unsigned long i, nr_pages; + struct page *tmp_page; + unsigned long *phy_addr_array = pme->phy_addr_array; + struct page **page_array = (struct page **)pme->phy_addr_array; + + next = (start_addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; + next = (next > end_addr) ? end_addr : next; + pme->nr_pages = 0; + while (start_addr < next) { + nr_pages = (PAGE_ALIGN(next) - start_addr) / PAGE_SIZE; + res = get_user_pages_remote(task->mm, start_addr, 1, + FOLL_TOUCH | FOLL_GET, &tmp_page, NULL, NULL); + if (!res) { + pr_warn("Get user page of %lx fail.\n", start_addr); + return COLLECT_PAGES_FAIL; + } + if (PageHead(tmp_page)) { + atomic_dec(&(tmp_page->_refcount)); + return COLLECT_PAGES_NEED_CONTINUE; + } + atomic_dec(&(tmp_page->_refcount)); + if (PageTail(tmp_page)) { + start_addr = next; + pme->virt_addr = start_addr; + next = (next + HPAGE_PMD_SIZE) > end_addr ? + end_addr : (next + HPAGE_PMD_SIZE); + continue; + } + res = get_user_pages_remote(task->mm, start_addr, nr_pages, + FOLL_TOUCH | FOLL_GET, page_array, NULL, NULL); + if (!res) { + pr_warn("Get user pages of %lx fail.\n", start_addr); + return COLLECT_PAGES_FAIL; + } + for (i = 0; i < nr_pages; i++) + phy_addr_array[i] = page_to_phys(page_array[i]); + pme->nr_pages += nr_pages; + page_array += nr_pages; + phy_addr_array += nr_pages; + start_addr = next; + next = (next + HPAGE_PMD_SIZE) > end_addr ? end_addr : (next + HPAGE_PMD_SIZE); + } + return COLLECT_PAGES_FINISH; +} + +/* Users make sure that the pin memory belongs to anonymous vma. 
*/ +int pin_mem_area(struct task_struct *task, struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr) +{ + int pid, ret; + int is_huge_page = false; + unsigned int page_size; + unsigned long nr_pages, flags; + struct page_map_entry *pme; + struct page_map_info *pmi; + struct vm_area_struct *vma; + unsigned long i; + struct page *tmp_page; + + if (!page_map_entry_start + || !task || !mm + || start_addr >= end_addr) + return -EFAULT; + + pid = task->pid; + spin_lock_irqsave(&page_map_entry_lock, flags); + nr_pages = ((end_addr - start_addr) / PAGE_SIZE); + if ((unsigned long)page_map_entry_start + nr_pages * sizeof(struct page *) >= + page_map_entry_end) { + pr_warn("Page map entry use up!\n"); + ret = -EFAULT; + goto finish; + } + vma = find_extend_vma(mm, start_addr); + if (!vma) { + pr_warn("Find no match vma!\n"); + ret = -EFAULT; + goto finish; + } + if (start_addr == (start_addr & HPAGE_PMD_MASK) && + transparent_hugepage_enabled(vma)) { + page_size = HPAGE_PMD_SIZE; + is_huge_page = true; + } else { + page_size = PAGE_SIZE; + } + pme = page_map_entry_start; + pme->virt_addr = start_addr; + pme->redirect_start = 0; + pme->is_huge_page = is_huge_page; + memset(pme->phy_addr_array, 0, nr_pages * sizeof(unsigned long)); + down_write(&mm->mmap_lock); + if (!is_huge_page) { + ret = collect_normal_pages(task, start_addr, end_addr, pme); + if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) { + if (ret == COLLECT_PAGES_FINISH) { + ret = 0; + up_write(&mm->mmap_lock); + goto finish; + } + pme->is_huge_page = true; + page_size = HPAGE_PMD_SIZE; + ret = collect_pmd_huge_pages(task, pme->virt_addr, end_addr, pme); + } + } else { + ret = collect_pmd_huge_pages(task, start_addr, end_addr, pme); + if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) { + if (ret == COLLECT_PAGES_FINISH) { + ret = 0; + up_write(&mm->mmap_lock); + goto finish; + } + pme->is_huge_page = false; + page_size = PAGE_SIZE; + ret = collect_normal_pages(task, pme->virt_addr, end_addr, pme); 
+ } + } + up_write(&mm->mmap_lock); + if (ret == COLLECT_PAGES_FAIL) { + ret = -EFAULT; + goto finish; + } + + /* check for zero pages */ + for (i = 0; i < pme->nr_pages; i++) { + tmp_page = phys_to_page(pme->phy_addr_array[i]); + if (!pme->is_huge_page) { + if (page_to_pfn(tmp_page) == my_zero_pfn(pme->virt_addr + i * PAGE_SIZE)) + pme->phy_addr_array[i] = 0; + } else if (is_huge_zero_page(tmp_page)) + pme->phy_addr_array[i] = 0; + } + + page_map_entry_start = (struct page_map_entry *)(next_pme(pme)); + pmi = get_page_map_info(pid); + if (!pmi) + pmi = create_page_map_info(pid); + if (!pmi) { + pr_warn("Create page map info fail for pid: %d!\n", pid); + ret = -EFAULT; + goto finish; + } + if (!pmi->pme) + pmi->pme = pme; + pmi->entry_num++; + spin_unlock_irqrestore(&page_map_entry_lock, flags); + if (ret == COLLECT_PAGES_NEED_CONTINUE) + ret = pin_mem_area(task, mm, pme->virt_addr + pme->nr_pages * page_size, end_addr); + return ret; +finish: + spin_unlock_irqrestore(&page_map_entry_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(pin_mem_area); + +vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma, + struct page_map_entry *pme) +{ + int ret; + unsigned int j, i; + pgd_t *pgd; + p4d_t *p4d; + pmd_t *pmd; + pud_t *pud; + struct page *page, *new; + unsigned long address; + unsigned long phy_addr; + unsigned int redirect_pages = 0; + struct redirect_info *redirect_start; + + redirect_start = (struct redirect_info *)pme->redirect_start; + for (j = 0; j < pme->nr_pages; j++) { + address = pme->virt_addr + j * PAGE_SIZE; + phy_addr = pme->phy_addr_array[j]; + if (!phy_addr) + continue; + page = phys_to_page(phy_addr); + if (page_to_pfn(page) == my_zero_pfn(address)) { + pme->phy_addr_array[j] = 0; + continue; + } + pme->phy_addr_array[j] = 0; + if (redirect_start && (redirect_pages < redirect_start->redirect_pages) && + (j == redirect_start->redirect_index[redirect_pages])) { + new = alloc_zeroed_user_highpage_movable(vma, address); + if 
(!new) { + pr_warn("Redirect alloc page fail\n"); + continue; + } + copy_page(page_to_virt(new), phys_to_virt(phy_addr)); + page = new; + redirect_pages++; + } + page->mapping = NULL; + pgd = pgd_offset(mm, address); + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) { + ret = VM_FAULT_OOM; + goto free; + } + pud = pud_alloc(mm, p4d, address); + if (!pud) { + ret = VM_FAULT_OOM; + goto free; + } + pmd = pmd_alloc(mm, pud, address); + if (!pmd) { + ret = VM_FAULT_OOM; + goto free; + } + ret = do_anon_page_remap(vma, address, pmd, page); + if (ret) + goto free; + } + return 0; +free: + for (i = j; i < pme->nr_pages; i++) { + phy_addr = pme->phy_addr_array[i]; + if (phy_addr) { + __free_page(phys_to_page(phy_addr)); + pme->phy_addr_array[i] = 0; + } + } + return ret; +} + +static inline gfp_t get_hugepage_gfpmask(struct vm_area_struct *vma) +{ + const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? 
__GFP_DIRECT_RECLAIM : + 0); + return GFP_TRANSHUGE_LIGHT; +} + +vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma, + struct page_map_entry *pme) +{ + int ret; + unsigned int j, i; + pgd_t *pgd; + p4d_t *p4d; + pmd_t *pmd; + pud_t *pud; + gfp_t gfp; + struct page *page, *new; + unsigned long address; + unsigned long phy_addr; + unsigned int redirect_pages = 0; + struct redirect_info *redirect_start; + + redirect_start = (struct redirect_info *)pme->redirect_start; + for (j = 0; j < pme->nr_pages; j++) { + address = pme->virt_addr + j * HPAGE_PMD_SIZE; + phy_addr = pme->phy_addr_array[j]; + if (!phy_addr) + continue; + page = phys_to_page(phy_addr); + if (is_huge_zero_page(page)) { + pme->phy_addr_array[j] = 0; + continue; + } + pme->phy_addr_array[j] = 0; + if (redirect_start && (redirect_pages < redirect_start->redirect_pages) && + (j == redirect_start->redirect_index[redirect_pages])) { + gfp = get_hugepage_gfpmask(vma); + new = alloc_hugepage_vma(gfp, vma, address, HPAGE_PMD_ORDER); + if (!new) { + pr_warn("Redirect alloc huge page fail\n"); + continue; + } + memcpy(page_to_virt(new), phys_to_virt(phy_addr), HPAGE_PMD_SIZE); + page = new; + redirect_pages++; + } + pgd = pgd_offset(mm, address); + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) { + ret = VM_FAULT_OOM; + goto free; + } + pud = pud_alloc(mm, p4d, address); + if (!pud) { + ret = VM_FAULT_OOM; + goto free; + } + pmd = pmd_alloc(mm, pud, address); + if (!pmd) { + ret = VM_FAULT_OOM; + goto free; + } + ret = do_anon_huge_page_remap(vma, address, pmd, page); + if (ret) + goto free; + } + return 0; +free: + for (i = j; i < pme->nr_pages; i++) { + phy_addr = pme->phy_addr_array[i]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, HPAGE_PMD_ORDER); + pme->phy_addr_array[i] = 0; + } + } + } + return ret; +} + +static void free_unmap_pages(struct page_map_info *pmi, + struct page_map_entry *pme, + 
unsigned int index) +{ + unsigned int i, j; + unsigned long phy_addr; + unsigned int order; + struct page *page; + + pme = (struct page_map_entry *)(next_pme(pme)); + for (i = index; i < pmi->entry_num; i++) { + for (j = 0; j < pme->nr_pages; j++) { + phy_addr = pme->phy_addr_array[i]; + if (phy_addr) { + page = phys_to_page(phy_addr); + order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[i] = 0; + } + } + } + pme = (struct page_map_entry *)(next_pme(pme)); + } +} + +vm_fault_t do_mem_remap(int pid, struct mm_struct *mm) +{ + unsigned int i = 0; + vm_fault_t ret = 0; + struct vm_area_struct *vma; + struct page_map_info *pmi; + struct page_map_entry *pme; + unsigned long flags; + + if (reserve_user_map_pages_fail) + return -EFAULT; + pmi = get_page_map_info(pid); + if (!pmi) + return -EFAULT; + + spin_lock_irqsave(&page_map_entry_lock, flags); + pmi->disable_free_page = true; + spin_unlock(&page_map_entry_lock); + down_write(&mm->mmap_lock); + pme = pmi->pme; + vma = mm->mmap; + while ((i < pmi->entry_num) && (vma != NULL)) { + if (pme->virt_addr >= vma->vm_start && pme->virt_addr < vma->vm_end) { + i++; + if (!vma_is_anonymous(vma)) { + pme = (struct page_map_entry *)(next_pme(pme)); + continue; + } + if (!pme->is_huge_page) { + ret = remap_normal_pages(mm, vma, pme); + if (ret < 0) + goto free; + } else { + ret = remap_huge_pmd_pages(mm, vma, pme); + if (ret < 0) + goto free; + } + pme = (struct page_map_entry *)(next_pme(pme)); + } else { + vma = vma->vm_next; + } + } + up_write(&mm->mmap_lock); + return 0; +free: + free_unmap_pages(pmi, pme, i); + up_write(&mm->mmap_lock); + return ret; +} +EXPORT_SYMBOL_GPL(do_mem_remap); + +#if defined(CONFIG_ARM64) +void init_reserve_page_map(unsigned long map_addr, unsigned long map_size) +{ + void *addr; + + if (!map_addr || !map_size) + return; + addr = phys_to_virt(map_addr); + init_page_map_info((struct pin_mem_dump_info 
*)addr, map_size); +} +#else +void init_reserve_page_map(unsigned long map_addr, unsigned long map_size) +{ +} +#endif + +static void free_all_reserved_pages(void) +{ + unsigned int i, j, index, order; + struct page_map_info *pmi; + struct page_map_entry *pme; + struct page *page; + unsigned long phy_addr; + + if (!user_space_reserve_start || reserve_user_map_pages_fail) + return; + + for (index = 0; index < pin_pid_num; index++) { + pmi = &(user_space_reserve_start[index]); + if (pmi->disable_free_page) + continue; + pme = pmi->pme; + for (i = 0; i < pmi->entry_num; i++) { + for (j = 0; j < pme->nr_pages; j++) { + order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; + phy_addr = pme->phy_addr_array[j]; + if (phy_addr) { + page = phys_to_page(phy_addr); + if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { + __free_pages(page, order); + pme->phy_addr_array[j] = 0; + } + } + } + pme = (struct page_map_entry *)next_pme(pme); + } + } +} + +/* Clear all pin memory record. */ +void clear_pin_memory_record(void) +{ + unsigned long flags; + + spin_lock_irqsave(&page_map_entry_lock, flags); + free_all_reserved_pages(); + if (pin_pid_num_addr) { + *pin_pid_num_addr = 0; + pin_pid_num = 0; + page_map_entry_start = (struct page_map_entry *)__page_map_entry_start; + } + spin_unlock(&page_map_entry_lock); +} +EXPORT_SYMBOL_GPL(clear_pin_memory_record); + +#endif /* CONFIG_PIN_MEMORY */ -- 2.9.5
For this series,
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
On 2021/3/2 15:24, hejingxian wrote:
From: Jingxian He hejingxian@huawei.com
Date: Mon, 1 Mar 2021 17:35:32 +0800
Subject: [PATCH openEuler-21.03 1/2] mm: add pin memory method for checkpoint add restore
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
We can use the checkpoint and restore in userspace(criu) method to
dump and restore tasks when updating the kernel.
Currently, criu needs dump all memory data of tasks to files.
When the memory size is very large(larger than 1G),
the cost time of the dumping data will be very long(more than 1 min).
By pin the memory data of tasks and collect the corresponding
physical pages mapping info in checkpoint process,
we can remap the physical pages to restore tasks after
upgrading the kernel. This pin memory method can
restore the task data within one second.
The pin memory area info is saved in the reserved memblock,
which can keep usable in the kernel update process.
The pin memory driver provides the following ioctl command for criu:
- SET_PIN_MEM_AREA:
Set pin memory area, which can be remap to the restore task.
- CLEAR_PIN_MEM_AREA:
Clear the pin memory area info,
which enable user reset the pin data.
- REMAP_PIN_MEM_AREA:
Remap the pages of the pin memory to the restore task.
Signed-off-by: Jingxian He hejingxian@huawei.com
Reviewed-by: Wenliang He hewenliang4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
arch/arm64/configs/openeuler_defconfig | 2 +
arch/arm64/kernel/setup.c | 9 +
arch/arm64/mm/init.c | 60 +++
drivers/char/Kconfig | 6 +
drivers/char/Makefile | 1 +
drivers/char/pin_memory.c | 208 ++++++++
include/linux/crash_core.h | 5 +
include/linux/pin_mem.h | 78 +++
kernel/crash_core.c | 11 +
mm/Kconfig | 8 +
mm/Makefile | 1 +
mm/huge_memory.c | 61 +++
mm/memory.c | 59 ++
mm/pin_mem.c | 950 +++++++++++++++++++++++++++++++++
14 files changed, 1459 insertions(+)
create mode 100644 drivers/char/pin_memory.c
create mode 100644 include/linux/pin_mem.h
create mode 100644 mm/pin_mem.c
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index c5271e7..76fda68 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1036,6 +1036,7 @@ CONFIG_FRAME_VECTOR=y
# CONFIG_GUP_BENCHMARK is not set
# CONFIG_READ_ONLY_THP_FOR_FS is not set
CONFIG_ARCH_HAS_PTE_SPECIAL=y
+CONFIG_PIN_MEMORY=y
# end of Memory Management options
CONFIG_NET=y
@@ -3282,6 +3283,7 @@ CONFIG_TCG_TIS_ST33ZP24_SPI=y
# CONFIG_RANDOM_TRUST_CPU is not set
# CONFIG_RANDOM_TRUST_BOOTLOADER is not set
+CONFIG_PIN_MEMORY_DEV=m
#
# I2C support
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index c1f1fb9..5e282d3 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -50,6 +50,9 @@
#include <asm/efi.h>
#include <asm/xen/hypervisor.h>
#include <asm/mmu_context.h>
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_mem.h>
+#endif
static int num_standard_resources;
static struct resource *standard_resources;
@@ -260,6 +263,12 @@ static void __init request_standard_resources(void)
quick_kexec_res.end <= res->end)
request_resource(res, &quick_kexec_res);
#endif
+#ifdef CONFIG_PIN_MEMORY
if (pin_memory_resource.end &&
pin_memory_resource.start >= res->start &&
pin_memory_resource.end <= res->end)
request_resource(res, &pin_memory_resource);
+#endif
}
}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index f3e5a66..8ab5aac 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -42,6 +42,9 @@
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/alternative.h>
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_mem.h>
+#endif
#define ARM64_ZONE_DMA_BITS 30
@@ -78,6 +81,55 @@ static void __init reserve_crashkernel(void)
*/
#define MAX_USABLE_RANGES 2
+#ifdef CONFIG_PIN_MEMORY
/*
 * Resource describing the physical range reserved for pin memory.
 * start and end stay 0 until reserve_pin_memory_res() has reserved a range.
 */
+struct resource pin_memory_resource = {
.name = "Pin memory",
.start = 0,
.end = 0,
.flags = IORESOURCE_MEM,
.desc = IORES_DESC_RESERVED
+};
+static void __init reserve_pin_memory_res(void)
+{
unsigned long long mem_start, mem_len;
int ret;
ret = parse_pin_memory(boot_command_line,
memblock_phys_mem_size(),
&mem_len, &mem_start);
if (ret || !mem_len)
return;
mem_len = PAGE_ALIGN(mem_len);
if (!memblock_is_region_memory(mem_start, mem_len)) {
pr_warn("cannot reserve for pin memory: region is not memory!\n");
return;
}
if (memblock_is_region_reserved(mem_start, mem_len)) {
pr_warn("cannot reserve for pin memory: region overlaps reserved
memory!\n");
return;
}
if (!IS_ALIGNED(mem_start, SZ_2M)) {
pr_warn("cannot reserve for pin memory: base address is not 2MB
aligned\n");
return;
}
memblock_reserve(mem_start, mem_len);
pr_debug("pin memory resource reserved: 0x%016llx - 0x%016llx
(%lld MB)\n",
mem_start, mem_start + mem_len, mem_len >> 20);
pin_memory_resource.start = mem_start;
pin_memory_resource.end = mem_start + mem_len - 1;
+}
+#else
/* Stub for builds without CONFIG_PIN_MEMORY: there is nothing to reserve. */
+static void __init reserve_pin_memory_res(void)
+{
+}
+#endif /* CONFIG_PIN_MEMORY */
#ifdef CONFIG_CRASH_DUMP
static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
const char *uname, int depth, void *data)
@@ -455,6 +507,8 @@ void __init arm64_memblock_init(void)
reserve_park_mem();
#endif
reserve_pin_memory_res();
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
@@ -583,6 +637,12 @@ void __init mem_init(void)
/* this will put all unused low memory onto the freelists */
memblock_free_all();
+#ifdef CONFIG_PIN_MEMORY
/* pre alloc the pages for pin memory */
init_reserve_page_map((unsigned long)pin_memory_resource.start,
(unsigned long)(pin_memory_resource.end - pin_memory_resource.start + 1));
+#endif
mem_init_print_info(NULL);
/*
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index d229a2d..fbb94b8 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -496,3 +496,9 @@ config RANDOM_TRUST_BOOTLOADER
booloader is trustworthy so it will be added to the kernel's
entropy
pool. Otherwise, say N here so it will be regarded as device
input that
only mixes the entropy pool.
# NOTE(review): the defconfig sets this option to "m" and the driver uses
# module_init()/module_exit(), which suggests a tristate was intended. A
# bool cannot be "m" (Kconfig promotes it to "y"), so the default is spelled
# "y" here. Switching to tristate additionally requires every pin memory
# symbol the driver calls to be exported to modules.
config PIN_MEMORY_DEV
	bool "/dev/pinmem character device"
	depends on PIN_MEMORY
	default y
	help
	  Character device /dev/pinmem exposing the pin memory ioctl
	  interface (set, clear and remap pin memory areas, finish the
	  pin memory dump) used for checkpoint and restore.
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index ffce287..71d76fd 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -47,3 +47,4 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o
obj-$(CONFIG_XILLYBUS) += xillybus/
obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o
obj-$(CONFIG_ADI) += adi.o
+obj-$(CONFIG_PIN_MEMORY_DEV) += pin_memory.o
diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c
new file mode 100644
index 0000000..f46e056
--- /dev/null
+++ b/drivers/char/pin_memory.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
+ * Pin memory driver for checkpoint and restore.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm_types.h>
+#include <linux/processor.h>
+#include <uapi/asm-generic/ioctl.h>
+#include <uapi/asm-generic/mman-common.h>
+#include <uapi/asm/setup.h>
+#include <linux/pin_mem.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#define MAX_PIN_MEM_AREA_NUM 16
/* One user virtual address range; passed to pin_mem_area() as start/end. */
+struct _pin_mem_area {
unsigned long virt_start;
unsigned long virt_end;
+};
/*
 * Argument of the SET_PIN_MEM_AREA ioctl: the pid to pin and the ranges to
 * pin for it. Only the first area_num entries of mem_area[] are used, and
 * set_pin_mem_area() rejects area_num values above MAX_PIN_MEM_AREA_NUM.
 */
+struct pin_mem_area_set {
unsigned int pid;
unsigned int area_num;
struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM];
+};
+#define PIN_MEM_MAGIC 0x59
+#define _SET_PIN_MEM_AREA 1
+#define _CLEAR_PIN_MEM_AREA 2
+#define _REMAP_PIN_MEM_AREA 3
+#define _FINISH_PIN_MEM_DUMP 4
+#define _PIN_MEM_IOC_MAX_NR 4
+#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set)
+#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int)
+#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int)
+#define FINISH_PIN_MEM_DUMP _IOW(PIN_MEM_MAGIC, _FINISH_PIN_MEM_DUMP, int)
+static int set_pin_mem(struct pin_mem_area_set *pmas)
+{
int i;
int ret = 0;
struct _pin_mem_area *pma;
struct mm_struct *mm;
struct task_struct *task;
struct pid *pid_s;
pid_s = find_get_pid(pmas->pid);
if (!pid_s) {
pr_warn("Get pid struct fail:%d.\n", pmas->pid);
return -EFAULT;
}
rcu_read_lock();
task = pid_task(pid_s, PIDTYPE_PID);
if (!task) {
pr_warn("Get task struct fail:%d.\n", pmas->pid);
goto fail;
}
mm = get_task_mm(task);
for (i = 0; i < pmas->area_num; i++) {
pma = &(pmas->mem_area[i]);
ret = pin_mem_area(task, mm, pma->virt_start,
pma->virt_end);
if (ret) {
mmput(mm);
goto fail;
}
}
mmput(mm);
rcu_read_unlock();
put_pid(pid_s);
return ret;
+fail:
rcu_read_unlock();
put_pid(pid_s);
return -EFAULT;
+}
+static int set_pin_mem_area(unsigned long arg)
+{
struct pin_mem_area_set pmas;
void __user *buf = (void __user *)arg;
if (!access_ok(buf, sizeof(pmas)))
return -EFAULT;
if (copy_from_user(&pmas, buf, sizeof(pmas)))
return -EINVAL;
if (pmas.area_num > MAX_PIN_MEM_AREA_NUM) {
pr_warn("Input area_num is too large.\n");
return -EINVAL;
}
return set_pin_mem(&pmas);
+}
+static int pin_mem_remap(unsigned long arg)
+{
int pid;
struct task_struct *task;
struct mm_struct *mm;
vm_fault_t ret;
void __user *buf = (void __user *)arg;
struct pid *pid_s;
if (!access_ok(buf, sizeof(int)))
return -EINVAL;
if (copy_from_user(&pid, buf, sizeof(int)))
return -EINVAL;
pid_s = find_get_pid(pid);
if (!pid_s) {
pr_warn("Get pid struct fail:%d.\n", pid);
return -EINVAL;
}
rcu_read_lock();
task = pid_task(pid_s, PIDTYPE_PID);
if (!task) {
pr_warn("Get task struct fail:%d.\n", pid);
goto fault;
}
mm = get_task_mm(task);
ret = do_mem_remap(pid, mm);
if (ret) {
pr_warn("Handle pin memory remap fail.\n");
mmput(mm);
goto fault;
}
mmput(mm);
rcu_read_unlock();
put_pid(pid_s);
return 0;
+fault:
rcu_read_unlock();
put_pid(pid_s);
return -EFAULT;
+}
+static long pin_memory_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
long ret = 0;
if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC)
return -EINVAL;
if (_IOC_NR(cmd) > _PIN_MEM_IOC_MAX_NR)
return -EINVAL;
switch (cmd) {
case SET_PIN_MEM_AREA:
ret = set_pin_mem_area(arg);
break;
case CLEAR_PIN_MEM_AREA:
clear_pin_memory_record();
break;
case REMAP_PIN_MEM_AREA:
ret = pin_mem_remap(arg);
break;
case FINISH_PIN_MEM_DUMP:
ret = finish_pin_mem_dump();
break;
default:
return -EINVAL;
}
return ret;
+}
/*
 * File operations of the pinmem device: only ioctl is provided, and the
 * same handler is used for the native and the compat entry point.
 */
+static const struct file_operations pin_memory_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = pin_memory_ioctl,
.compat_ioctl = pin_memory_ioctl,
+};
/*
 * Misc device named "pinmem", registered by pin_memory_init(); the minor
 * number is requested dynamically.
 */
+static struct miscdevice pin_memory_miscdev = {
.minor = MISC_DYNAMIC_MINOR,
.name = "pinmem",
.fops = &pin_memory_fops,
+};
+static int pin_memory_init(void)
+{
int err = misc_register(&pin_memory_miscdev);
if (!err)
pr_info("pin_memory init\n");
else
pr_warn("pin_memory init failed!\n");
return err;
+}
+static void pin_memory_exit(void)
+{
misc_deregister(&pin_memory_miscdev);
pr_info("pin_memory ko exists!\n");
+}
+module_init(pin_memory_init);
+module_exit(pin_memory_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Euler");
+MODULE_DESCRIPTION("pin memory");
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index fc0ef33..30f0df3 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -87,4 +87,9 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
unsigned long long *crash_size, unsigned long long
*crash_base);
+#ifdef CONFIG_PIN_MEMORY
+int __init parse_pin_memory(char *cmdline, unsigned long long system_ram,
unsigned long long *pin_size, unsigned long long *pin_base);
+#endif
#endif /* LINUX_CRASH_CORE_H */
diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h
new file mode 100644
index 0000000..bc8b03e
--- /dev/null
+++ b/include/linux/pin_mem.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for checkpoint and restore task.
+ */
+#ifndef _LINUX_PIN_MEMORY_H
+#define _LINUX_PIN_MEMORY_H
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/errno.h>
+#include <linux/mm_types.h>
+#include <linux/err.h>
+#ifdef CONFIG_ARM64
+#include <linux/ioport.h>
+#endif
+#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
+#define COLLECT_PAGES_FINISH 0
+#define COLLECT_PAGES_NEED_CONTINUE 1
+#define COLLECT_PAGES_FAIL -1
+#define COMPOUND_PAD_MASK 0xffffffff
+#define COMPOUND_PAD_START 0x88
+#define COMPOUND_PAD_DELTA 0x40
+#define LIST_POISON4 0xdead000000000400
+#define PAGE_FLAGS_CHECK_RESERVED (1UL << PG_reserved)
+#define SHA256_DIGEST_SIZE 32
/*
 * Address just past the trailing phy_addr_array[] of @pme, which is where
 * the following page_map_entry starts. The argument is parenthesised so any
 * pointer expression can be passed; note that it is still evaluated twice.
 */
#define next_pme(pme) ((unsigned long *)((pme) + 1) + (pme)->nr_pages)
+#define PIN_MEM_DUMP_MAGIC 0xfeab000000001acd
/*
 * Record of one pinned virtual range. Entries are variable-sized and stored
 * back to back; next_pme() steps from one entry to the following one.
 */
+struct page_map_entry {
/* user virtual address of the first recorded page */
unsigned long virt_addr;
/* number of used slots in phy_addr_array[] */
unsigned int nr_pages;
/* non-zero when every slot stands for a PMD-sized huge page */
unsigned int is_huge_page;
/* address of a struct redirect_info, or 0 when there is none */
unsigned long redirect_start;
/* one physical address per page; 0 marks a slot with nothing to remap */
unsigned long phy_addr_array[0];
+};
/* Per-pid bookkeeping that locates the page_map_entry records of one task. */
+struct page_map_info {
int pid;
/* NOTE(review): not referenced in the code visible here - confirm its use */
int pid_reserved;
/* number of page_map_entry records belonging to this pid */
unsigned int entry_num;
/* set by do_mem_remap(); free_all_reserved_pages() then skips this pid */
int disable_free_page;
/* first page_map_entry record of this pid */
struct page_map_entry *pme;
+};
/*
 * Header at the start of the reserved pin memory area. The SHA-256 digest
 * is computed over this header from magic onwards plus the first
 * pin_pid_num entries of pmi_array[] (see calculate_pin_mem_digest()).
 */
+struct pin_mem_dump_info {
char sha_digest[SHA256_DIGEST_SIZE];
/* PIN_MEM_DUMP_MAGIC once finish_pin_mem_dump() has run */
unsigned long magic;
/* number of used entries in pmi_array[] */
unsigned int pin_pid_num;
struct page_map_info pmi_array[0];
+};
/*
 * Page indices of a page_map_entry whose content is copied into a newly
 * allocated page at remap time; the copy is mapped instead of the original.
 */
+struct redirect_info {
/* number of used entries in redirect_index[] */
unsigned int redirect_pages;
/* indices into phy_addr_array[], matched one after another during remap */
unsigned int redirect_index[0];
+};
+extern struct page_map_info *get_page_map_info(int pid);
+extern struct page_map_info *create_page_map_info(int pid);
+extern vm_fault_t do_mem_remap(int pid, struct mm_struct *mm);
+extern vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, struct page *page);
+extern void clear_pin_memory_record(void);
+extern int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
unsigned long start_addr, unsigned long end_addr);
+extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, struct page *page);
+extern int finish_pin_mem_dump(void);
+/* reserve space for pin memory*/
+#ifdef CONFIG_ARM64
+extern struct resource pin_memory_resource;
+#endif
+extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size);
+#endif /* CONFIG_PIN_MEMORY */
+#endif /* _LINUX_PIN_MEMORY_H */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index bfed474..2407de3 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -450,6 +450,17 @@ void __init reserve_crashkernel(void)
}
#endif /* CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL */
+#ifdef CONFIG_PIN_MEMORY
+int __init parse_pin_memory(char *cmdline,
unsigned long long system_ram,
unsigned long long *pin_size,
unsigned long long *pin_base)
+{
return __parse_crashkernel(cmdline, system_ram, pin_size,
pin_base,
- "pinmemory=", NULL);
+}
+#endif
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
diff --git a/mm/Kconfig b/mm/Kconfig
index 390165f..930dc13 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -859,4 +859,12 @@ config ARCH_HAS_HUGEPD
config MAPPING_DIRTY_HELPERS
bool
+config PIN_MEMORY
bool "Support for pin memory"
depends on CHECKPOINT_RESTORE
help
Say y here to enable the pin memory feature for checkpoint
and restore. We can pin the memory data of tasks and collect
the corresponding physical pages mapping info in checkpoint,
and remap the physical pages to restore tasks in restore.
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index d73aed0..4963827 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -120,3 +120,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0bc4a2c..8a11d30 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2996,3 +2996,64 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
#endif
+#ifdef CONFIG_PIN_MEMORY
/*
 * Install @page as an anonymous huge page at @address of @vma through the
 * PMD slot @pmd: charge it to the memcg of the mm, deposit a preallocated
 * page table and set the PMD entry.
 *
 * Returns 0 when the page was mapped, VM_FAULT_OOM or VM_FAULT_FALLBACK on
 * the failure paths below.
 *
 * NOTE(review): the two early VM_FAULT_OOM returns do not drop @page, and
 * the "PMD already populated" path drops @page but still returns 0 because
 * ret has not been changed - confirm both are intended.
 */
+vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, struct page *page)
+{
gfp_t gfp;
pgtable_t pgtable;
spinlock_t *ptl;
pmd_t entry;
vm_fault_t ret = 0;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
gfp = alloc_hugepage_direct_gfpmask(vma);
prep_transhuge_page(page);
if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
cgroup_throttle_swaprate(page, gfp);
/* page table to deposit alongside the huge PMD */
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
ret = VM_FAULT_OOM;
goto release;
}
__SetPageUptodate(page);
ptl = pmd_lock(vma->vm_mm, pmd);
/* only an empty PMD slot is filled */
if (unlikely(!pmd_none(*pmd))) {
goto unlock_release;
} else {
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock_release;
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, address, true);
lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable);
set_pmd_at(vma->vm_mm, address, pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(ptl);
count_vm_event(THP_FAULT_ALLOC);
count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
+unlock_release:
- spin_unlock(ptl);
+release:
/* failure: give back the unused page table and drop the page */
if (pgtable)
pte_free(vma->vm_mm, pgtable);
put_page(page);
return ret;
+}
+#endif
diff --git a/mm/memory.c b/mm/memory.c
index 50632c4..7b7f1a7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5248,3 +5248,62 @@ void ptlock_free(struct page *page)
kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
+#ifdef CONFIG_PIN_MEMORY
/*
 * Install @page as an anonymous base page at @address of @vma, below the
 * PMD slot @pmd: charge it to the memcg of the mm and set the PTE.
 *
 * Returns 0 when the page was mapped, VM_FAULT_FALLBACK when the PTE is
 * already populated, VM_FAULT_OOM on allocation or charge failure, or the
 * value of check_stable_address_space().
 *
 * NOTE(review): the pmd_trans_unstable() path returns 0 without mapping or
 * dropping @page, and the pte_alloc()/anon_vma_prepare() failures return
 * VM_FAULT_OOM without dropping @page - confirm the caller accounts for it.
 */
+vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, struct page *page)
+{
pte_t entry;
spinlock_t *ptl;
pte_t *pte;
vm_fault_t ret = 0;
if (pte_alloc(vma->vm_mm, pmd))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
if (unlikely(pmd_trans_unstable(pmd)))
return 0;
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
cgroup_throttle_swaprate(page, GFP_KERNEL);
__SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
pte = pte_offset_map_lock(vma->vm_mm, pmd, address,
&ptl);
/* only an empty PTE slot is filled */
if (!pte_none(*pte)) {
ret = VM_FAULT_FALLBACK;
goto release;
}
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto release;
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address, false);
lru_cache_add_inactive_or_unevictable(page, vma);
set_pte_at(vma->vm_mm, address, pte, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
+unlock:
pte_unmap_unlock(pte, ptl);
return ret;
+release:
/* failure under the PTE lock: drop the page, then unlock */
put_page(page);
goto unlock;
+oom_free_page:
put_page(page);
+oom:
return VM_FAULT_OOM;
+}
+#endif
diff --git a/mm/pin_mem.c b/mm/pin_mem.c
new file mode 100644
index 0000000..0a143b6
--- /dev/null
+++ b/mm/pin_mem.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
- Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
- Provide the pin memory method for checkpoint and restore task.
*/
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sched/cputime.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/pin_mem.h>
+#include <linux/idr.h>
+#include <linux/page-isolation.h>
+#include <linux/sched/mm.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <crypto/sha.h>
/* Largest value accepted for the "max_pin_pid_num" boot parameter. */
+#define MAX_PIN_PID_NUM 128
/* Taken around updates of the page map info/entry records below. */
+static DEFINE_SPINLOCK(page_map_entry_lock);
/* Start of the dump area; set once by init_page_map_info(). */
+struct pin_mem_dump_info *pin_mem_dump_start;
/* Number of page_map_info slots currently in use. */
+unsigned int pin_pid_num;
/* Points at the pin_pid_num field stored inside the dump area. */
+static unsigned int *pin_pid_num_addr;
/* Value page_map_entry_start is reset to by clear_pin_memory_record(). */
+static unsigned long __page_map_entry_start;
/* End of the page map entry region; also where the redirect space starts. */
+static unsigned long page_map_entry_end;
/* Array of per-pid page_map_info records (pmi_array of the dump area). */
+static struct page_map_info *user_space_reserve_start;
/* Next free page_map_entry; advanced by pin_mem_area(). */
+static struct page_map_entry *page_map_entry_start;
/* Boot parameter "max_pin_pid_num"; 0 leaves the feature uninitialised. */
+unsigned int max_pin_pid_num __read_mostly;
/* Boot parameter "redirect_space_size", in bytes. */
+unsigned long redirect_space_size;
+unsigned long redirect_space_start;
/* Used when "redirect_space_size" is missing or parses to zero. */
+#define DEFAULT_REDIRECT_SPACE_SIZE 0x100000
/*
 * Parse the "max_pin_pid_num" early parameter into max_pin_pid_num.
 *
 * A NULL string is treated as success without parsing. A parse error is
 * reported and returned. A value above MAX_PIN_PID_NUM is rejected by
 * resetting max_pin_pid_num to 0, but the function still returns 0.
 */
+static int __init setup_max_pin_pid_num(char *str)
+{
	int err = 0;

	if (str)
		err = kstrtouint(str, 10, &max_pin_pid_num);

	if (err) {
		pr_warn("Unable to parse max pin pid num.\n");
		return err;
	}

	if (max_pin_pid_num > MAX_PIN_PID_NUM) {
		max_pin_pid_num = 0;
		pr_warn("Input max_pin_pid_num is too large.\n");
	}
	return 0;
+}
+early_param("max_pin_pid_num", setup_max_pin_pid_num);
/*
 * Parse the "redirect_space_size" early parameter with memparse().
 *
 * When the parameter is missing or evaluates to zero, a warning is printed
 * and DEFAULT_REDIRECT_SPACE_SIZE is used instead. Always returns 0.
 */
+static int __init setup_redirect_space_size(char *str)
+{
	if (str)
		redirect_space_size = memparse(str, NULL);

	if (!redirect_space_size) {
		/* The message must stay on one line: a literal cannot span lines. */
		pr_warn("Unable to parse redirect space size, use the default value.\n");
		redirect_space_size = DEFAULT_REDIRECT_SPACE_SIZE;
	}
	return 0;
+}
+early_param("redirect_space_size", setup_redirect_space_size);
/*
 * create_page_map_info - claim the next free page_map_info slot for @pid.
 *
 * Returns the initialised slot, or NULL when the reserved area has not been
 * set up or all max_pin_pid_num slots are already in use. On success both
 * the in-memory counter and the counter stored in the dump area are bumped.
 */
+struct page_map_info *create_page_map_info(int pid)
+{
	struct page_map_info *new;

	if (!user_space_reserve_start)
		return NULL;
	if (pin_pid_num >= max_pin_pid_num) {
		/* Kept on one line: a string literal cannot span source lines. */
		pr_warn("Pin pid num too large than max_pin_pid_num, fail create: %d!", pid);
		return NULL;
	}

	new = (struct page_map_info *)(user_space_reserve_start + pin_pid_num);
	new->pid = pid;
	new->pme = NULL;
	new->entry_num = 0;
	new->pid_reserved = false;
	new->disable_free_page = false;

	/* Keep the persistent counter and its cached copy in step. */
	(*pin_pid_num_addr)++;
	pin_pid_num++;
	return new;
+}
+EXPORT_SYMBOL_GPL(create_page_map_info);
/*
 * get_page_map_info - look up the page_map_info record for @pid.
 *
 * Scans the first pin_pid_num records linearly. Returns the matching record,
 * or NULL when the reserved area is not set up or no record matches.
 */
+struct page_map_info *get_page_map_info(int pid)
+{
	unsigned int slot = 0;

	if (!user_space_reserve_start)
		return NULL;

	while (slot < pin_pid_num) {
		struct page_map_info *cand = &user_space_reserve_start[slot];

		if (cand->pid == pid)
			return cand;
		slot++;
	}
	return NULL;
+}
+EXPORT_SYMBOL_GPL(get_page_map_info);
/*
 * Walk backwards from @page until a page marked PageBuddy is found and
 * return it. Gives up with NULL as soon as a page on the LRU is met.
 */
+static struct page *find_head_page(struct page *page)
+{
	struct page *cur;

	for (cur = page; !PageBuddy(cur); cur--) {
		if (PageLRU(cur))
			return NULL;
	}
	return cur;
+}
/*
 * Hand the first @size pages of a free block of order @order back to the
 * free lists, starting at @page and moving upwards.
 *
 * The block size is halved step by step (stepping @area and @order down with
 * it); whenever the current size fits into what is left, a chunk of that
 * size is queued on the MIGRATE_MOVABLE list of the matching free_area.
 */
+static void spilt_page_area_left(struct zone *zone, struct free_area *area, struct page *page,
	unsigned long size, int order)
+{
	unsigned long chunk = 1 << order;
	unsigned long done = 0;
	struct page *head;

	while (size != 0 && chunk > size) {
		chunk >>= 1;
		order--;
		area--;
		if (chunk > size)
			continue;

		head = &page[done];
		list_add(&head->lru, &area->free_list[MIGRATE_MOVABLE]);
		atomic_set(&(head->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
		set_page_private(head, order);
		set_pageblock_migratetype(head, MIGRATE_MOVABLE);
		area->nr_free++;
		done += chunk;
		size -= chunk;
	}
+}
/*
 * Hand the @size pages starting at @page (the tail of a free block of order
 * @order) back to the free lists, working down from the end of the range.
 *
 * The block size is halved step by step (stepping @area and @order down with
 * it); whenever the current size fits into what is left, the chunk ending at
 * the current right edge is queued on the MIGRATE_MOVABLE list.
 */
+static void spilt_page_area_right(struct zone *zone, struct free_area *area, struct page *page,
	unsigned long size, int order)
+{
	unsigned long chunk = 1 << order;
	struct page *edge = page + size;
	struct page *head;

	while (size != 0 && chunk > size) {
		chunk >>= 1;
		order--;
		area--;
		if (chunk > size)
			continue;

		head = edge - chunk;
		list_add(&head->lru, &area->free_list[MIGRATE_MOVABLE]);
		atomic_set(&(head->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
		set_page_private(head, order);
		set_pageblock_migratetype(head, MIGRATE_MOVABLE);
		area->nr_free++;
		size -= chunk;
		edge = head;
	}
+}
/*
 * reserve_page_from_buddy - carve @nr_pages starting at @page out of the
 * free block that contains them.
 *
 * The enclosing free block is located with find_head_page(), unlinked from
 * its free_area, and the parts before @page and after @page + @nr_pages are
 * given back through spilt_page_area_left()/spilt_page_area_right().
 * Only warns (and does nothing else) when no head page is found or when the
 * requested range runs past the end of the block.
 * NOTE(review): no zone lock is taken here — confirm callers run where that
 * is safe.
 */
+void reserve_page_from_buddy(unsigned long nr_pages, struct page *page)
+{
unsigned int current_order;
struct page *page_end;
struct free_area *area;
struct zone *zone;
struct page *head_page;
head_page = find_head_page(page);
if (!head_page) {
pr_warn("Find page head fail.");
return;
}
/* The order of the free block is read from the head page's private field. */
current_order = head_page->private;
page_end = head_page + (1 << current_order);
zone = page_zone(head_page);
area = &(zone->free_area[current_order]);
/* Take the whole block off its free list and clear its buddy marking. */
list_del(&head_page->lru);
atomic_set(&head_page->_mapcount, -1);
set_page_private(head_page, 0);
area->nr_free--;
/* Return the pages in front of the requested range. */
if (head_page != page)
spilt_page_area_left(zone, area, head_page,
(unsigned long)(page - head_page), current_order);
page = page + nr_pages;
/* Return the pages behind the requested range. */
if (page < page_end) {
spilt_page_area_right(zone, area, page,
(unsigned long)(page_end - page), current_order);
} else if (page > page_end) {
pr_warn("Find page end smaller than page.");
}
+}
/*
 * Take a reference on a single base page and remove exactly that page from
 * the free lists.
 */
+static inline void reserve_user_normal_pages(struct page *page)
+{
	const unsigned long one_page = 1;

	atomic_inc(&page->_refcount);
	reserve_page_from_buddy(one_page, page);
+}
/*
 * init_huge_pmd_pages - rebuild compound-page metadata for the HPAGE_PMD_NR
 * pages starting at @head_page.
 *
 * The head page gets PG_head and PG_active and a refcount of 1; every
 * following page has compound_head pointed back at the head (with bit 0
 * set). The first tail page additionally records the destructor and order,
 * and the deferred list of the second tail page is initialised.
 */
+static void init_huge_pmd_pages(struct page *head_page)
+{
int i = 0;
struct page *page = head_page;
__set_bit(PG_head, &page->flags);
__set_bit(PG_active, &page->flags);
atomic_set(&page->_refcount, 1);
/* First tail page: carries the compound destructor and order. */
page++;
i++;
page->compound_head = (unsigned long)head_page + 1;
/* NOTE(review): HUGETLB_PAGE_DTOR + 1 is presumably the THP destructor — confirm. */
page->compound_dtor = HUGETLB_PAGE_DTOR + 1;
page->compound_order = HPAGE_PMD_ORDER;
/* Second tail page: holds the deferred list head. */
page++;
i++;
page->compound_head = (unsigned long)head_page + 1;
i++;
INIT_LIST_HEAD(&(page->deferred_list));
/* Remaining tail pages only need the back pointer to the head. */
for (; i < HPAGE_PMD_NR; i++) {
page = head_page + i;
page->compound_head = (unsigned long)head_page + 1;
}
+}
/*
 * Take a reference on @page, remove the PMD-sized run of pages starting
 * there from the free lists, then rebuild its compound-page metadata.
 */
+static inline void reserve_user_huge_pmd_pages(struct page *page)
+{
	const unsigned long huge_nr = 1 << HPAGE_PMD_ORDER;

	atomic_inc(&page->_refcount);
	reserve_page_from_buddy(huge_nr, page);
	init_huge_pmd_pages(page);
+}
/* Set to 1 when reserving the recorded pages failed; checked before remap/free. */
+int reserve_user_map_pages_fail;
/*
 * Free the first @nr recorded pages of @pme and clear their slots.
 * Empty slots and pages carrying PAGE_FLAGS_CHECK_RESERVED are left alone.
 */
+static void free_pme_pages_prefix(struct page_map_entry *pme, unsigned long nr)
+{
	unsigned long j;
	unsigned long phy_addr;
	unsigned int order;
	struct page *page;

	for (j = 0; j < nr; j++) {
		order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
		phy_addr = pme->phy_addr_array[j];
		if (!phy_addr)
			continue;
		page = phys_to_page(phy_addr);
		if (page->flags & PAGE_FLAGS_CHECK_RESERVED)
			continue;
		__free_pages(page, order);
		pme->phy_addr_array[j] = 0;
	}
+}
/*
 * free_user_map_pages - undo a partially completed reservation pass.
 * @pid_index:   index of the page_map_info being processed when it failed
 * @entry_index: index of the entry within that page_map_info
 * @page_index:  index of the page within that entry
 *
 * Frees every recorded page of all page_map_infos before @pid_index, every
 * page of the entries before @entry_index in page_map_info @pid_index, and
 * the first @page_index pages of entry @entry_index.
 */
+void free_user_map_pages(unsigned int pid_index, unsigned int entry_index, unsigned int page_index)
+{
	unsigned int i, index;
	struct page_map_info *pmi;
	struct page_map_entry *pme;

	/* Fully processed page_map_infos. */
	for (index = 0; index < pid_index; index++) {
		pmi = &(user_space_reserve_start[index]);
		pme = pmi->pme;
		for (i = 0; i < pmi->entry_num; i++) {
			free_pme_pages_prefix(pme, pme->nr_pages);
			pme = (struct page_map_entry *)next_pme(pme);
		}
	}

	/* Fully processed entries of the page_map_info that failed. */
	pmi = &(user_space_reserve_start[pid_index]);
	pme = pmi->pme;
	for (i = 0; i < entry_index; i++) {
		free_pme_pages_prefix(pme, pme->nr_pages);
		pme = (struct page_map_entry *)next_pme(pme);
	}

	/* Pages already handled in the entry that failed. */
	free_pme_pages_prefix(pme, page_index);
+}
/*
 * check_redirect_end_valid - check that a redirect_info record fits.
 * @redirect_start:        where the record would be placed
 * @max_redirect_page_num: largest number of index slots it may need
 *
 * Returns true when the record header plus @max_redirect_page_num
 * unsigned int slots end at or before the end of the redirect space,
 * false otherwise. (Previously both paths returned false.)
 */
+bool check_redirect_end_valid(struct redirect_info *redirect_start,
	unsigned long max_redirect_page_num)
+{
	unsigned long redirect_end;

	redirect_end = ((unsigned long)(redirect_start + 1) +
		max_redirect_page_num * sizeof(unsigned int));
	if (redirect_end > redirect_space_start + redirect_space_size)
		return false;
	return true;
+}
/*
 * reserve_user_space_map_pages - take every recorded page out of the free
 * lists so the pages can later be remapped.
 *
 * Walks all page_map_infos and their entries. A page whose refcount is zero
 * is reserved (as a base page or a PMD-sized run). A page that is already in
 * use and carries PAGE_FLAGS_CHECK_RESERVED is recorded in the redirect
 * space instead, provided there is room. Any other in-use page aborts the
 * pass: reserve_user_map_pages_fail is set and everything reserved so far is
 * released through free_user_map_pages().
 *
 * The lock is taken with spin_lock_irqsave(), so both exits release it with
 * spin_unlock_irqrestore() to restore the saved interrupt state.
 */
+static void reserve_user_space_map_pages(void)
+{
	struct page_map_info *pmi;
	struct page_map_entry *pme;
	unsigned int i, j, index;
	struct page *page;
	unsigned long flags;
	unsigned long phy_addr;
	unsigned long redirect_pages = 0;
	struct redirect_info *redirect_start =
		(struct redirect_info *)redirect_space_start;

	if (!user_space_reserve_start || !redirect_start)
		return;
	spin_lock_irqsave(&page_map_entry_lock, flags);
	for (index = 0; index < pin_pid_num; index++) {
		pmi = &(user_space_reserve_start[index]);
		pme = pmi->pme;
		for (i = 0; i < pmi->entry_num; i++) {
			redirect_pages = 0;
			/* Without room for a worst-case record, redirecting is off. */
			if (!check_redirect_end_valid(redirect_start, pme->nr_pages))
				redirect_start = NULL;
			for (j = 0; j < pme->nr_pages; j++) {
				phy_addr = pme->phy_addr_array[j];
				if (!phy_addr)
					continue;
				page = phys_to_page(phy_addr);
				if (atomic_read(&page->_refcount)) {
					if ((page->flags & PAGE_FLAGS_CHECK_RESERVED)
						&& !pme->redirect_start)
						pme->redirect_start =
							(unsigned long)redirect_start;
					if (redirect_start &&
						(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
						redirect_start->redirect_index[redirect_pages] = j;
						redirect_pages++;
						continue;
					} else {
						reserve_user_map_pages_fail = 1;
						pr_warn("Page %pK refcount %d large than zero, no need reserve.\n",
							page, atomic_read(&page->_refcount));
						goto free_pages;
					}
				}
				if (!pme->is_huge_page)
					reserve_user_normal_pages(page);
				else
					reserve_user_huge_pmd_pages(page);
			}
			pme = (struct page_map_entry *)next_pme(pme);
			/* Close this entry's redirect record and step past it. */
			if (redirect_pages && redirect_start) {
				redirect_start->redirect_pages = redirect_pages;
				redirect_start = (struct redirect_info *)(
					(unsigned long)(redirect_start + 1) +
					redirect_start->redirect_pages * sizeof(unsigned int));
			}
		}
	}
	spin_unlock_irqrestore(&page_map_entry_lock, flags);
	return;
+free_pages:
	free_user_map_pages(index, i, j);
	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
/*
 * calculate_pin_mem_digest - SHA-256 over the dump header and its records.
 * @pmdi:   dump area to hash
 * @digest: output buffer; when NULL the result is written to
 *          pmdi->sha_digest
 *
 * Hashes sizeof(struct pin_mem_dump_info) - SHA256_DIGEST_SIZE bytes
 * starting at pmdi->magic, followed by each of the pmdi->pin_pid_num
 * page_map_info records. Always returns 0.
 */
+int calculate_pin_mem_digest(struct pin_mem_dump_info *pmdi, char *digest)
+{
	struct sha256_state sctx;
	unsigned int idx = 0;

	if (digest == NULL)
		digest = pmdi->sha_digest;

	sha256_init(&sctx);
	sha256_update(&sctx, (unsigned char *)(&(pmdi->magic)),
		sizeof(struct pin_mem_dump_info) - SHA256_DIGEST_SIZE);
	while (idx < pmdi->pin_pid_num) {
		sha256_update(&sctx, (unsigned char *)(&(pmdi->pmi_array[idx])),
			sizeof(struct page_map_info));
		idx++;
	}
	sha256_final(&sctx, digest);
	return 0;
+}
/*
 * Recompute the digest of @pmdi and compare it with the stored one.
 * Returns 0 on a match, -EFAULT on a mismatch, or the error from
 * calculate_pin_mem_digest().
 */
+static int check_sha_digest(struct pin_mem_dump_info *pmdi)
+{
	char digest[SHA256_DIGEST_SIZE] = {0};
	int err;

	err = calculate_pin_mem_digest(pmdi, digest);
	if (err) {
		pr_warn("calculate pin mem digest fail:%d\n", err);
		return err;
	}

	if (memcmp(pmdi->sha_digest, digest, SHA256_DIGEST_SIZE) != 0) {
		pr_warn("pin mem dump info sha256 digest match error!\n");
		return -EFAULT;
	}
	return 0;
+}
+/*
- The whole page map entry collection process must be sequential.
- user_space_reserve_start points to the page map info of the
- first dumped task, and page_map_entry_start points to
- the first page map entry of the first dumped vma.
*/
/*
 * init_page_map_info - set up the pin-memory bookkeeping inside the
 * reserved region.
 * @pmdi:    start of the reserved region
 * @map_len: length of the region in bytes
 *
 * Does nothing if the area is already initialised, max_pin_pid_num is 0, or
 * the region is too small for the header, max_pin_pid_num page_map_info
 * records and the redirect space. A header with a bad magic, an
 * out-of-range pid count or a bad digest is wiped and treated as empty.
 *
 * Layout: header, page_map_info array, page map entries, and the redirect
 * space in the last redirect_space_size bytes. If records survived from a
 * previous run, their pages are reserved.
 */
+static void init_page_map_info(struct pin_mem_dump_info *pmdi, unsigned long map_len)
+{
	if (pin_mem_dump_start || !max_pin_pid_num) {
		pr_warn("pin page map already init or max_pin_pid_num not set.\n");
		return;
	}
	if (map_len < sizeof(struct pin_mem_dump_info) +
		max_pin_pid_num * sizeof(struct page_map_info) + redirect_space_size) {
		pr_warn("pin memory reserved memblock too small.\n");
		return;
	}
	/* Anything that does not validate is discarded. */
	if ((pmdi->magic != PIN_MEM_DUMP_MAGIC) ||
		(pmdi->pin_pid_num > max_pin_pid_num) ||
		check_sha_digest(pmdi))
		memset(pmdi, 0, sizeof(struct pin_mem_dump_info));
	pin_mem_dump_start = pmdi;
	pin_pid_num = pmdi->pin_pid_num;
	pr_info("pin_pid_num: %d\n", pin_pid_num);
	pin_pid_num_addr = &(pmdi->pin_pid_num);
	user_space_reserve_start =
		(struct page_map_info *)pmdi->pmi_array;
	page_map_entry_start =
		(struct page_map_entry *)(user_space_reserve_start +
		max_pin_pid_num);
	/* Remember the initial cursor so clear_pin_memory_record() can rewind. */
	__page_map_entry_start = (unsigned long)page_map_entry_start;
	page_map_entry_end = (unsigned long)pmdi + map_len -
		redirect_space_size;
	redirect_space_start = page_map_entry_end;
	if (pin_pid_num > 0)
		reserve_user_space_map_pages();
+}
/*
 * finish_pin_mem_dump - seal the dump area after all records are collected.
 *
 * Writes the magic, clears the stored digest and recomputes it over the
 * header and records. Returns 0 on success, the digest error otherwise, or
 * -EFAULT when the dump area was never initialised.
 */
+int finish_pin_mem_dump(void)
+{
	int ret;

	/* init_page_map_info() may have bailed out and left this NULL. */
	if (!pin_mem_dump_start)
		return -EFAULT;

	pin_mem_dump_start->magic = PIN_MEM_DUMP_MAGIC;
	memset(pin_mem_dump_start->sha_digest, 0, SHA256_DIGEST_SIZE);
	ret = calculate_pin_mem_digest(pin_mem_dump_start, NULL);
	if (ret)
		pr_warn("calculate pin mem digest fail:%d\n", ret);
	return ret;
+}
/*
 * collect_pmd_huge_pages - record the huge pages backing a range.
 * @task:       task whose mm is walked
 * @start_addr: first address of the range
 * @end_addr:   end of the range (exclusive)
 * @pme:        entry whose phy_addr_array and nr_pages are filled in
 *
 * Pins one page per HPAGE_PMD_SIZE step and stores its physical address as
 * long as the page is a compound head.
 *
 * Returns COLLECT_PAGES_FINISH when the whole range was recorded,
 * COLLECT_PAGES_NEED_CONTINUE when a non-head page was met (its reference is
 * dropped and nr_pages holds the count so far), or COLLECT_PAGES_FAIL when
 * get_user_pages_remote() pinned nothing or returned an error.
 */
+int collect_pmd_huge_pages(struct task_struct *task,
	unsigned long start_addr, unsigned long end_addr,
	struct page_map_entry *pme)
+{
	long res;
	int index = 0;
	unsigned long start = start_addr;
	struct page *temp_page;

	while (start < end_addr) {
		temp_page = NULL;
		res = get_user_pages_remote(task->mm, start, 1,
			FOLL_TOUCH | FOLL_GET, &temp_page, NULL, NULL);
		/* Negative errno values must be caught too: temp_page stays NULL. */
		if (res <= 0) {
			pr_warn("Get huge page for addr(%lx) fail.", start);
			return COLLECT_PAGES_FAIL;
		}
		if (PageHead(temp_page)) {
			start += HPAGE_PMD_SIZE;
			pme->phy_addr_array[index] = page_to_phys(temp_page);
			index++;
		} else {
			pme->nr_pages = index;
			atomic_dec(&((temp_page)->_refcount));
			return COLLECT_PAGES_NEED_CONTINUE;
		}
	}
	pme->nr_pages = index;
	return COLLECT_PAGES_FINISH;
+}
/*
 * collect_normal_pages - record the base pages backing a range.
 * @task:       task whose mm is walked
 * @start_addr: first address of the range
 * @end_addr:   end of the range (exclusive)
 * @pme:        entry whose phy_addr_array and nr_pages are filled in
 *
 * Works in windows that end on HPAGE_PMD_SIZE boundaries. Each window is
 * probed with a single-page pin first:
 *  - a compound head stops the walk with COLLECT_PAGES_NEED_CONTINUE;
 *  - a compound tail skips the window and moves pme->virt_addr forward;
 *  - otherwise the whole window is pinned and its physical addresses stored.
 *
 * phy_addr_array doubles as the struct page * scratch array for the pin
 * call, which relies on both element types having the same size.
 *
 * Returns COLLECT_PAGES_FINISH, COLLECT_PAGES_NEED_CONTINUE, or
 * COLLECT_PAGES_FAIL when a pin call fails or pins fewer pages than asked.
 */
+int collect_normal_pages(struct task_struct *task,
	unsigned long start_addr, unsigned long end_addr,
	struct page_map_entry *pme)
+{
	long res;
	unsigned long next;
	unsigned long i, nr_pages;
	struct page *tmp_page;
	unsigned long *phy_addr_array = pme->phy_addr_array;
	struct page **page_array = (struct page **)pme->phy_addr_array;

	next = (start_addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
	next = (next > end_addr) ? end_addr : next;
	pme->nr_pages = 0;
	while (start_addr < next) {
		nr_pages = (PAGE_ALIGN(next) - start_addr) / PAGE_SIZE;
		res = get_user_pages_remote(task->mm, start_addr, 1,
			FOLL_TOUCH | FOLL_GET, &tmp_page, NULL, NULL);
		/* Negative errno values must be caught too. */
		if (res <= 0) {
			pr_warn("Get user page of %lx fail.\n", start_addr);
			return COLLECT_PAGES_FAIL;
		}
		if (PageHead(tmp_page)) {
			atomic_dec(&(tmp_page->_refcount));
			return COLLECT_PAGES_NEED_CONTINUE;
		}
		atomic_dec(&(tmp_page->_refcount));
		if (PageTail(tmp_page)) {
			start_addr = next;
			pme->virt_addr = start_addr;
			next = (next + HPAGE_PMD_SIZE) > end_addr ?
				end_addr : (next + HPAGE_PMD_SIZE);
			continue;
		}
		res = get_user_pages_remote(task->mm, start_addr,
			nr_pages,
			FOLL_TOUCH | FOLL_GET, page_array, NULL, NULL);
		if (res < 0 || (unsigned long)res != nr_pages) {
			/* Short pin: the unfilled slots are not valid pages. */
			while (res > 0)
				put_page(page_array[--res]);
			pr_warn("Get user pages of %lx fail.\n", start_addr);
			return COLLECT_PAGES_FAIL;
		}
		/* Convert the page pointers in place into physical addresses. */
		for (i = 0; i < nr_pages; i++)
			phy_addr_array[i] = page_to_phys(page_array[i]);
		pme->nr_pages += nr_pages;
		page_array += nr_pages;
		phy_addr_array += nr_pages;
		start_addr = next;
		next = (next + HPAGE_PMD_SIZE) > end_addr ? end_addr
			: (next + HPAGE_PMD_SIZE);
	}
	return COLLECT_PAGES_FINISH;
+}
+/* Users make sure that the pin memory belongs to anonymous vma. */
/*
 * pin_mem_area - record the physical pages behind [start_addr, end_addr).
 * @task:       task that owns the range
 * @mm:         mm of @task
 * @start_addr: first address of the range
 * @end_addr:   end of the range (exclusive)
 *
 * Builds one page_map_entry at page_map_entry_start, fills it through
 * collect_normal_pages() or collect_pmd_huge_pages(), clears the slots that
 * refer to a zero page, and attaches the entry to the page_map_info of the
 * task's pid (creating one if needed). When a collector stops early with
 * COLLECT_PAGES_NEED_CONTINUE, the function calls itself for the remainder.
 *
 * Returns -EFAULT on bad arguments, exhausted entry space, missing vma,
 * collection failure or when no page_map_info can be obtained; otherwise the
 * value left in ret by the collector or the recursive call.
 * NOTE(review): the success path returns the collector's COLLECT_PAGES_*
 * code unchanged — confirm COLLECT_PAGES_FINISH is 0.
 * NOTE(review): mmap_lock is taken with down_write() and pages are pinned
 * while page_map_entry_lock is held with interrupts off — confirm this
 * cannot sleep.
 */
+int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
- unsigned long start_addr, unsigned long end_addr)
+{
int pid, ret;
int is_huge_page = false;
unsigned int page_size;
unsigned long nr_pages, flags;
struct page_map_entry *pme;
struct page_map_info *pmi;
struct vm_area_struct *vma;
unsigned long i;
struct page *tmp_page;
if (!page_map_entry_start
|| !task || !mm
|| start_addr >= end_addr)
return -EFAULT;
pid = task->pid;
spin_lock_irqsave(&page_map_entry_lock, flags);
nr_pages = ((end_addr - start_addr) / PAGE_SIZE);
/* Make sure the address array of the new entry fits before the redirect space. */
if ((unsigned long)page_map_entry_start + nr_pages *
sizeof(struct page *) >=
page_map_entry_end) {
pr_warn("Page map entry use up!\n");
ret = -EFAULT;
goto finish;
}
vma = find_extend_vma(mm, start_addr);
if (!vma) {
pr_warn("Find no match vma!\n");
ret = -EFAULT;
goto finish;
}
/* Start in huge-page mode only for a PMD-aligned start in a THP-enabled vma. */
if (start_addr == (start_addr & HPAGE_PMD_MASK) &&
transparent_hugepage_enabled(vma)) {
page_size = HPAGE_PMD_SIZE;
is_huge_page = true;
} else {
page_size = PAGE_SIZE;
}
pme = page_map_entry_start;
pme->virt_addr = start_addr;
pme->redirect_start = 0;
pme->is_huge_page = is_huge_page;
memset(pme->phy_addr_array, 0, nr_pages * sizeof(unsigned long));
down_write(&mm->mmap_lock);
if (!is_huge_page) {
ret = collect_normal_pages(task, start_addr,
end_addr, pme);
/* Nothing collected: either the range is done, or it starts with a huge page. */
if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
if (ret == COLLECT_PAGES_FINISH) {
ret = 0;
up_write(&mm->mmap_lock);
goto finish;
}
pme->is_huge_page = true;
page_size = HPAGE_PMD_SIZE;
ret = collect_pmd_huge_pages(task, pme->virt_addr, end_addr, pme);
}
} else {
ret = collect_pmd_huge_pages(task, start_addr,
end_addr, pme);
/* Nothing collected: either the range is done, or it starts with a base page. */
if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
if (ret == COLLECT_PAGES_FINISH) {
ret = 0;
up_write(&mm->mmap_lock);
goto finish;
}
pme->is_huge_page = false;
page_size = PAGE_SIZE;
ret = collect_normal_pages(task, pme->virt_addr, end_addr, pme);
}
}
up_write(&mm->mmap_lock);
if (ret == COLLECT_PAGES_FAIL) {
ret = -EFAULT;
goto finish;
}
/* check for zero pages */
for (i = 0; i < pme->nr_pages; i++) {
tmp_page = phys_to_page(pme->phy_addr_array[i]);
if (!pme->is_huge_page) {
if (page_to_pfn(tmp_page) == my_zero_pfn(pme->virt_addr + i *
PAGE_SIZE))
pme->phy_addr_array[i] = 0;
} else if (is_huge_zero_page(tmp_page))
pme->phy_addr_array[i] = 0;
}
/* Commit the entry: the next one starts right behind it. */
page_map_entry_start = (struct page_map_entry *)(next_pme(pme));
pmi = get_page_map_info(pid);
if (!pmi)
pmi = create_page_map_info(pid);
/* NOTE(review): page_map_entry_start has already been advanced on this failure path. */
if (!pmi) {
pr_warn("Create page map info fail for pid: %d!\n", pid);
ret = -EFAULT;
goto finish;
}
if (!pmi->pme)
pmi->pme = pme;
pmi->entry_num++;
spin_unlock_irqrestore(&page_map_entry_lock, flags);
/* Continue with the part of the range the collector did not cover. */
if (ret == COLLECT_PAGES_NEED_CONTINUE)
ret = pin_mem_area(task, mm, pme->virt_addr +
pme->nr_pages * page_size, end_addr);
return ret;
+finish:
spin_unlock_irqrestore(&page_map_entry_lock, flags);
return ret;
+}
+EXPORT_SYMBOL_GPL(pin_mem_area);
/*
 * remap_normal_pages - map the recorded base pages of @pme into @vma.
 * @mm:  address space to map into
 * @vma: vma that covers pme->virt_addr
 * @pme: entry describing the pages
 *
 * Every non-empty slot is cleared and its page mapped at
 * pme->virt_addr + j * PAGE_SIZE through do_anon_page_remap(). Slots listed
 * in the entry's redirect record are not mapped directly; a freshly
 * allocated page receives a copy of the data and is mapped instead.
 *
 * Returns 0 on success. On failure the remaining recorded pages are freed
 * and the failing code is returned.
 * NOTE(review): slot j is cleared before the page tables are allocated, so
 * the error path no longer sees that page — confirm it is not leaked.
 * NOTE(review): a failed redirect allocation skips the slot after it has
 * been cleared — confirm this is intended.
 */
+vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma,
struct page_map_entry *pme)
+{
int ret;
unsigned int j, i;
pgd_t *pgd;
p4d_t *p4d;
pmd_t *pmd;
pud_t *pud;
struct page *page, *new;
unsigned long address;
unsigned long phy_addr;
unsigned int redirect_pages = 0;
struct redirect_info *redirect_start;
redirect_start = (struct redirect_info *)pme->redirect_start;
for (j = 0; j < pme->nr_pages; j++) {
address = pme->virt_addr + j * PAGE_SIZE;
phy_addr = pme->phy_addr_array[j];
if (!phy_addr)
continue;
page = phys_to_page(phy_addr);
/* The zero page is never remapped; just forget the slot. */
if (page_to_pfn(page) == my_zero_pfn(address)) {
pme->phy_addr_array[j] = 0;
continue;
}
pme->phy_addr_array[j] = 0;
/* Redirected slot: map a copy instead of the original page. */
if (redirect_start && (redirect_pages <
redirect_start->redirect_pages) &&
(j == redirect_start->redirect_index[redirect_pages])) {
new = alloc_zeroed_user_highpage_movable(vma, address);
if (!new) {
pr_warn("Redirect alloc page fail\n");
continue;
}
copy_page(page_to_virt(new), phys_to_virt(phy_addr));
page = new;
redirect_pages++;
}
page->mapping = NULL;
/* Walk or allocate the page-table levels down to the pmd. */
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
if (!p4d) {
ret = VM_FAULT_OOM;
goto free;
}
pud = pud_alloc(mm, p4d, address);
if (!pud) {
ret = VM_FAULT_OOM;
goto free;
}
pmd = pmd_alloc(mm, pud, address);
if (!pmd) {
ret = VM_FAULT_OOM;
goto free;
}
ret = do_anon_page_remap(vma, address, pmd, page);
if (ret)
goto free;
}
return 0;
+free:
/* Release every page that is still recorded from slot j onwards. */
for (i = j; i < pme->nr_pages; i++) {
phy_addr = pme->phy_addr_array[i];
if (phy_addr) {
__free_page(phys_to_page(phy_addr));
pme->phy_addr_array[i] = 0;
}
}
return ret;
+}
/*
 * Choose the allocation flags for a huge page in @vma from the global
 * transparent_hugepage_flags defrag setting and from whether the vma has
 * VM_HUGEPAGE set. The defrag bits are tested in the order direct,
 * kswapd, kswapd-or-madvise, madvise-only; the first one set decides.
 */
+static inline gfp_t get_hugepage_gfpmask(struct vm_area_struct *vma)
+{
	const bool madvised = (vma->vm_flags & VM_HUGEPAGE) != 0;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags)) {
		if (madvised)
			return GFP_TRANSHUGE | 0;
		return GFP_TRANSHUGE | __GFP_NORETRY;
	}

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
		     &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
		     &transparent_hugepage_flags)) {
		if (madvised)
			return GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM;
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
	}

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
		     &transparent_hugepage_flags)) {
		if (madvised)
			return GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM;
		return GFP_TRANSHUGE_LIGHT | 0;
	}

	return GFP_TRANSHUGE_LIGHT;
+}
/*
 * remap_huge_pmd_pages - map the recorded huge pages of @pme into @vma.
 * @mm:  address space to map into
 * @vma: vma that covers pme->virt_addr
 * @pme: entry describing the pages
 *
 * Every non-empty slot is cleared and its page mapped at
 * pme->virt_addr + j * HPAGE_PMD_SIZE through do_anon_huge_page_remap().
 * Slots listed in the entry's redirect record get a newly allocated huge
 * page holding a copy of the data instead of the original page.
 *
 * Returns 0 on success. On failure the remaining recorded pages that do not
 * carry PAGE_FLAGS_CHECK_RESERVED are freed and the failing code is returned.
 * NOTE(review): slot j is cleared before the page tables are allocated, so
 * the error path no longer sees that page — confirm it is not leaked.
 */
+vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma,
struct page_map_entry *pme)
+{
int ret;
unsigned int j, i;
pgd_t *pgd;
p4d_t *p4d;
pmd_t *pmd;
pud_t *pud;
gfp_t gfp;
struct page *page, *new;
unsigned long address;
unsigned long phy_addr;
unsigned int redirect_pages = 0;
struct redirect_info *redirect_start;
redirect_start = (struct redirect_info *)pme->redirect_start;
for (j = 0; j < pme->nr_pages; j++) {
address = pme->virt_addr + j * HPAGE_PMD_SIZE;
phy_addr = pme->phy_addr_array[j];
if (!phy_addr)
continue;
page = phys_to_page(phy_addr);
/* The huge zero page is never remapped; just forget the slot. */
if (is_huge_zero_page(page)) {
pme->phy_addr_array[j] = 0;
continue;
}
pme->phy_addr_array[j] = 0;
/* Redirected slot: map a copy instead of the original huge page. */
if (redirect_start && (redirect_pages <
redirect_start->redirect_pages) &&
(j == redirect_start->redirect_index[redirect_pages])) {
gfp = get_hugepage_gfpmask(vma);
new = alloc_hugepage_vma(gfp, vma, address, HPAGE_PMD_ORDER);
if (!new) {
pr_warn("Redirect alloc huge page fail\n");
continue;
}
memcpy(page_to_virt(new), phys_to_virt(phy_addr), HPAGE_PMD_SIZE);
page = new;
redirect_pages++;
}
/* Walk or allocate the page-table levels down to the pmd. */
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
if (!p4d) {
ret = VM_FAULT_OOM;
goto free;
}
pud = pud_alloc(mm, p4d, address);
if (!pud) {
ret = VM_FAULT_OOM;
goto free;
}
pmd = pmd_alloc(mm, pud, address);
if (!pmd) {
ret = VM_FAULT_OOM;
goto free;
}
ret = do_anon_huge_page_remap(vma, address, pmd, page);
if (ret)
goto free;
}
return 0;
+free:
/* Release every page that is still recorded from slot j onwards. */
for (i = j; i < pme->nr_pages; i++) {
phy_addr = pme->phy_addr_array[i];
if (phy_addr) {
page = phys_to_page(phy_addr);
if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
__free_pages(page, HPAGE_PMD_ORDER);
pme->phy_addr_array[i] = 0;
}
}
}
return ret;
+}
/*
 * free_unmap_pages - free the pages of the entries that were not remapped.
 * @pmi:   page_map_info being remapped
 * @pme:   entry that was being processed when the remap failed
 * @index: index of the entry following @pme
 *
 * Starts at the entry after @pme and frees every recorded page that does
 * not carry PAGE_FLAGS_CHECK_RESERVED, clearing its slot.
 */
+static void free_unmap_pages(struct page_map_info *pmi,
	struct page_map_entry *pme,
	unsigned int index)
+{
	unsigned int i, j;
	unsigned long phy_addr;
	unsigned int order;
	struct page *page;

	pme = (struct page_map_entry *)(next_pme(pme));
	for (i = index; i < pmi->entry_num; i++) {
		for (j = 0; j < pme->nr_pages; j++) {
			/* The slot index is j; i only counts entries. */
			phy_addr = pme->phy_addr_array[j];
			if (phy_addr) {
				page = phys_to_page(phy_addr);
				order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
				if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
					__free_pages(page, order);
					pme->phy_addr_array[j] = 0;
				}
			}
		}
		pme = (struct page_map_entry *)(next_pme(pme));
	}
+}
/*
 * do_mem_remap - map the pages recorded for @pid into @mm.
 * @pid: pid the pages were recorded under
 * @mm:  address space of the task being restored
 *
 * Walks the entries of the pid's page_map_info and the vma list of @mm in
 * step. An entry whose virt_addr falls inside the current vma is remapped
 * (entries in non-anonymous vmas are skipped); otherwise the walk moves to
 * the next vma.
 *
 * Returns 0 on success, -EFAULT (as in the original interface) when the
 * reservation failed earlier or no record exists for @pid, or the failing
 * vm_fault_t code after the pages of the remaining entries were freed.
 */
+vm_fault_t do_mem_remap(int pid, struct mm_struct *mm)
+{
	unsigned int i = 0;
	vm_fault_t ret = 0;
	struct vm_area_struct *vma;
	struct page_map_info *pmi;
	struct page_map_entry *pme;
	unsigned long flags;

	if (reserve_user_map_pages_fail)
		return -EFAULT;
	pmi = get_page_map_info(pid);
	if (!pmi)
		return -EFAULT;

	/* From here on free_all_reserved_pages() must leave this pid alone. */
	spin_lock_irqsave(&page_map_entry_lock, flags);
	pmi->disable_free_page = true;
	spin_unlock_irqrestore(&page_map_entry_lock, flags);

	down_write(&mm->mmap_lock);
	pme = pmi->pme;
	vma = mm->mmap;
	while ((i < pmi->entry_num) && (vma != NULL)) {
		if (pme->virt_addr >= vma->vm_start && pme->virt_addr
			< vma->vm_end) {
			i++;
			if (!vma_is_anonymous(vma)) {
				pme = (struct page_map_entry *)(next_pme(pme));
				continue;
			}
			if (!pme->is_huge_page)
				ret = remap_normal_pages(mm, vma, pme);
			else
				ret = remap_huge_pmd_pages(mm, vma, pme);
			/* vm_fault_t is unsigned: test for non-zero, not "< 0". */
			if (ret)
				goto free;
			pme = (struct page_map_entry *)(next_pme(pme));
		} else {
			vma = vma->vm_next;
		}
	}
	up_write(&mm->mmap_lock);
	return 0;
+free:
	free_unmap_pages(pmi, pme, i);
	up_write(&mm->mmap_lock);
	return ret;
+}
+EXPORT_SYMBOL_GPL(do_mem_remap);
+#if defined(CONFIG_ARM64)
/*
 * Entry point for the arch code: translate the physical address of the
 * reserved region and initialise the bookkeeping in it. A zero address or
 * zero size is ignored.
 */
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
	struct pin_mem_dump_info *pmdi;

	if (map_addr == 0 || map_size == 0)
		return;

	pmdi = (struct pin_mem_dump_info *)phys_to_virt(map_addr);
	init_page_map_info(pmdi, map_size);
+}
+#else
/* Other architectures: nothing is initialised. */
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+}
+#endif
/*
 * Free every recorded page of every page_map_info, except those whose
 * disable_free_page flag is set. Pages carrying PAGE_FLAGS_CHECK_RESERVED
 * are skipped; freed slots are cleared. Does nothing when the reserved area
 * is not set up or the reservation pass failed.
 */
+static void free_all_reserved_pages(void)
+{
	unsigned int ent, slot, pid_idx, order;
	struct page_map_info *pmi;
	struct page_map_entry *pme;
	struct page *page;
	unsigned long phy_addr;

	if (!user_space_reserve_start)
		return;
	if (reserve_user_map_pages_fail)
		return;

	for (pid_idx = 0; pid_idx < pin_pid_num; pid_idx++) {
		pmi = &user_space_reserve_start[pid_idx];
		if (pmi->disable_free_page)
			continue;

		pme = pmi->pme;
		for (ent = 0; ent < pmi->entry_num; ent++) {
			for (slot = 0; slot < pme->nr_pages; slot++) {
				order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
				phy_addr = pme->phy_addr_array[slot];
				if (!phy_addr)
					continue;
				page = phys_to_page(phy_addr);
				if (page->flags & PAGE_FLAGS_CHECK_RESERVED)
					continue;
				__free_pages(page, order);
				pme->phy_addr_array[slot] = 0;
			}
			pme = (struct page_map_entry *)next_pme(pme);
		}
	}
+}
+/* Clear all pin memory record. */
/*
 * clear_pin_memory_record - drop all pin-memory records.
 *
 * Frees the still-reserved pages, resets both pid counters to 0 and rewinds
 * the entry cursor to __page_map_entry_start. The lock is taken with
 * spin_lock_irqsave(), so it is released with spin_unlock_irqrestore() to
 * restore the saved interrupt state.
 */
+void clear_pin_memory_record(void)
+{
	unsigned long flags;

	spin_lock_irqsave(&page_map_entry_lock, flags);
	free_all_reserved_pages();
	if (pin_pid_num_addr) {
		*pin_pid_num_addr = 0;
		pin_pid_num = 0;
		page_map_entry_start = (struct page_map_entry *)__page_map_entry_start;
	}
	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
+EXPORT_SYMBOL_GPL(clear_pin_memory_record);
+#endif /* CONFIG_PIN_MEMORY */
--
2.9.5