Kernel
May 2021
- 9 participants
- 79 discussions

[PATCH kernel-4.19 1/5] mm/vmscan: count layzfree pages and fix nr_isolated_* mismatch
by Yang Yingliang 08 May '21
From: Jaewon Kim <jaewon31.kim(a)samsung.com>
mainline inclusion
from mainline-v5.8-rc1
commit 1f318a9b0dc3990490e98eef48f21e6f15185781
category: bugfix
bugzilla: 36239
CVE: NA
-------------------------------------------------
Fix an nr_isolate_* mismatch problem between cma and dirty lazyfree pages.
If try_to_unmap_one is used for reclaim and it detects a dirty lazyfree
page, then the lazyfree page is changed to a normal anon page having
SwapBacked by commit 802a3a92ad7a ("mm: reclaim MADV_FREE pages"). Even
with the change, the reclaim context correctly counts isolated files because
it uses is_file_lru to distinguish files. And the change to anon does not
happen if try_to_unmap_one is used for migration. So a migration context
like compaction also correctly counts isolated files even though it uses
page_is_file_lru instead of is_file_lru. Recently page_is_file_cache was
renamed to page_is_file_lru by commit 9de4f22a60f7 ("mm: code cleanup for
MADV_FREE").
But the nr_isolate_* mismatch problem happens on cma alloc. There is
reclaim_clean_pages_from_list which is being used only by cma. It was
introduced by commit 02c6de8d757c ("mm: cma: discard clean pages during
contiguous allocation instead of migration") to reclaim clean file pages
without migration. The cma alloc uses both reclaim_clean_pages_from_list
and migrate_pages, and it uses page_is_file_lru to count isolated files.
If there are dirty lazyfree pages allocated from the cma memory region, the
pages are counted as isolated file at the beginning but are counted as
isolated anon after reclaim finishes.
Mem-Info:
Node 0 active_anon:3045904kB inactive_anon:611448kB active_file:14892kB inactive_file:205636kB unevictable:10416kB isolated(anon):0kB isolated(file):37664kB mapped:630216kB dirty:384kB writeback:0kB shmem:42576kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
As in the log above, there were too many isolated files, 37664kB, which
triggers too_many_isolated in reclaim even when there is actually no
isolated file system-wide. It can be reproduced by running two programs,
one writing to MADV_FREE pages and one doing cma alloc, respectively.
Although isolated anon is 0, I found that the internal value of isolated
anon was the negative of the isolated file count.
Fix this by compensating the isolated count for both LRU lists. Count
non-discarded lazyfree pages in shrink_page_list, then compensate the
counted number in reclaim_clean_pages_from_list.
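For reference, a minimal userspace sketch of the MADV_FREE half of the
reproducer described above (illustrative, not part of this patch; the
concurrent cma alloc half is platform-specific and omitted, and the buffer
size and timing are arbitrary):

/* Keep dirtying MADV_FREE'd anonymous memory so reclaim sees dirty
 * lazyfree pages; pair with a concurrent cma allocation to observe the
 * isolated(file) skew this patch fixes. */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_FREE
#define MADV_FREE 8     /* fallback for older libc headers */
#endif

int main(void)
{
        size_t len = 64UL << 20;        /* 64 MB, arbitrary */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;
        for (;;) {
                memset(buf, 0x5a, len);         /* dirty the pages */
                madvise(buf, len, MADV_FREE);   /* mark them lazyfree */
                usleep(1000);                   /* let reclaim/cma run */
                memset(buf, 0xa5, len);         /* re-dirty after MADV_FREE */
        }
}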
Reported-by: Yong-Taek Lee <ytk.lee(a)samsung.com>
Suggested-by: Minchan Kim <minchan(a)kernel.org>
Signed-off-by: Jaewon Kim <jaewon31.kim(a)samsung.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Acked-by: Minchan Kim <minchan(a)kernel.org>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Marek Szyprowski <m.szyprowski(a)samsung.com>
Cc: Michal Nazarewicz <mina86(a)mina86.com>
Cc: Shaohua Li <shli(a)fb.com>
Link: http://lkml.kernel.org/r/20200426011718.30246-1-jaewon31.kim@samsung.com
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
include/linux/vmstat.h | 1 +
mm/vmscan.c | 27 ++++++++++++++++++++++-----
2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index f25cef84b41db..0beaea0e5b832 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -29,6 +29,7 @@ struct reclaim_stat {
unsigned nr_activate;
unsigned nr_ref_keep;
unsigned nr_unmap_fail;
+ unsigned nr_lazyfree_fail;
};
#ifdef CONFIG_VM_EVENT_COUNTERS
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6f3c655fc8879..bedea8b4024a8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1129,6 +1129,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
unsigned nr_immediate = 0;
unsigned nr_ref_keep = 0;
unsigned nr_unmap_fail = 0;
+ unsigned nr_lazyfree_fail = 0;
cond_resched();
@@ -1336,11 +1337,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_mapped(page)) {
enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+ bool was_swapbacked = PageSwapBacked(page);
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
+
if (!try_to_unmap(page, flags)) {
nr_unmap_fail++;
+ if (!was_swapbacked && PageSwapBacked(page))
+ nr_lazyfree_fail++;
goto activate_locked;
}
}
@@ -1519,6 +1524,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
stat->nr_activate = pgactivate;
stat->nr_ref_keep = nr_ref_keep;
stat->nr_unmap_fail = nr_unmap_fail;
+ stat->nr_lazyfree_fail = nr_lazyfree_fail;
}
return nr_reclaimed;
}
@@ -1531,7 +1537,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
- unsigned long ret;
+ struct reclaim_stat stat;
+ unsigned long nr_reclaimed;
struct page *page, *next;
LIST_HEAD(clean_pages);
@@ -1543,11 +1550,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
}
- ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, NULL, true);
+ nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+ TTU_IGNORE_ACCESS, &stat, true);
list_splice(&clean_pages, page_list);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
- return ret;
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
+ /*
+ * Since lazyfree pages are isolated from file LRU from the beginning,
+ * they will rotate back to anonymous LRU in the end if it failed to
+ * discard so isolated count will be mismatched.
+ * Compensate the isolated count for both LRU lists.
+ */
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
+ stat.nr_lazyfree_fail);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
+ -stat.nr_lazyfree_fail);
+ return nr_reclaimed;
}
/*
--
2.25.1

08 May '21
From: Daniel Borkmann <daniel(a)iogearbox.net>
mainline inclusion
from mainline-v5.12-rc8
commit 6f55b2f2a1178856c19bbce2f71449926e731914
category: bugfix
bugzilla: NA
CVE: CVE-2021-29155
--------------------------------
Small refactor to drag off_reg into sanitize_ptr_alu(), so that later on we
can use off_reg to generalize some of the checks for all pointer types.
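For context, a hedged BPF-C sketch (not from this patch; the map name,
program type and offsets are placeholders) of the kind of pointer
arithmetic that sanitize_ptr_alu() bounds-checks, with off_reg holding the
runtime scalar offset:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u64);
} vals SEC(".maps");

SEC("socket")
int ptr_alu_example(struct __sk_buff *skb)
{
        __u32 key = 0;
        __u64 off = skb->len & 0x7;     /* runtime scalar, bounded to [0, 7] */
        __u64 *val = bpf_map_lookup_elem(&vals, &key);

        if (!val)
                return 0;
        /* ALU on a map value pointer: the verifier must prove the access
         * stays inside the 8-byte value before allowing the load. */
        return *((__u8 *)val + off);
}

char _license[] SEC("license") = "GPL";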
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend(a)gmail.com>
Acked-by: Alexei Starovoitov <ast(a)kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: Kuohai Xu <xukuohai(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
kernel/bpf/verifier.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f9d8fc953b70d..4cb746168e4e6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2719,11 +2719,12 @@ static int sanitize_val_alu(struct bpf_verifier_env *env,
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn,
const struct bpf_reg_state *ptr_reg,
- struct bpf_reg_state *dst_reg,
- bool off_is_neg)
+ const struct bpf_reg_state *off_reg,
+ struct bpf_reg_state *dst_reg)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_insn_aux_data *aux = cur_aux(env);
+ bool off_is_neg = off_reg->smin_value < 0;
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
@@ -2847,7 +2848,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
switch (opcode) {
case BPF_ADD:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg);
if (ret < 0) {
verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst);
return ret;
@@ -2902,7 +2903,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
break;
case BPF_SUB:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg);
if (ret < 0) {
verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst);
return ret;
--
2.25.1

08 May '21
From: Jingxian He <hejingxian(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
The pid recover method added a new member to task_struct, which causes a
kabi change problem. Use a KABI_RESERVE slot of task_struct instead of
adding a new member to task_struct.
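A minimal standalone sketch of the kabi-preserving pattern applied below
(illustrative, not part of this patch; the struct is simplified and the
reserved-slot name shown is a hypothetical expansion of KABI_RESERVE(5)):

#include <sys/types.h>  /* pid_t, so the sketch builds on its own */

/* The new field is a union exactly the size of one reserved slot; when
 * genksyms computes symbol CRCs (__GENKSYMS__ defined) it still sees the
 * plain reserved member, so the kabi checksum does not change. */
typedef union {
        pid_t fork_pid;                 /* new functional member */
        unsigned long pid_reserve;      /* keeps the slot's original size */
} fork_pid_t;

struct example_task {
        /* ... existing members ... */
#ifndef __GENKSYMS__
        fork_pid_t fork_pid_union;      /* what the compiled kernel uses */
#else
        unsigned long kabi_reserved5;   /* what the kabi checker sees */
#endif
};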
Signed-off-by: Jingxian He <hejingxian(a)huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng(a)huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi(a)huawei.com>
Reviewed-by: Hanjun Guo <guohanjun(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
drivers/char/pin_memory.c | 2 +-
include/linux/sched.h | 18 +++++++++++++++---
init/init_task.c | 4 +++-
kernel/pid.c | 8 ++++----
4 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c
index 05fa7cfde03b2..12ae7627bdf0c 100644
--- a/drivers/char/pin_memory.c
+++ b/drivers/char/pin_memory.c
@@ -164,7 +164,7 @@ static int set_fork_pid(unsigned long arg)
goto fault;
if (copy_from_user(&pid, buf, sizeof(int)))
goto fault;
- current->fork_pid = pid;
+ current->fork_pid_union.fork_pid = pid;
return 0;
fault:
return -EFAULT;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e532338aab8e7..b4c6dfe5e1d9d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -596,6 +596,13 @@ struct wake_q_node {
struct wake_q_node *next;
};
+#ifdef CONFIG_PID_RESERVE
+typedef union {
+ pid_t fork_pid;
+ unsigned long pid_reserve;
+} fork_pid_t;
+#endif
+
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
@@ -1206,9 +1213,6 @@ struct task_struct {
/* Used by LSM modules for access restriction: */
void *security;
#endif
-#ifdef CONFIG_PID_RESERVE
- int fork_pid;
-#endif
/*
* New fields for task_struct should be added above here, so that
@@ -1227,7 +1231,15 @@ struct task_struct {
KABI_RESERVE(3)
KABI_RESERVE(4)
#endif
+#ifdef CONFIG_PID_RESERVE
+#ifndef __GENKSYMS__
+ fork_pid_t fork_pid_union;
+#else
+ KABI_RESERVE(5)
+#endif
+#else
KABI_RESERVE(5)
+#endif
KABI_RESERVE(6)
KABI_RESERVE(7)
KABI_RESERVE(8)
diff --git a/init/init_task.c b/init/init_task.c
index dd02e591490ae..57ff82ab98110 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -181,7 +181,9 @@ struct task_struct init_task
.security = NULL,
#endif
#ifdef CONFIG_PID_RESERVE
- .fork_pid = 0,
+ .fork_pid_union = {
+ .fork_pid = 0,
+ },
#endif
};
EXPORT_SYMBOL(init_task);
diff --git a/kernel/pid.c b/kernel/pid.c
index d6a8009f96739..5666ce075487d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -189,7 +189,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
pid_min = RESERVED_PIDS;
#ifdef CONFIG_PID_RESERVE
- if (!current->fork_pid) {
+ if (!current->fork_pid_union.fork_pid) {
/*
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
@@ -199,9 +199,9 @@ struct pid *alloc_pid(struct pid_namespace *ns)
GFP_ATOMIC);
} else {
/* Try to free the reserved fork_pid, and then use it to alloc pid. */
- free_reserved_pid(&tmp->idr, current->fork_pid);
- pid_min = current->fork_pid;
- current->fork_pid = 0;
+ free_reserved_pid(&tmp->idr, current->fork_pid_union.fork_pid);
+ pid_min = current->fork_pid_union.fork_pid;
+ current->fork_pid_union.fork_pid = 0;
nr = idr_alloc(&tmp->idr, NULL, pid_min,
pid_min + 1,
GFP_ATOMIC);
--
2.25.1

[PATCH openEuler-1.0-LTS 5/6] config: enable kernel hotupgrade features by default
by Yang Yingliang 08 May '21
From: Jingxian He <hejingxian(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
Enable the kernel hot upgrade features by default:
1) Add pin memory method for checkpoint and restore:
CONFIG_PIN_MEMORY=y
CONFIG_PIN_MEMORY_DEV=m
2) Add pid recover method for checkpoint and restore:
CONFIG_PID_RESERVE=y
3) Add cpu park method:
CONFIG_ARM64_CPU_PARK=y
4) Add quick kexec support for the kernel:
CONFIG_QUICK_KEXEC=y
Signed-off-by: Sang Yan <sangyan(a)huawei.com>
Signed-off-by: Jingxian He <hejingxian(a)huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng(a)huawei.com>
Acked-by: Hanjun Guo <guohanjun(a)huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
arch/arm64/configs/hulk_defconfig | 6 ++++++
arch/arm64/configs/openeuler_defconfig | 5 +++++
2 files changed, 11 insertions(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig
index 8950fff02a43f..9a3db8a88170e 100644
--- a/arch/arm64/configs/hulk_defconfig
+++ b/arch/arm64/configs/hulk_defconfig
@@ -445,6 +445,7 @@ CONFIG_PARAVIRT=y
CONFIG_PARAVIRT_TIME_ACCOUNTING=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
+CONFIG_ARM64_CPU_PARK=y
# CONFIG_XEN is not set
CONFIG_FORCE_MAX_ZONEORDER=11
CONFIG_UNMAP_KERNEL_AT_EL0=y
@@ -701,6 +702,7 @@ CONFIG_CRYPTO_AES_ARM64_NEON_BLK=m
#
CONFIG_CRASH_CORE=y
CONFIG_KEXEC_CORE=y
+CONFIG_QUICK_KEXEC=y
CONFIG_OPROFILE_NMI_TIMER=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
@@ -983,6 +985,8 @@ CONFIG_IDLE_PAGE_TRACKING=y
# CONFIG_PERCPU_STATS is not set
# CONFIG_GUP_BENCHMARK is not set
CONFIG_ARCH_HAS_PTE_SPECIAL=y
+CONFIG_PIN_MEMORY=y
+CONFIG_PID_RESERVE=y
CONFIG_NET=y
CONFIG_NET_INGRESS=y
CONFIG_NET_EGRESS=y
@@ -2956,6 +2960,8 @@ CONFIG_TCG_CRB=y
# CONFIG_DEVPORT is not set
# CONFIG_XILLYBUS is not set
CONFIG_HISI_SVM=y
+CONFIG_PIN_MEMORY_DEV=m
+
#
# I2C support
#
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 89dec8e1954fd..34b34fa103d0a 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -446,6 +446,7 @@ CONFIG_PARAVIRT=y
CONFIG_PARAVIRT_TIME_ACCOUNTING=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
+CONFIG_ARM64_CPU_PARK=y
# CONFIG_XEN is not set
CONFIG_FORCE_MAX_ZONEORDER=14
CONFIG_UNMAP_KERNEL_AT_EL0=y
@@ -691,6 +692,7 @@ CONFIG_CRYPTO_AES_ARM64_NEON_BLK=m
#
CONFIG_CRASH_CORE=y
CONFIG_KEXEC_CORE=y
+CONFIG_QUICK_KEXEC=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_STATIC_KEYS_SELFTEST=y
@@ -976,6 +978,8 @@ CONFIG_FRAME_VECTOR=y
# CONFIG_PERCPU_STATS is not set
# CONFIG_GUP_BENCHMARK is not set
CONFIG_ARCH_HAS_PTE_SPECIAL=y
+CONFIG_PIN_MEMORY=y
+CONFIG_PID_RESERVE=y
CONFIG_NET=y
CONFIG_COMPAT_NETLINK_MESSAGES=y
CONFIG_NET_INGRESS=y
@@ -3045,6 +3049,7 @@ CONFIG_TCG_CRB=m
# CONFIG_DEVPORT is not set
# CONFIG_XILLYBUS is not set
CONFIG_HISI_SVM=y
+CONFIG_PIN_MEMORY_DEV=m
#
# I2C support
--
2.25.1

[PATCH openEuler-1.0-LTS 4/6] kexec: Add quick kexec support for kernel
by Yang Yingliang 08 May '21
From: Jingxian He <hejingxian(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
In normal kexec, relocating the kernel may take 5 ~ 10 seconds to copy all
segments from vmalloced memory to kernel boot memory, because the MMU is
disabled.
We introduce quick kexec to save this memory-copying time, just like
kdump (kexec on crash), by using the reserved memory region "Quick Kexec".
The quick kimage is constructed in the same way as the crash kernel image,
and then all segments of the kimage are simply copied to the reserved
memory.
We also add this support to the kexec_load syscall using the KEXEC_QUICK
flag.
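For illustration, a hedged userspace sketch (not part of this patch) of
loading segments with the new KEXEC_QUICK flag; the segment contents, load
address and entry point are placeholders (a real loader such as kexec-tools
builds them from the kernel image), and CAP_SYS_BOOT is required:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kexec.h>

#ifndef KEXEC_QUICK
#define KEXEC_QUICK 0x00000004  /* value from this patch's uapi change */
#endif

int main(void)
{
        static char payload[4096];              /* placeholder segment data */
        struct kexec_segment seg = {
                .buf = payload, .bufsz = sizeof(payload),
                .mem = (void *)0x80000000UL,    /* placeholder load address */
                .memsz = sizeof(payload),
        };
        unsigned long entry = 0x80000000UL;     /* placeholder entry point */

        /* Load one segment with the quick-kexec flag set. */
        if (syscall(SYS_kexec_load, entry, 1UL, &seg,
                    KEXEC_ARCH_DEFAULT | KEXEC_QUICK) != 0) {
                perror("kexec_load");
                return 1;
        }
        puts("quick kexec image loaded");
        return 0;
}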
Signed-off-by: Sang Yan <sangyan(a)huawei.com>
Signed-off-by: Jingxian He <hejingxian(a)huawei.com>
Reviewed-by: Wang Xiongfeng <wangxiongfeng2(a)huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi(a)huawei.com>
Reviewed-by: Hanjun Guo <guohanjun(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
arch/Kconfig | 9 ++++++++
arch/arm64/kernel/setup.c | 7 +++++++
arch/arm64/mm/init.c | 43 ++++++++++++++++++++++++++++++++++++++
include/linux/ioport.h | 1 +
include/linux/kexec.h | 11 +++++++++-
include/uapi/linux/kexec.h | 1 +
kernel/kexec.c | 10 +++++++++
kernel/kexec_core.c | 41 ++++++++++++++++++++++++++++--------
8 files changed, 113 insertions(+), 10 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 0dd478fcee0b6..e906cbb213444 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -18,6 +18,15 @@ config KEXEC_CORE
select CRASH_CORE
bool
+config QUICK_KEXEC
+ bool "Support for quick kexec"
+ depends on KEXEC_CORE
+ help
+ It uses pre-reserved memory to accelerate kexec, just like
+ crash kexec, loads new kernel and initrd to reserved memory,
+ and boots new kernel on that memory. It will save the time
+ of relocating kernel.
+
config HAVE_IMA_KEXEC
bool
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 76abf40f0949d..e23b804773874 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -282,6 +282,13 @@ static void __init request_standard_resources(void)
request_resource(res, &pin_memory_resource);
#endif
+#ifdef CONFIG_QUICK_KEXEC
+ if (quick_kexec_res.end &&
+ quick_kexec_res.start >= res->start &&
+ quick_kexec_res.end <= res->end)
+ request_resource(res, &quick_kexec_res);
+#endif
+
for (j = 0; j < res_mem_count; j++) {
if (res_resources[j].start >= res->start &&
res_resources[j].end <= res->end)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 71bfe4195f205..554824ce0f286 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -243,6 +243,45 @@ static void __init kexec_reserve_crashkres_pages(void)
}
#endif /* CONFIG_KEXEC_CORE */
+#ifdef CONFIG_QUICK_KEXEC
+static int __init parse_quick_kexec(char *p)
+{
+ if (!p)
+ return 0;
+
+ quick_kexec_res.end = PAGE_ALIGN(memparse(p, NULL));
+
+ return 0;
+}
+early_param("quickkexec", parse_quick_kexec);
+
+static void __init reserve_quick_kexec(void)
+{
+ unsigned long long mem_start, mem_len;
+
+ mem_len = quick_kexec_res.end;
+ if (mem_len == 0)
+ return;
+
+ /* Current arm64 boot protocol requires 2MB alignment */
+ mem_start = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,
+ mem_len, CRASH_ALIGN);
+ if (mem_start == 0) {
+ pr_warn("cannot allocate quick kexec mem (size:0x%llx)\n",
+ mem_len);
+ quick_kexec_res.end = 0;
+ return;
+ }
+
+ memblock_reserve(mem_start, mem_len);
+ pr_info("quick kexec mem reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ mem_start, mem_start + mem_len, mem_len >> 20);
+
+ quick_kexec_res.start = mem_start;
+ quick_kexec_res.end = mem_start + mem_len - 1;
+}
+#endif
+
#ifdef CONFIG_CRASH_DUMP
static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
const char *uname, int depth, void *data)
@@ -699,6 +738,10 @@ void __init arm64_memblock_init(void)
reserve_crashkernel();
+#ifdef CONFIG_QUICK_KEXEC
+ reserve_quick_kexec();
+#endif
+
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5330288da8dbb..1a438ecd15d01 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -139,6 +139,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
+ IORES_DESC_QUICK_KEXEC = 8,
};
/* helpers to define resources */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d6b8d0a69720d..98cf0fb091579 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -233,9 +233,10 @@ struct kimage {
unsigned long control_page;
/* Flags to indicate special processing */
- unsigned int type : 1;
+ unsigned int type : 2;
#define KEXEC_TYPE_DEFAULT 0
#define KEXEC_TYPE_CRASH 1
+#define KEXEC_TYPE_QUICK 2
unsigned int preserve_context : 1;
/* If set, we are using file mode kexec syscall */
unsigned int file_mode:1;
@@ -296,6 +297,11 @@ extern int kexec_load_disabled;
#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
#endif
+#ifdef CONFIG_QUICK_KEXEC
+#undef KEXEC_FLAGS
+#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_QUICK)
+#endif
+
/* List of defined/legal kexec file flags */
#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
KEXEC_FILE_NO_INITRAMFS)
@@ -305,6 +311,9 @@ extern int kexec_load_disabled;
extern struct resource crashk_res;
extern struct resource crashk_low_res;
extern note_buf_t __percpu *crash_notes;
+#ifdef CONFIG_QUICK_KEXEC
+extern struct resource quick_kexec_res;
+#endif
/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 6d112868272da..ca3cebebd6b20 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -12,6 +12,7 @@
/* kexec flags for different usage scenarios */
#define KEXEC_ON_CRASH 0x00000001
#define KEXEC_PRESERVE_CONTEXT 0x00000002
+#define KEXEC_QUICK 0x00000004
#define KEXEC_ARCH_MASK 0xffff0000
/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 68559808fdfa1..47dfad722b7ca 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -46,6 +46,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
int ret;
struct kimage *image;
bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+#ifdef CONFIG_QUICK_KEXEC
+ bool kexec_on_quick = flags & KEXEC_QUICK;
+#endif
if (kexec_on_panic) {
/* Verify we have a valid entry point */
@@ -71,6 +74,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
image->type = KEXEC_TYPE_CRASH;
}
+#ifdef CONFIG_QUICK_KEXEC
+ if (kexec_on_quick) {
+ image->control_page = quick_kexec_res.start;
+ image->type = KEXEC_TYPE_QUICK;
+ }
+#endif
+
ret = sanity_check_segment_list(image);
if (ret)
goto out_free_image;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index b36c9c46cd2ce..595a757af6568 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -74,6 +74,16 @@ struct resource crashk_low_res = {
.desc = IORES_DESC_CRASH_KERNEL
};
+#ifdef CONFIG_QUICK_KEXEC
+struct resource quick_kexec_res = {
+ .name = "Quick kexec",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_QUICK_KEXEC
+};
+#endif
+
int kexec_should_crash(struct task_struct *p)
{
/*
@@ -470,8 +480,10 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
return pages;
}
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
- unsigned int order)
+
+static struct page *kimage_alloc_special_control_pages(struct kimage *image,
+ unsigned int order,
+ unsigned long end)
{
/* Control pages are special, they are the intermediaries
* that are needed while we copy the rest of the pages
@@ -501,7 +513,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
size = (1 << order) << PAGE_SHIFT;
hole_start = (image->control_page + (size - 1)) & ~(size - 1);
hole_end = hole_start + size - 1;
- while (hole_end <= crashk_res.end) {
+ while (hole_end <= end) {
unsigned long i;
cond_resched();
@@ -536,7 +548,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
return pages;
}
-
struct page *kimage_alloc_control_pages(struct kimage *image,
unsigned int order)
{
@@ -547,8 +558,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
pages = kimage_alloc_normal_control_pages(image, order);
break;
case KEXEC_TYPE_CRASH:
- pages = kimage_alloc_crash_control_pages(image, order);
+ pages = kimage_alloc_special_control_pages(image, order,
+ crashk_res.end);
+ break;
+#ifdef CONFIG_QUICK_KEXEC
+ case KEXEC_TYPE_QUICK:
+ pages = kimage_alloc_special_control_pages(image, order,
+ quick_kexec_res.end);
break;
+#endif
}
return pages;
@@ -898,11 +916,11 @@ static int kimage_load_normal_segment(struct kimage *image,
return result;
}
-static int kimage_load_crash_segment(struct kimage *image,
+static int kimage_load_special_segment(struct kimage *image,
struct kexec_segment *segment)
{
- /* For crash dumps kernels we simply copy the data from
- * user space to it's destination.
+ /* For crash dumps kernels and quick kexec kernels
+ * we simply copy the data from user space to it's destination.
* We do things a page at a time for the sake of kmap.
*/
unsigned long maddr;
@@ -976,8 +994,13 @@ int kimage_load_segment(struct kimage *image,
result = kimage_load_normal_segment(image, segment);
break;
case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
+ result = kimage_load_special_segment(image, segment);
+ break;
+#ifdef CONFIG_QUICK_KEXEC
+ case KEXEC_TYPE_QUICK:
+ result = kimage_load_special_segment(image, segment);
break;
+#endif
}
return result;
--
2.25.1

[PATCH kernel-4.19 1/5] mm: add pin memory method for checkpoint add restore
by Yang Yingliang 08 May '21
From: Jingxian He <hejingxian(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
We can use checkpoint and restore in userspace (criu) to dump and restore
tasks when updating the kernel.
Currently, criu needs to dump all memory data of tasks to files. When the
memory size is very large (larger than 1G), dumping the data takes a long
time (more than 1 min).
By pinning the memory data of tasks and collecting the corresponding
physical page mapping info in the checkpoint process, we can remap the
physical pages to the restored tasks after upgrading the kernel. This pin
memory method can restore the task data within one second.
The pin memory area info is saved in the reserved memblock, which remains
usable across the kernel update process.
The pin memory driver provides the following ioctl commands for criu (a
usage sketch follows the list):
1) SET_PIN_MEM_AREA:
Set a pin memory area, which can be remapped to the restored task.
2) CLEAR_PIN_MEM_AREA:
Clear the pin memory area info, which enables the user to reset the pinned
data.
3) REMAP_PIN_MEM_AREA:
Remap the pages of the pinned memory to the restored task.
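A hedged usage sketch (not part of this patch) of pinning one virtual range
of a target task through /dev/pinmem; the request definitions mirror the
driver added below, while the pid and address range are placeholders:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define MAX_PIN_MEM_AREA_NUM 16
struct _pin_mem_area {
        unsigned long virt_start;
        unsigned long virt_end;
};
struct pin_mem_area_set {
        unsigned int pid;
        unsigned int area_num;
        struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM];
};
#define PIN_MEM_MAGIC 0x59
#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, 1, struct pin_mem_area_set)

int main(void)
{
        int fd = open("/dev/pinmem", O_RDWR);
        struct pin_mem_area_set pmas = {
                .pid = 1234,                    /* placeholder target pid */
                .area_num = 1,
                .mem_area = { { 0x400000, 0x600000 } }, /* placeholder range */
        };

        /* Ask the driver to pin the given range of the target task. */
        if (fd < 0 || ioctl(fd, SET_PIN_MEM_AREA, &pmas) < 0) {
                perror("SET_PIN_MEM_AREA");
                return 1;
        }
        close(fd);
        return 0;
}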
v1->v2:
- Use pagemap_walk instead of pagemap_read due to difference
between kernel-4.19 and kernel-5.10.
v2->v3:
- Remove pointless 'goto' in early_param parse.
- Compact anomaly process in remap_huge_pmd_pages function.
v3->v4:
- Use kzalloc instead of kmalloc when allocating pagemap_walk.
v4->v5:
- Declare global variables as static.
- Improve input parameter checks for do_mem_remap.
Signed-off-by: Jingxian He <hejingxian(a)huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng(a)huawei.com>
Acked-by: Hanjun Guo <guohanjun(a)huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
arch/arm64/kernel/setup.c | 8 +
arch/arm64/mm/init.c | 57 ++
drivers/char/Kconfig | 7 +
drivers/char/Makefile | 1 +
drivers/char/pin_memory.c | 213 +++++++
fs/proc/task_mmu.c | 136 +++++
include/linux/crash_core.h | 5 +
include/linux/pin_mem.h | 93 ++++
kernel/crash_core.c | 11 +
lib/Makefile | 2 +
mm/Kconfig | 8 +
mm/Makefile | 1 +
mm/huge_memory.c | 62 +++
mm/memory.c | 69 +++
mm/pin_mem.c | 1074 ++++++++++++++++++++++++++++++++++++
15 files changed, 1747 insertions(+)
create mode 100644 drivers/char/pin_memory.c
create mode 100644 include/linux/pin_mem.h
create mode 100644 mm/pin_mem.c
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 64f12477016a1..54b35b6bd3041 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -42,6 +42,9 @@
#include <linux/psci.h>
#include <linux/sched/task.h>
#include <linux/mm.h>
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_mem.h>
+#endif
#include <asm/acpi.h>
#include <asm/fixmap.h>
@@ -273,6 +276,11 @@ static void __init request_standard_resources(void)
crashk_res.end <= res->end)
request_resource(res, &crashk_res);
#endif
+#ifdef CONFIG_PIN_MEMORY
+ if (pin_memory_resource.end && pin_memory_resource.start >= res->start &&
+ pin_memory_resource.end <= res->end)
+ request_resource(res, &pin_memory_resource);
+#endif
for (j = 0; j < res_mem_count; j++) {
if (res_resources[j].start >= res->start &&
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index fa37c4c2e7556..dad66d882c974 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -42,6 +42,9 @@
#include <linux/crash_dump.h>
#include <linux/iommu.h>
#include <linux/suspend.h>
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_mem.h>
+#endif
#include <asm/boot.h>
#include <asm/fixmap.h>
@@ -92,6 +95,52 @@ early_param("initrd", early_initrd);
*/
#define MAX_USABLE_RANGES 2
+#ifdef CONFIG_PIN_MEMORY
+struct resource pin_memory_resource = {
+ .name = "Pin memory",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_MEM,
+ .desc = IOMMU_RESV_RESERVED
+};
+
+static void __init reserve_pin_memory_res(void)
+{
+ unsigned long long mem_start, mem_len;
+ int ret;
+
+ ret = parse_pin_memory(boot_command_line, memblock_phys_mem_size(),
+ &mem_len, &mem_start);
+ if (ret || !mem_len)
+ return;
+
+ mem_len = PAGE_ALIGN(mem_len);
+
+ if (!memblock_is_region_memory(mem_start, mem_len)) {
+ pr_warn("cannot reserve for pin memory: region is not memory!\n");
+ return;
+ }
+
+ if (memblock_is_region_reserved(mem_start, mem_len)) {
+ pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n");
+ return;
+ }
+
+ if (!IS_ALIGNED(mem_start, SZ_2M)) {
+ pr_warn("cannot reserve for pin memory: base address is not 2MB aligned\n");
+ return;
+ }
+
+ memblock_reserve(mem_start, mem_len);
+ pin_memory_resource.start = mem_start;
+ pin_memory_resource.end = mem_start + mem_len - 1;
+}
+#else
+static void __init reserve_pin_memory_res(void)
+{
+}
+#endif /* CONFIG_PIN_MEMORY */
+
#ifdef CONFIG_KEXEC_CORE
/*
@@ -582,6 +631,8 @@ void __init arm64_memblock_init(void)
else
arm64_dma_phys_limit = PHYS_MASK + 1;
+ reserve_pin_memory_res();
+
reserve_crashkernel();
reserve_elfcorehdr();
@@ -704,6 +755,12 @@ void __init mem_init(void)
/* this will put all unused low memory onto the freelists */
free_all_bootmem();
+#ifdef CONFIG_PIN_MEMORY
+ /* pre alloc the pages for pin memory */
+ init_reserve_page_map((unsigned long)pin_memory_resource.start,
+ (unsigned long)(pin_memory_resource.end - pin_memory_resource.start));
+#endif
+
kexec_reserve_crashkres_pages();
mem_init_print_info(NULL);
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 60e05b27b2a34..505562acf2755 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -562,6 +562,13 @@ config HISI_SVM
svm and share the virtual memory with hisilicon svm device.
When in doubt, say "N".
+config PIN_MEMORY_DEV
+ tristate "/dev/pinmem character device"
+ depends on PIN_MEMORY
+ default m
+ help
+ pin memory driver
+
endmenu
config RANDOM_TRUST_CPU
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 3adc5ced7bc54..8ded2f436a368 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -59,3 +59,4 @@ obj-$(CONFIG_XILLYBUS) += xillybus/
obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o
obj-$(CONFIG_ADI) += adi.o
obj-$(CONFIG_HISI_SVM) += svm.o
+obj-$(CONFIG_PIN_MEMORY_DEV) += pin_memory.o
diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c
new file mode 100644
index 0000000000000..9b50ab867c5d3
--- /dev/null
+++ b/drivers/char/pin_memory.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright @ Huawei Technologies Co., Ltd. 2020-2020. ALL rights reserved.
+ * Description: Euler pin memory driver
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm_types.h>
+#include <linux/processor.h>
+#include <uapi/asm-generic/ioctl.h>
+#include <uapi/asm-generic/mman-common.h>
+#include <uapi/asm/setup.h>
+#include <linux/pin_mem.h>
+#include <linux/sched/mm.h>
+
+#define MAX_PIN_MEM_AREA_NUM 16
+struct _pin_mem_area {
+ unsigned long virt_start;
+ unsigned long virt_end;
+};
+
+struct pin_mem_area_set {
+ unsigned int pid;
+ unsigned int area_num;
+ struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM];
+};
+
+#define PIN_MEM_MAGIC 0x59
+#define _SET_PIN_MEM_AREA 1
+#define _CLEAR_PIN_MEM_AREA 2
+#define _REMAP_PIN_MEM_AREA 3
+#define _FINISH_PIN_MEM_DUMP 4
+#define _INIT_PAGEMAP_READ 5
+#define _PIN_MEM_IOC_MAX_NR 5
+#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set)
+#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int)
+#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int)
+#define FINISH_PIN_MEM_DUMP _IOW(PIN_MEM_MAGIC, _FINISH_PIN_MEM_DUMP, int)
+#define INIT_PAGEMAP_READ _IOW(PIN_MEM_MAGIC, _INIT_PAGEMAP_READ, int)
+static int set_pin_mem(struct pin_mem_area_set *pmas)
+{
+ int i;
+ int ret = 0;
+ struct _pin_mem_area *pma;
+ struct mm_struct *mm;
+ struct task_struct *task;
+ struct pid *pid_s;
+
+ pid_s = find_get_pid(pmas->pid);
+ if (!pid_s) {
+ pr_warn("Get pid struct fail:%d.\n", pmas->pid);
+ return -EFAULT;
+ }
+ rcu_read_lock();
+ task = pid_task(pid_s, PIDTYPE_PID);
+ if (!task) {
+ pr_warn("Get task struct fail:%d.\n", pmas->pid);
+ goto fail;
+ }
+ mm = get_task_mm(task);
+ for (i = 0; i < pmas->area_num; i++) {
+ pma = &(pmas->mem_area[i]);
+ ret = pin_mem_area(task, mm, pma->virt_start, pma->virt_end);
+ if (ret) {
+ mmput(mm);
+ goto fail;
+ }
+ }
+ mmput(mm);
+ rcu_read_unlock();
+ put_pid(pid_s);
+ return ret;
+
+fail:
+ rcu_read_unlock();
+ put_pid(pid_s);
+ return -EFAULT;
+}
+
+static int set_pin_mem_area(unsigned long arg)
+{
+ struct pin_mem_area_set pmas;
+ void __user *buf = (void __user *)arg;
+
+ if (!access_ok(buf, sizeof(pmas)))
+ return -EFAULT;
+ if (copy_from_user(&pmas, buf, sizeof(pmas)))
+ return -EINVAL;
+ if (pmas.area_num > MAX_PIN_MEM_AREA_NUM) {
+ pr_warn("Input area_num is too large.\n");
+ return -EINVAL;
+ }
+
+ return set_pin_mem(&pmas);
+}
+
+static int pin_mem_remap(unsigned long arg)
+{
+ int pid;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ vm_fault_t ret;
+ void __user *buf = (void __user *)arg;
+ struct pid *pid_s;
+
+ if (!access_ok(buf, sizeof(int)))
+ return -EINVAL;
+ if (copy_from_user(&pid, buf, sizeof(int)))
+ return -EINVAL;
+
+ pid_s = find_get_pid(pid);
+ if (!pid_s) {
+ pr_warn("Get pid struct fail:%d.\n", pid);
+ return -EINVAL;
+ }
+ rcu_read_lock();
+ task = pid_task(pid_s, PIDTYPE_PID);
+ if (!task) {
+ pr_warn("Get task struct fail:%d.\n", pid);
+ goto fault;
+ }
+ mm = get_task_mm(task);
+ ret = do_mem_remap(pid, mm);
+ if (ret) {
+ pr_warn("Handle pin memory remap fail.\n");
+ mmput(mm);
+ goto fault;
+ }
+ mmput(mm);
+ rcu_read_unlock();
+ put_pid(pid_s);
+ return 0;
+
+fault:
+ rcu_read_unlock();
+ put_pid(pid_s);
+ return -EFAULT;
+}
+
+static long pin_memory_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long ret = 0;
+
+ if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC)
+ return -EINVAL;
+ if (_IOC_NR(cmd) > _PIN_MEM_IOC_MAX_NR)
+ return -EINVAL;
+
+ switch (cmd) {
+ case SET_PIN_MEM_AREA:
+ ret = set_pin_mem_area(arg);
+ break;
+ case CLEAR_PIN_MEM_AREA:
+ clear_pin_memory_record();
+ break;
+ case REMAP_PIN_MEM_AREA:
+ ret = pin_mem_remap(arg);
+ break;
+ case FINISH_PIN_MEM_DUMP:
+ ret = finish_pin_mem_dump();
+ break;
+ case INIT_PAGEMAP_READ:
+ ret = init_pagemap_read();
+ break;
+ default:
+ return -EINVAL;
+ }
+ return ret;
+}
+
+static const struct file_operations pin_memory_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = pin_memory_ioctl,
+ .compat_ioctl = pin_memory_ioctl,
+};
+
+static struct miscdevice pin_memory_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "pinmem",
+ .fops = &pin_memory_fops,
+};
+
+static int pin_memory_init(void)
+{
+ int err = misc_register(&pin_memory_miscdev);
+
+ if (!err)
+ pr_info("pin_memory init\n");
+ else
+ pr_warn("pin_memory init failed!\n");
+ return err;
+}
+
+static void pin_memory_exit(void)
+{
+ misc_deregister(&pin_memory_miscdev);
+ pr_info("pin_memory ko exists!\n");
+}
+
+module_init(pin_memory_init);
+module_exit(pin_memory_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Euler");
+MODULE_DESCRIPTION("pin memory");
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 66939a7998ab8..0417343481cd4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1575,6 +1575,142 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
return ret;
}
+#ifdef CONFIG_PIN_MEMORY
+static int get_pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct pagemapread *pm = walk->private;
+ spinlock_t *ptl;
+ pte_t *pte, *orig_pte;
+ int err = 0;
+ pagemap_entry_t pme;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ptl = pmd_trans_huge_lock(pmdp, vma);
+ if (ptl) {
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
+ struct page *page = NULL;
+
+ if (pmd_present(pmd)) {
+ page = pmd_page(pmd);
+ flags |= PM_PRESENT;
+ frame = pmd_pfn(pmd) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ }
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ else if (is_swap_pmd(pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+ unsigned long offset;
+
+ offset = swp_offset(entry) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+
+ flags |= PM_SWAP;
+ if (pmd_swp_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+ VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ page = migration_entry_to_page(entry);
+ }
+#endif
+ pme = make_pme(frame, flags);
+ err = add_to_pagemap(addr, &pme, pm);
+ spin_unlock(ptl);
+ return err;
+ }
+
+ if (pmd_trans_unstable(pmdp))
+ return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ return err;
+}
+
+void *create_pagemap_walk(void)
+{
+ struct pagemapread *pm;
+ struct mm_walk *pagemap_walk;
+
+ pagemap_walk = kzalloc(sizeof(struct mm_walk), GFP_KERNEL);
+ if (!pagemap_walk)
+ return NULL;
+ pm = kmalloc(sizeof(struct pagemapread), GFP_KERNEL);
+ if (!pm) {
+ goto out_free_walk;
+ }
+ pm->show_pfn = true;
+ pm->len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT) + 1;
+ pm->buffer = kmalloc_array(pm->len, PM_ENTRY_BYTES, GFP_KERNEL);
+ if (!pm->buffer)
+ goto out_free;
+
+ pagemap_walk->pmd_entry = get_pagemap_pmd_range;
+ pagemap_walk->pte_hole = pagemap_pte_hole;
+#ifdef CONFIG_HUGETLB_PAGE
+ pagemap_walk->hugetlb_entry = pagemap_hugetlb_range;
+#endif
+ pagemap_walk->private = pm;
+ return (void *)pagemap_walk;
+out_free:
+ kfree(pm);
+out_free_walk:
+ kfree(pagemap_walk);
+ return NULL;
+}
+
+void free_pagemap_walk(void *mem_walk)
+{
+ struct pagemapread *pm;
+ struct mm_walk *pagemap_walk = (struct mm_walk *)mem_walk;
+
+ if (!pagemap_walk)
+ return;
+ if (pagemap_walk->private) {
+ pm = (struct pagemapread *)pagemap_walk->private;
+ kfree(pm->buffer);
+ kfree(pm);
+ pagemap_walk->private = NULL;
+ }
+ kfree(pagemap_walk);
+}
+
+int pagemap_get(struct mm_struct *mm, void *mem_walk,
+ unsigned long start_vaddr, unsigned long end_vaddr,
+ unsigned long *pte_entry, unsigned int *count)
+{
+ int i, ret;
+ struct pagemapread *pm;
+ unsigned long end;
+ struct mm_walk *pagemap_walk = (struct mm_walk *)mem_walk;
+
+ if (!pte_entry || !mm || !pagemap_walk)
+ return -EFAULT;
+
+ pm = (struct pagemapread *)pagemap_walk->private;
+ pagemap_walk->mm = mm;
+ pm->pos = 0;
+ end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
+ if (end > end_vaddr)
+ end = end_vaddr;
+ ret = walk_page_range(start_vaddr, end, pagemap_walk);
+ *count = pm->pos;
+ for (i = 0; i < pm->pos; i++)
+ pte_entry[i] = pm->buffer[i].pme;
+ return ret;
+}
+#endif
+
static int pagemap_open(struct inode *inode, struct file *file)
{
struct mm_struct *mm;
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 525510a9f965f..a74a87857865e 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -75,4 +75,9 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
unsigned long long *crash_size, unsigned long long *crash_base);
+#ifdef CONFIG_PIN_MEMORY
+int __init parse_pin_memory(char *cmdline, unsigned long long system_ram,
+ unsigned long long *pin_size, unsigned long long *pin_base);
+#endif
+
#endif /* LINUX_CRASH_CORE_H */
diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h
new file mode 100644
index 0000000000000..61e925a455de2
--- /dev/null
+++ b/include/linux/pin_mem.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for check point and restore task.
+ */
+#ifndef _LINUX_PIN_MEMORY_H
+#define _LINUX_PIN_MEMORY_H
+
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/errno.h>
+#include <linux/mm_types.h>
+#include <linux/err.h>
+#ifdef CONFIG_ARM64
+#include <linux/ioport.h>
+#endif
+
+#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
+
+#define COLLECT_PAGES_FINISH 0
+#define COLLECT_PAGES_NEED_CONTINUE 1
+#define COLLECT_PAGES_FAIL -1
+
+#define COMPOUND_PAD_MASK 0xffffffff
+#define COMPOUND_PAD_START 0x88
+#define COMPOUND_PAD_DELTA 0x40
+#define LIST_POISON4 0xdead000000000400
+#define PAGE_FLAGS_CHECK_RESERVED (1UL << PG_reserved)
+#define SHA256_DIGEST_SIZE 32
+#define next_pme(pme) ((unsigned long *)((pme) + 1) + (pme)->nr_pages)
+#define PIN_MEM_DUMP_MAGIC 0xfeab000000001acd
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_PRESENT BIT_ULL(63)
+#define PM_SWAP BIT_ULL(62)
+#define IS_PTE_PRESENT(entry) (((entry) & PM_PFRAME_MASK) && ((entry) & PM_PRESENT))
+#define NEXT_PIN_ADDR(next, end_addr) (((next) + HPAGE_PMD_SIZE) > (end_addr) ? \
+ (end_addr) : ((next) + HPAGE_PMD_SIZE))
+
+struct page_map_entry {
+ unsigned long virt_addr;
+ unsigned int nr_pages;
+ unsigned int is_huge_page;
+ unsigned long redirect_start;
+ unsigned long phy_addr_array[0];
+};
+
+struct page_map_info {
+ int pid;
+ int pid_reserved;
+ unsigned int entry_num;
+ int disable_free_page;
+ struct page_map_entry *pme;
+};
+
+struct pin_mem_dump_info {
+ char sha_digest[SHA256_DIGEST_SIZE];
+ unsigned long magic;
+ unsigned int pin_pid_num;
+ struct page_map_info pmi_array[0];
+};
+
+struct redirect_info {
+ unsigned int redirect_pages;
+ unsigned int redirect_index[0];
+};
+
+extern struct page_map_info *get_page_map_info(int pid);
+extern struct page_map_info *create_page_map_info(int pid);
+extern vm_fault_t do_mem_remap(int pid, struct mm_struct *mm);
+extern vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page);
+extern void clear_pin_memory_record(void);
+extern int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
+ unsigned long start_addr, unsigned long end_addr);
+extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page);
+extern int finish_pin_mem_dump(void);
+
+extern void *create_pagemap_walk(void);
+extern void free_pagemap_walk(void *mem_walk);
+extern int pagemap_get(struct mm_struct *mm, void *mem_walk,
+ unsigned long start_vaddr, unsigned long end_vaddr,
+ unsigned long *pte_entry, unsigned int *count);
+
+extern int init_pagemap_read(void);
+/* reserve space for pin memory*/
+#ifdef CONFIG_ARM64
+extern struct resource pin_memory_resource;
+#endif
+extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size);
+
+#endif /* CONFIG_PIN_MEMORY */
+#endif /* _LINUX_PIN_MEMORY_H */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 933cb3e45b987..7e65e10088440 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -294,6 +294,17 @@ int __init parse_crashkernel_low(char *cmdline,
"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
+#ifdef CONFIG_PIN_MEMORY
+int __init parse_pin_memory(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *pin_size,
+ unsigned long long *pin_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, pin_size, pin_base,
+ "pinmemory=", NULL);
+}
+#endif
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
diff --git a/lib/Makefile b/lib/Makefile
index 71443e03deb36..f5ee8e881c59d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -276,6 +276,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o
obj-$(CONFIG_PARMAN) += parman.o
+obj-$(CONFIG_PIN_MEMORY) += sha256.o
+
# GCC library routines
obj-$(CONFIG_GENERIC_LIB_ASHLDI3) += ashldi3.o
obj-$(CONFIG_GENERIC_LIB_ASHRDI3) += ashrdi3.o
diff --git a/mm/Kconfig b/mm/Kconfig
index f622df92712e2..f8f2db73ceb2c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -801,4 +801,12 @@ config GUP_BENCHMARK
config ARCH_HAS_PTE_SPECIAL
bool
+config PIN_MEMORY
+ bool "Support for pin memory"
+ depends on MMU && ARM64
+ help
+ Say y here to enable the pin memory feature for checkpoint
+ and restore. We can pin the memory data of tasks and collect
+ the corresponding physical pages mapping info in checkpoint,
+ and remap the physical pages to restore tasks in restore.
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index eb9545fbb20d9..5ef96ea79c8b2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -107,3 +107,4 @@ obj-$(CONFIG_HMM) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o
obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o
+obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 64f8c44cb65e1..31628c65642a6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3094,3 +3094,65 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
#endif
+
+#ifdef CONFIG_PIN_MEMORY
+vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page)
+{
+ gfp_t gfp;
+ pgtable_t pgtable;
+ spinlock_t *ptl;
+ pmd_t entry;
+ vm_fault_t ret = 0;
+ struct mem_cgroup *memcg;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
+ return VM_FAULT_OOM;
+ gfp = alloc_hugepage_direct_gfpmask(vma);
+
+ prep_transhuge_page(page);
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ pgtable = pte_alloc_one(vma->vm_mm, address);
+ if (unlikely(!pgtable)) {
+ ret = VM_FAULT_OOM;
+ goto release;
+ }
+ __SetPageUptodate(page);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (unlikely(!pmd_none(*pmd))) {
+ goto unlock_release;
+ } else {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock_release;
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, address, true);
+ mem_cgroup_commit_charge(page, memcg, false, true);
+ lru_cache_add_active_or_unevictable(page, vma);
+ pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable);
+ set_pmd_at(vma->vm_mm, address, pmd, entry);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm_inc_nr_ptes(vma->vm_mm);
+ spin_unlock(ptl);
+ count_vm_event(THP_FAULT_ALLOC);
+ }
+
+ return 0;
+
+unlock_release:
+ spin_unlock(ptl);
+release:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ mem_cgroup_cancel_charge(page, memcg, true);
+ put_page(page);
+ return ret;
+}
+#endif
diff --git a/mm/memory.c b/mm/memory.c
index 6eb4e8e60284c..a2b34d20a1591 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4821,3 +4821,72 @@ void ptlock_free(struct page *page)
kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
+
+#ifdef CONFIG_PIN_MEMORY
+vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page)
+{
+ struct mem_cgroup *memcg;
+ pte_t entry;
+ spinlock_t *ptl;
+ pte_t *pte;
+ vm_fault_t ret = 0;
+
+ if (pte_alloc(vma->vm_mm, pmd, address))
+ return VM_FAULT_OOM;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmd)))
+ return 0;
+
+ /* Allocate our own private page. */
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+ false))
+ goto oom_free_page;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address,
+ &ptl);
+ if (!pte_none(*pte)) {
+ ret = VM_FAULT_FALLBACK;
+ goto release;
+ }
+
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(page, vma);
+
+ set_pte_at(vma->vm_mm, address, pte, entry);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte);
+
+unlock:
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+
+release:
+ mem_cgroup_cancel_charge(page, memcg, false);
+ put_page(page);
+ goto unlock;
+oom_free_page:
+ put_page(page);
+oom:
+ return VM_FAULT_OOM;
+}
+#endif
diff --git a/mm/pin_mem.c b/mm/pin_mem.c
new file mode 100644
index 0000000000000..56641d6e2f4e5
--- /dev/null
+++ b/mm/pin_mem.c
@@ -0,0 +1,1074 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for check point and restore task.
+ */
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sched/cputime.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/pin_mem.h>
+#include <linux/idr.h>
+#include <linux/page-isolation.h>
+#include <linux/sched/mm.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/sha256.h>
+
+#define MAX_PIN_PID_NUM 128
+#define DEFAULT_REDIRECT_SPACE_SIZE 0x100000
+
+static DEFINE_SPINLOCK(page_map_entry_lock);
+static DEFINE_MUTEX(pin_mem_mutex);
+static struct pin_mem_dump_info *pin_mem_dump_start;
+static unsigned int pin_pid_num;
+static unsigned int *pin_pid_num_addr;
+static struct page_map_entry *__page_map_entry_start;
+static unsigned long page_map_entry_end;
+static struct page_map_info *user_space_reserve_start;
+static struct page_map_entry *page_map_entry_start;
+
+unsigned int max_pin_pid_num __read_mostly;
+unsigned long redirect_space_size __read_mostly;
+static unsigned long redirect_space_start;
+static void *pin_mem_pagewalk;
+static unsigned long *pagemap_buffer;
+static int reserve_user_map_pages_fail;
+
+static int __init setup_max_pin_pid_num(char *str)
+{
+ int ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtouint(str, 10, &max_pin_pid_num);
+ if (ret) {
+ pr_warn("Unable to parse max pin pid num.\n");
+ } else {
+ if (max_pin_pid_num > MAX_PIN_PID_NUM) {
+ max_pin_pid_num = 0;
+ pr_warn("Input max_pin_pid_num is too large.\n");
+ }
+ }
+ return ret;
+}
+early_param("max_pin_pid_num", setup_max_pin_pid_num);
+
+static int __init setup_redirect_space_size(char *str)
+{
+ if (!str)
+ return 0;
+
+ redirect_space_size = memparse(str, NULL);
+ if (!redirect_space_size) {
+ pr_warn("Unable to parse redirect space size, use the default value.\n");
+ redirect_space_size = DEFAULT_REDIRECT_SPACE_SIZE;
+ }
+ return 0;
+}
+early_param("redirect_space_size", setup_redirect_space_size);
+
+struct page_map_info *create_page_map_info(int pid)
+{
+ struct page_map_info *new;
+
+ if (!user_space_reserve_start)
+ return NULL;
+
+ if (pin_pid_num >= max_pin_pid_num) {
+ pr_warn("Pin pid num too large than max_pin_pid_num, fail create: %d!", pid);
+ return NULL;
+ }
+ new = (struct page_map_info *)(user_space_reserve_start + pin_pid_num);
+ new->pid = pid;
+ new->pme = NULL;
+ new->entry_num = 0;
+ new->pid_reserved = false;
+ new->disable_free_page = false;
+ (*pin_pid_num_addr)++;
+ pin_pid_num++;
+ return new;
+}
+EXPORT_SYMBOL_GPL(create_page_map_info);
+
+struct page_map_info *get_page_map_info(int pid)
+{
+ int i;
+
+ if (!user_space_reserve_start)
+ return NULL;
+
+ for (i = 0; i < pin_pid_num; i++) {
+ if (user_space_reserve_start[i].pid == pid)
+ return &(user_space_reserve_start[i]);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(get_page_map_info);
+
+static struct page *find_head_page(struct page *page)
+{
+ struct page *p = page;
+
+ while (!PageBuddy(p)) {
+ if (PageLRU(p))
+ return NULL;
+ p--;
+ }
+ return p;
+}
+
+static void spilt_page_area_left(struct zone *zone, struct free_area *area, struct page *page,
+ unsigned long size, int order)
+{
+ unsigned long cur_size = 1 << order;
+ unsigned long total_size = 0;
+
+ while (size && cur_size > size) {
+ cur_size >>= 1;
+ order--;
+ area--;
+ if (cur_size <= size) {
+ list_add(&page[total_size].lru, &area->free_list[MIGRATE_MOVABLE]);
+ atomic_set(&(page[total_size]._mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
+ set_page_private(&page[total_size], order);
+ set_pageblock_migratetype(&page[total_size], MIGRATE_MOVABLE);
+ area->nr_free++;
+ total_size += cur_size;
+ size -= cur_size;
+ }
+ }
+}
+
+static void spilt_page_area_right(struct zone *zone, struct free_area *area, struct page *page,
+ unsigned long size, int order)
+{
+ unsigned long cur_size = 1 << order;
+ struct page *right_page, *head_page;
+
+ right_page = page + size;
+ while (size && cur_size > size) {
+ cur_size >>= 1;
+ order--;
+ area--;
+ if (cur_size <= size) {
+ head_page = right_page - cur_size;
+ list_add(&head_page->lru, &area->free_list[MIGRATE_MOVABLE]);
+ atomic_set(&(head_page->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
+ set_page_private(head_page, order);
+ set_pageblock_migratetype(head_page, MIGRATE_MOVABLE);
+ area->nr_free++;
+ size -= cur_size;
+ right_page = head_page;
+ }
+ }
+}
+
+void reserve_page_from_buddy(unsigned long nr_pages, struct page *page)
+{
+ unsigned int current_order;
+ struct page *page_end;
+ struct free_area *area;
+ struct zone *zone;
+ struct page *head_page;
+
+ head_page = find_head_page(page);
+ if (!head_page) {
+ pr_warn("Find page head fail.");
+ return;
+ }
+
+ current_order = head_page->private;
+ page_end = head_page + (1 << current_order);
+ zone = page_zone(head_page);
+ area = &(zone->free_area[current_order]);
+ list_del(&head_page->lru);
+ atomic_set(&head_page->_mapcount, -1);
+ set_page_private(head_page, 0);
+ area->nr_free--;
+
+ if (head_page != page)
+ spilt_page_area_left(zone, area, head_page,
+ (unsigned long)(page - head_page), current_order);
+ page = page + nr_pages;
+ if (page < page_end) {
+ spilt_page_area_right(zone, area, page,
+ (unsigned long)(page_end - page), current_order);
+ } else if (page > page_end) {
+ pr_warn("Find page end smaller than page.");
+ }
+}
+
+static inline void reserve_user_normal_pages(struct page *page)
+{
+ atomic_inc(&page->_refcount);
+ reserve_page_from_buddy(1, page);
+}
+
+static void init_huge_pmd_pages(struct page *head_page)
+{
+ int i = 0;
+ struct page *page = head_page;
+ unsigned long *temp;
+ unsigned long compound_pad = COMPOUND_PAD_START;
+
+ __set_bit(PG_head, &page->flags);
+ __set_bit(PG_active, &page->flags);
+ atomic_set(&page->_refcount, 1);
+ page++;
+ i++;
+ page->compound_head = (unsigned long)head_page + 1;
+ page->_compound_pad_2 = (unsigned long)head_page & COMPOUND_PAD_MASK;
+ temp = (unsigned long *)(&(page->_compound_pad_2));
+ temp[1] = LIST_POISON4;
+ page->compound_dtor = HUGETLB_PAGE_DTOR + 1;
+ page->compound_order = HPAGE_PMD_ORDER;
+ page++;
+ i++;
+ page->compound_head = (unsigned long)head_page + 1;
+ page->_compound_pad_2 = (unsigned long)head_page + compound_pad;
+ i++;
+
+ INIT_LIST_HEAD(&(page->deferred_list));
+ for (; i < HPAGE_PMD_NR; i++) {
+ page = head_page + i;
+ page->compound_head = (unsigned long)head_page + 1;
+ compound_pad += COMPOUND_PAD_DELTA;
+ page->_compound_pad_2 = (unsigned long)head_page + compound_pad;
+ temp = (unsigned long *)(&(page->_compound_pad_2));
+ temp[1] = LIST_POISON4;
+ }
+}
+
+static inline void reserve_user_huge_pmd_pages(struct page *page)
+{
+ atomic_inc(&page->_refcount);
+ reserve_page_from_buddy((1 << HPAGE_PMD_ORDER), page);
+ init_huge_pmd_pages(page);
+}
+
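+/*
+ * Undo a partially completed reservation: free every page that was
+ * recorded before the failing position given by pid_index, entry_index
+ * and page_index.
+ */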
+void free_user_map_pages(unsigned int pid_index, unsigned int entry_index, unsigned int page_index)
+{
+ unsigned int i, j, index, order;
+ struct page_map_info *pmi;
+ struct page_map_entry *pme;
+ struct page *page;
+ unsigned long phy_addr;
+
+ for (index = 0; index < pid_index; index++) {
+ pmi = &(user_space_reserve_start[index]);
+ pme = pmi->pme;
+ for (i = 0; i < pmi->entry_num; i++) {
+ for (j = 0; j < pme->nr_pages; j++) {
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+ pme = (struct page_map_entry *)next_pme(pme);
+ }
+ }
+
+ pmi = &(user_space_reserve_start[index]);
+ pme = pmi->pme;
+ for (i = 0; i < entry_index; i++) {
+ for (j = 0; j < pme->nr_pages; j++) {
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+ pme = (struct page_map_entry *)next_pme(pme);
+ }
+
+ for (j = 0; j < page_index; j++) {
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+}
+
+bool check_redirect_end_valid(struct redirect_info *redirect_start,
+ unsigned long max_redirect_page_num)
+{
+ unsigned long redirect_end;
+
+ redirect_end = ((unsigned long)(redirect_start + 1) +
+ max_redirect_page_num * sizeof(unsigned int));
+ if (redirect_end > redirect_space_start + redirect_space_size)
+ return false;
+ return true;
+}
+
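+/*
+ * Walk all recorded page_map_info entries and re-reserve their pages:
+ * pages with no remaining users are pulled back out of the buddy
+ * allocator; pages that are already in use but carry a reserved page
+ * flag are noted in the redirect space so a fresh copy can be
+ * substituted at remap time; any other busy page aborts the reservation
+ * and frees what was reserved so far.
+ */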
+static void reserve_user_space_map_pages(void)
+{
+ struct page_map_info *pmi;
+ struct page_map_entry *pme;
+ unsigned int i, j, index;
+ struct page *page;
+ unsigned long flags;
+ unsigned long phy_addr;
+ unsigned long redirect_pages = 0;
+ struct redirect_info *redirect_start = (struct redirect_info *)redirect_space_start;
+
+ if (!user_space_reserve_start || !redirect_start)
+ return;
+ spin_lock_irqsave(&page_map_entry_lock, flags);
+ for (index = 0; index < pin_pid_num; index++) {
+ pmi = &(user_space_reserve_start[index]);
+ pme = pmi->pme;
+ for (i = 0; i < pmi->entry_num; i++) {
+ redirect_pages = 0;
+ if (!check_redirect_end_valid(redirect_start, pme->nr_pages))
+ redirect_start = NULL;
+
+ for (j = 0; j < pme->nr_pages; j++) {
+ phy_addr = pme->phy_addr_array[j];
+ if (!phy_addr)
+ continue;
+ page = phys_to_page(phy_addr);
+ if (atomic_read(&page->_refcount)) {
+ if ((page->flags & PAGE_FLAGS_CHECK_RESERVED)
+ && !pme->redirect_start)
+ pme->redirect_start =
+ (unsigned long)redirect_start;
+
+ if (redirect_start &&
+ (page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ redirect_start->redirect_index[redirect_pages] = j;
+ redirect_pages++;
+ continue;
+ } else {
+ reserve_user_map_pages_fail = 1;
+ pr_warn("Page %pK refcount %d is nonzero, cannot reserve.\n",
+ page, atomic_read(&page->_refcount));
+ goto free_pages;
+ }
+ }
+
+ if (!pme->is_huge_page)
+ reserve_user_normal_pages(page);
+ else
+ reserve_user_huge_pmd_pages(page);
+ }
+ pme = (struct page_map_entry *)next_pme(pme);
+ if (redirect_pages && redirect_start) {
+ redirect_start->redirect_pages = redirect_pages;
+ redirect_start = (struct redirect_info *)(
+ (unsigned long)(redirect_start + 1) +
+ redirect_start->redirect_pages * sizeof(unsigned int));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+ return;
+
+free_pages:
+ free_user_map_pages(index, i, j);
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
+
+int calculate_pin_mem_digest(struct pin_mem_dump_info *pmdi, char *digest)
+{
+ int i;
+ struct sha256_state sctx;
+
+ if (!digest)
+ digest = pmdi->sha_digest;
+ sha256_init(&sctx);
+ sha256_update(&sctx, (unsigned char *)(&(pmdi->magic)),
+ sizeof(struct pin_mem_dump_info) - SHA256_DIGEST_SIZE);
+ for (i = 0; i < pmdi->pin_pid_num; i++) {
+ sha256_update(&sctx, (unsigned char *)(&(pmdi->pmi_array[i])),
+ sizeof(struct page_map_info));
+ }
+ sha256_final(&sctx, digest);
+ return 0;
+}
+
+static int check_sha_digest(struct pin_mem_dump_info *pmdi)
+{
+ int ret = 0;
+ char digest[SHA256_DIGEST_SIZE] = {0};
+
+ ret = calculate_pin_mem_digest(pmdi, digest);
+ if (ret) {
+ pr_warn("calculate pin mem digest fail:%d\n", ret);
+ return ret;
+ }
+ if (memcmp(pmdi->sha_digest, digest, SHA256_DIGEST_SIZE)) {
+ pr_warn("pin mem dump info sha256 digest match error!\n");
+ return -EFAULT;
+ }
+ return ret;
+}
+
+/*
+ * The whole page map entry collection process must be sequential.
+ * user_space_reserve_start points to the page map info of the first
+ * dumped task, and page_map_entry_start points to the first page map
+ * entry of the first dumped vma.
+ */
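+/*
+ * Rough layout of the reserved memblock, as set up below: the
+ * struct pin_mem_dump_info header comes first, followed by the
+ * page_map_info array (max_pin_pid_num entries), then the packed
+ * page map entries, with the redirect space (redirect_space_size
+ * bytes) at the tail.
+ */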
+static void init_page_map_info(struct pin_mem_dump_info *pmdi, unsigned long map_len)
+{
+ if (pin_mem_dump_start || !max_pin_pid_num) {
+ pr_warn("pin page map already init or max_pin_pid_num not set.\n");
+ return;
+ }
+ if (map_len < sizeof(struct pin_mem_dump_info) +
+ max_pin_pid_num * sizeof(struct page_map_info) + redirect_space_size) {
+ pr_warn("pin memory reserved memblock too small.\n");
+ return;
+ }
+ if ((pmdi->magic != PIN_MEM_DUMP_MAGIC) || (pmdi->pin_pid_num > max_pin_pid_num) ||
+ check_sha_digest(pmdi))
+ memset(pmdi, 0, sizeof(struct pin_mem_dump_info));
+
+ pin_mem_dump_start = pmdi;
+ pin_pid_num = pmdi->pin_pid_num;
+ pr_info("pin_pid_num: %d\n", pin_pid_num);
+ pin_pid_num_addr = &(pmdi->pin_pid_num);
+ user_space_reserve_start =
+ (struct page_map_info *)pmdi->pmi_array;
+ page_map_entry_start =
+ (struct page_map_entry *)(user_space_reserve_start + max_pin_pid_num);
+ __page_map_entry_start = page_map_entry_start;
+ page_map_entry_end = (unsigned long)pmdi + map_len - redirect_space_size;
+ redirect_space_start = page_map_entry_end;
+
+ if (pin_pid_num > 0)
+ reserve_user_space_map_pages();
+}
+
+int finish_pin_mem_dump(void)
+{
+ int ret;
+
+ if (!pin_mem_dump_start)
+ return -EFAULT;
+ pin_mem_dump_start->magic = PIN_MEM_DUMP_MAGIC;
+ memset(pin_mem_dump_start->sha_digest, 0, SHA256_DIGEST_SIZE);
+ ret = calculate_pin_mem_digest(pin_mem_dump_start, NULL);
+ if (ret) {
+ pr_warn("calculate pin mem digest fail:%d\n", ret);
+ return ret;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(finish_pin_mem_dump);
+
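+/*
+ * Record the physical address of each PMD-mapped huge page in
+ * [start_addr, end_addr): swapped-out huge pages are faulted back in and
+ * present ones get an extra reference.  Returns COLLECT_PAGES_NEED_CONTINUE
+ * when the range turns out to be mapped with normal pages so the caller
+ * can switch to collect_normal_pages().
+ */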
+int collect_pmd_huge_pages(struct task_struct *task,
+ unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
+{
+ int ret, i, res;
+ int index = 0;
+ unsigned long start = start_addr;
+ struct page *temp_page;
+ unsigned long *pte_entry = pagemap_buffer;
+ unsigned int count;
+ struct mm_struct *mm = task->mm;
+
+ while (start < end_addr) {
+ temp_page = NULL;
+ count = 0;
+ ret = pagemap_get(mm, pin_mem_pagewalk,
+ start, start + HPAGE_PMD_SIZE, pte_entry, &count);
+ if (ret || !count) {
+ pr_warn("Failed to get huge page mapping: %d.\n", ret);
+ return COLLECT_PAGES_FAIL;
+ }
+
+ /* For huge pages, handle one PMD-sized mapping per iteration. */
+ if ((pte_entry[0] & PM_SWAP) && (count == 1)) {
+ res = get_user_pages_remote(task, mm, start,
+ 1, FOLL_TOUCH | FOLL_GET, &temp_page, NULL, NULL);
+ if (!res) {
+ pr_warn("Swap in huge page fail.\n");
+ return COLLECT_PAGES_FAIL;
+ }
+ pme->phy_addr_array[index] = page_to_phys(temp_page);
+ start += HPAGE_PMD_SIZE;
+ index++;
+ continue;
+ }
+
+ if (IS_PTE_PRESENT(pte_entry[0])) {
+ temp_page = pfn_to_page(pte_entry[0] & PM_PFRAME_MASK);
+ if (PageHead(temp_page)) {
+ atomic_inc(&((temp_page)->_refcount));
+ start += HPAGE_PMD_SIZE;
+ pme->phy_addr_array[index] = page_to_phys(temp_page);
+ index++;
+ } else {
+ /* If the page is not a compound head, fall back to collecting normal pages. */
+ pme->nr_pages = index;
+ return COLLECT_PAGES_NEED_CONTINUE;
+ }
+ } else {
+ for (i = 1; i < count; i++) {
+ if (pte_entry[i] & PM_PFRAME_MASK) {
+ pme->nr_pages = index;
+ return COLLECT_PAGES_NEED_CONTINUE;
+ }
+ }
+ start += HPAGE_PMD_SIZE;
+ pme->phy_addr_array[index] = 0;
+ index++;
+ }
+ }
+ pme->nr_pages = index;
+ return COLLECT_PAGES_FINISH;
+}
+
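+/*
+ * Record the physical address of every normal page in
+ * [start_addr, end_addr): pages in swap are faulted back in and present
+ * ones get an extra reference.  Returns COLLECT_PAGES_NEED_CONTINUE when
+ * a transparent huge page is hit so the caller can switch to
+ * collect_pmd_huge_pages().
+ */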
+int collect_normal_pages(struct task_struct *task,
+ unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
+{
+ int ret, res;
+ unsigned long next;
+ unsigned long i, nr_pages;
+ struct page *tmp_page;
+ unsigned long *phy_addr_array = pme->phy_addr_array;
+ unsigned int count;
+ unsigned long *pte_entry = pagemap_buffer;
+ struct mm_struct *mm = task->mm;
+
+ next = (start_addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ next = (next > end_addr) ? end_addr : next;
+ pme->nr_pages = 0;
+ while (start_addr < next) {
+ count = 0;
+ nr_pages = (PAGE_ALIGN(next) - start_addr) / PAGE_SIZE;
+ ret = pagemap_get(mm, pin_mem_pagewalk,
+ start_addr, next, pte_entry, &count);
+ if (ret || !count) {
+ pr_warn("Get user page fail: %d, count: %u.\n",
+ ret, count);
+ return COLLECT_PAGES_FAIL;
+ }
+
+ if (IS_PTE_PRESENT(pte_entry[0])) {
+ tmp_page = pfn_to_page(pte_entry[0] & PM_PFRAME_MASK);
+ /* If the page is a compound head, switch to collecting huge pages. */
+ if (PageHead(tmp_page))
+ return COLLECT_PAGES_NEED_CONTINUE;
+ if (PageTail(tmp_page)) {
+ start_addr = next;
+ pme->virt_addr = start_addr;
+ next = NEXT_PIN_ADDR(next, end_addr);
+ continue;
+ }
+ }
+
+ for (i = 0; i < count; i++) {
+ if (pte_entry[i] & PM_SWAP) {
+ res = get_user_pages_remote(task, mm, start_addr + i * PAGE_SIZE,
+ 1, FOLL_TOUCH | FOLL_GET, &tmp_page, NULL, NULL);
+ if (!res) {
+ pr_warn("Swap in page fail.\n");
+ return COLLECT_PAGES_FAIL;
+ }
+ phy_addr_array[i] = page_to_phys(tmp_page);
+ continue;
+ }
+ if (!IS_PTE_PRESENT(pte_entry[i])) {
+ phy_addr_array[i] = 0;
+ continue;
+ }
+ tmp_page = pfn_to_page(pte_entry[i] & PM_PFRAME_MASK);
+ atomic_inc(&(tmp_page->_refcount));
+ phy_addr_array[i] = ((pte_entry[i] & PM_PFRAME_MASK) << PAGE_SHIFT);
+ }
+ pme->nr_pages += count;
+ phy_addr_array += count;
+ start_addr = next;
+ next = NEXT_PIN_ADDR(next, end_addr);
+ }
+ return COLLECT_PAGES_FINISH;
+}
+
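+/*
+ * Drop the references taken while collecting and clear the recorded
+ * physical addresses of a page map entry.
+ */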
+void free_pin_pages(struct page_map_entry *pme)
+{
+ unsigned long i;
+ struct page *tmp_page;
+
+ if (!pme)
+ return;
+ for (i = 0; i < pme->nr_pages; i++) {
+ if (pme->phy_addr_array[i]) {
+ tmp_page = phys_to_page(pme->phy_addr_array[i]);
+ atomic_dec(&(tmp_page->_refcount));
+ pme->phy_addr_array[i] = 0;
+ }
+ }
+}
+
+int init_pagemap_read(void)
+{
+ int ret = -ENOMEM;
+
+ if (pin_mem_pagewalk)
+ return 0;
+
+ mutex_lock(&pin_mem_mutex);
+ pin_mem_pagewalk = create_pagemap_walk();
+ if (!pin_mem_pagewalk)
+ goto out;
+ pagemap_buffer = kmalloc(((PMD_SIZE >> PAGE_SHIFT) + 1) *
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!pagemap_buffer)
+ goto free;
+
+ ret = 0;
+out:
+ mutex_unlock(&pin_mem_mutex);
+ return ret;
+free:
+ free_pagemap_walk(pin_mem_pagewalk);
+ pin_mem_pagewalk = NULL;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(init_pagemap_read);
+
+/* Callers must ensure that the pinned memory range belongs to an anonymous vma. */
+int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ int pid, ret;
+ int is_huge_page = false;
+ unsigned int page_size;
+ unsigned long nr_pages, flags;
+ struct page_map_entry *pme = NULL;
+ struct page_map_info *pmi;
+ struct vm_area_struct *vma;
+ unsigned long i;
+ struct page *tmp_page;
+
+ if (!page_map_entry_start
+ || !task || !mm
+ || start_addr >= end_addr || !pin_mem_pagewalk)
+ return -EFAULT;
+
+ pid = task->pid;
+ spin_lock_irqsave(&page_map_entry_lock, flags);
+ nr_pages = ((end_addr - start_addr) / PAGE_SIZE);
+ if ((unsigned long)page_map_entry_start + nr_pages * sizeof(struct page *) >=
+ page_map_entry_end) {
+ pr_warn("Page map entry use up!\n");
+ ret = -EFAULT;
+ goto finish;
+ }
+
+ vma = find_extend_vma(mm, start_addr);
+ if (!vma) {
+ pr_warn("Find no match vma!\n");
+ ret = -EFAULT;
+ goto finish;
+ }
+ if (start_addr == (start_addr & HPAGE_PMD_MASK) &&
+ transparent_hugepage_enabled(vma)) {
+ page_size = HPAGE_PMD_SIZE;
+ is_huge_page = true;
+ } else {
+ page_size = PAGE_SIZE;
+ }
+
+ pme = page_map_entry_start;
+ pme->virt_addr = start_addr;
+ pme->redirect_start = 0;
+ pme->is_huge_page = is_huge_page;
+ memset(pme->phy_addr_array, 0, nr_pages * sizeof(unsigned long));
+
+ down_read(&mm->mmap_sem);
+ if (!is_huge_page) {
+ ret = collect_normal_pages(task, start_addr, end_addr, pme);
+ if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
+ if (ret == COLLECT_PAGES_FINISH) {
+ ret = 0;
+ up_read(&mm->mmap_sem);
+ goto finish;
+ }
+ pme->is_huge_page = true;
+ page_size = HPAGE_PMD_SIZE;
+ ret = collect_pmd_huge_pages(task, pme->virt_addr, end_addr, pme);
+ }
+ } else {
+ ret = collect_pmd_huge_pages(task, start_addr, end_addr, pme);
+ if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
+ if (ret == COLLECT_PAGES_FINISH) {
+ ret = 0;
+ up_read(&mm->mmap_sem);
+ goto finish;
+ }
+ pme->is_huge_page = false;
+ page_size = PAGE_SIZE;
+ ret = collect_normal_pages(task, pme->virt_addr, end_addr, pme);
+ }
+ }
+ up_read(&mm->mmap_sem);
+ if (ret == COLLECT_PAGES_FAIL) {
+ ret = -EFAULT;
+ goto finish;
+ }
+
+ /* check for zero pages */
+ for (i = 0; i < pme->nr_pages; i++) {
+ tmp_page = phys_to_page(pme->phy_addr_array[i]);
+ if (!pme->is_huge_page) {
+ if (page_to_pfn(tmp_page) == my_zero_pfn(pme->virt_addr + i * PAGE_SIZE))
+ pme->phy_addr_array[i] = 0;
+ } else if (is_huge_zero_page(tmp_page))
+ pme->phy_addr_array[i] = 0;
+ }
+
+ page_map_entry_start = (struct page_map_entry *)(next_pme(pme));
+ pmi = get_page_map_info(pid);
+ if (!pmi)
+ pmi = create_page_map_info(pid);
+ if (!pmi) {
+ pr_warn("Create page map info fail for pid: %d!\n", pid);
+ ret = -EFAULT;
+ goto finish;
+ }
+ if (!pmi->pme)
+ pmi->pme = pme;
+ pmi->entry_num++;
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+
+ if (ret == COLLECT_PAGES_NEED_CONTINUE)
+ ret = pin_mem_area(task, mm, pme->virt_addr + pme->nr_pages * page_size, end_addr);
+ return ret;
+
+finish:
+ if (ret)
+ free_pin_pages(pme);
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pin_mem_area);
+
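+/*
+ * Re-establish the recorded normal anonymous pages in @vma: allocate the
+ * page-table levels and insert each saved page with do_anon_page_remap().
+ * Pages listed in the redirect space are replaced by a freshly allocated
+ * page that receives a copy of the old contents.  On failure the pages
+ * that have not been remapped yet are freed.
+ */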
+vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page_map_entry *pme)
+{
+ int ret;
+ unsigned int j, i;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pmd_t *pmd;
+ pud_t *pud;
+ struct page *page, *new;
+ unsigned long address;
+ unsigned long phy_addr;
+ unsigned int redirect_pages = 0;
+ struct redirect_info *redirect_start;
+
+ redirect_start = (struct redirect_info *)pme->redirect_start;
+ for (j = 0; j < pme->nr_pages; j++) {
+ address = pme->virt_addr + j * PAGE_SIZE;
+ phy_addr = pme->phy_addr_array[j];
+ if (!phy_addr)
+ continue;
+
+ page = phys_to_page(phy_addr);
+ if (page_to_pfn(page) == my_zero_pfn(address)) {
+ pme->phy_addr_array[j] = 0;
+ continue;
+ }
+ pme->phy_addr_array[j] = 0;
+
+ if (redirect_start && (redirect_pages < redirect_start->redirect_pages) &&
+ (j == redirect_start->redirect_index[redirect_pages])) {
+ new = alloc_zeroed_user_highpage_movable(vma, address);
+ if (!new) {
+ pr_warn("Redirect alloc page fail\n");
+ continue;
+ }
+ copy_page(page_to_virt(new), phys_to_virt(phy_addr));
+ page = new;
+ redirect_pages++;
+ }
+
+ page->mapping = NULL;
+ pgd = pgd_offset(mm, address);
+ ret = VM_FAULT_OOM;
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ goto free;
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud)
+ goto free;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ goto free;
+ ret = do_anon_page_remap(vma, address, pmd, page);
+ if (ret)
+ goto free;
+ }
+ return 0;
+
+free:
+ for (i = j; i < pme->nr_pages; i++) {
+ phy_addr = pme->phy_addr_array[i];
+ if (phy_addr) {
+ __free_page(phys_to_page(phy_addr));
+ pme->phy_addr_array[i] = 0;
+ }
+ }
+ return ret;
+}
+
+static inline gfp_t get_hugepage_gfpmask(struct vm_area_struct *vma)
+{
+ const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ 0);
+ return GFP_TRANSHUGE_LIGHT;
+}
+
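+/*
+ * Like remap_normal_pages(), but re-inserts the recorded PMD-sized huge
+ * pages with do_anon_huge_page_remap().
+ */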
+vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page_map_entry *pme)
+{
+ int ret;
+ unsigned int j, i;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pmd_t *pmd;
+ pud_t *pud;
+ gfp_t gfp;
+ struct page *page, *new;
+ unsigned long address;
+ unsigned long phy_addr;
+ unsigned int redirect_pages = 0;
+ struct redirect_info *redirect_start;
+
+ redirect_start = (struct redirect_info *)pme->redirect_start;
+ for (j = 0; j < pme->nr_pages; j++) {
+ address = pme->virt_addr + j * HPAGE_PMD_SIZE;
+ phy_addr = pme->phy_addr_array[j];
+ if (!phy_addr)
+ continue;
+
+ page = phys_to_page(phy_addr);
+ if (is_huge_zero_page(page)) {
+ pme->phy_addr_array[j] = 0;
+ continue;
+ }
+ pme->phy_addr_array[j] = 0;
+
+ if (redirect_start && (redirect_pages < redirect_start->redirect_pages) &&
+ (j == redirect_start->redirect_index[redirect_pages])) {
+ gfp = get_hugepage_gfpmask(vma);
+ new = alloc_hugepage_vma(gfp, vma, address, HPAGE_PMD_ORDER);
+ if (!new) {
+ pr_warn("Redirect alloc huge page fail\n");
+ continue;
+ }
+ memcpy(page_to_virt(new), phys_to_virt(phy_addr), HPAGE_PMD_SIZE);
+ page = new;
+ redirect_pages++;
+ }
+
+ pgd = pgd_offset(mm, address);
+ ret = VM_FAULT_OOM;
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ goto free;
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud)
+ goto free;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ goto free;
+ ret = do_anon_huge_page_remap(vma, address, pmd, page);
+ if (ret)
+ goto free;
+ }
+ return 0;
+
+free:
+ for (i = j; i < pme->nr_pages; i++) {
+ phy_addr = pme->phy_addr_array[i];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, HPAGE_PMD_ORDER);
+ pme->phy_addr_array[i] = 0;
+ }
+ }
+ }
+ return ret;
+}
+
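+/*
+ * After a remap failure, free the recorded pages of the entries that have
+ * not been processed yet; the failing entry's pages are freed by
+ * remap_normal_pages()/remap_huge_pmd_pages() itself.
+ */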
+static void free_unmap_pages(struct page_map_info *pmi,
+ struct page_map_entry *pme,
+ unsigned int index)
+{
+ unsigned int i, j;
+ unsigned long phy_addr;
+ unsigned int order;
+ struct page *page;
+
+ pme = (struct page_map_entry *)(next_pme(pme));
+ for (i = index; i < pmi->entry_num; i++) {
+ for (j = 0; j < pme->nr_pages; j++) {
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+ pme = (struct page_map_entry *)(next_pme(pme));
+ }
+}
+
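+/*
+ * Remap all pinned pages recorded for @pid into @mm: walk the saved page
+ * map entries alongside the vma list and re-insert the pages into the
+ * matching anonymous vmas.  The page_map_info is also marked so that its
+ * pages are no longer freed by free_all_reserved_pages().
+ */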
+vm_fault_t do_mem_remap(int pid, struct mm_struct *mm)
+{
+ unsigned int i = 0;
+ vm_fault_t ret = 0;
+ struct vm_area_struct *vma;
+ struct page_map_info *pmi;
+ struct page_map_entry *pme;
+ unsigned long flags;
+
+ if (reserve_user_map_pages_fail || !mm)
+ return -EFAULT;
+ pmi = get_page_map_info(pid);
+ if (!pmi)
+ return -EFAULT;
+
+ spin_lock_irqsave(&page_map_entry_lock, flags);
+ pmi->disable_free_page = true;
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+ down_write(&mm->mmap_sem);
+ pme = pmi->pme;
+ vma = mm->mmap;
+ while ((i < pmi->entry_num) && (vma != NULL)) {
+ if (pme->virt_addr >= vma->vm_start && pme->virt_addr < vma->vm_end) {
+ i++;
+ if (!vma_is_anonymous(vma)) {
+ pme = (struct page_map_entry *)(next_pme(pme));
+ continue;
+ }
+ if (!pme->is_huge_page) {
+ ret = remap_normal_pages(mm, vma, pme);
+ if (ret < 0)
+ goto free;
+ } else {
+ ret = remap_huge_pmd_pages(mm, vma, pme);
+ if (ret < 0)
+ goto free;
+ }
+ pme = (struct page_map_entry *)(next_pme(pme));
+ } else {
+ vma = vma->vm_next;
+ }
+ }
+ up_write(&mm->mmap_sem);
+ return 0;
+
+free:
+ free_unmap_pages(pmi, pme, i);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(do_mem_remap);
+
+#if defined(CONFIG_ARM64)
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+ void *addr;
+
+ if (!map_addr || !map_size)
+ return;
+ addr = phys_to_virt(map_addr);
+ init_page_map_info((struct pin_mem_dump_info *)addr, map_size);
+}
+#else
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+}
+#endif
+
+static void free_all_reserved_pages(void)
+{
+ unsigned int i, j, index, order;
+ struct page_map_info *pmi;
+ struct page_map_entry *pme;
+ struct page *page;
+ unsigned long phy_addr;
+
+ if (!user_space_reserve_start || reserve_user_map_pages_fail)
+ return;
+
+ for (index = 0; index < pin_pid_num; index++) {
+ pmi = &(user_space_reserve_start[index]);
+ if (pmi->disable_free_page)
+ continue;
+ pme = pmi->pme;
+ for (i = 0; i < pmi->entry_num; i++) {
+ for (j = 0; j < pme->nr_pages; j++) {
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+ pme = (struct page_map_entry *)next_pme(pme);
+ }
+ }
+}
+
+/* Clear all pin memory records. */
+void clear_pin_memory_record(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&page_map_entry_lock, flags);
+ free_all_reserved_pages();
+ if (pin_pid_num_addr) {
+ *pin_pid_num_addr = 0;
+ pin_pid_num = 0;
+ page_map_entry_start = __page_map_entry_start;
+ }
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
+EXPORT_SYMBOL_GPL(clear_pin_memory_record);
+
+#endif /* CONFIG_PIN_MEMORY */
--
2.25.1
Chris Chiu (1):
USB: Add reset-resume quirk for WD19's Realtek Hub
Daniel Borkmann (1):
bpf: Fix masking negation logic upon negative dst register
Gao Xiang (1):
erofs: fix extended inode could cross boundary
Greg Kroah-Hartman (1):
Linux 4.19.190
Jiri Kosina (2):
iwlwifi: Fix softirq/hardirq disabling in iwl_pcie_enqueue_hcmd()
iwlwifi: Fix softirq/hardirq disabling in iwl_pcie_gen2_enqueue_hcmd()
Kai-Heng Feng (1):
USB: Add LPM quirk for Lenovo ThinkPad USB-C Dock Gen2 Ethernet
Mark Pearson (1):
platform/x86: thinkpad_acpi: Correct thermal sensor allocation
Miklos Szeredi (1):
ovl: allow upperdir inside lowerdir
Phillip Potter (1):
net: usb: ax88179_178a: initialize local variables before use
Rafael J. Wysocki (2):
ACPI: tables: x86: Reserve memory occupied by ACPI tables
ACPI: x86: Call acpi_boot_table_init() after acpi_table_upgrade()
Romain Naour (1):
mips: Do not include hi and lo in clobber list for R6
Takashi Iwai (1):
ALSA: usb-audio: Add MIDI quirk for Vox ToneLab EX
Makefile | 2 +-
arch/mips/vdso/gettimeofday.c | 14 +-
arch/x86/kernel/acpi/boot.c | 25 ++--
arch/x86/kernel/setup.c | 7 +-
drivers/acpi/tables.c | 42 +++++-
drivers/net/usb/ax88179_178a.c | 4 +-
.../net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 7 +-
drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 7 +-
drivers/platform/x86/thinkpad_acpi.c | 31 ++--
drivers/staging/erofs/inode.c | 135 ++++++++++++------
drivers/usb/core/quirks.c | 4 +
fs/overlayfs/super.c | 12 +-
include/linux/acpi.h | 9 +-
kernel/bpf/verifier.c | 12 +-
sound/usb/quirks-table.h | 10 ++
15 files changed, 221 insertions(+), 100 deletions(-)
--
2.25.1
From: Jing Xiangfeng <jingxiangfeng(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 51827
CVE: NA
--------------------------------------
When OOM occurs, we first kill a process from a low-priority memcg.
If no suitable process is found there, we fall back to the normal OOM
handling.
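As an illustration only (not part of this patch): assuming the legacy memory
controller is mounted at /sys/fs/cgroup/memory and a group named "offline"
already exists (both are assumptions), the cftype added below is expected to
appear as memory.qos_level, and the group could be marked low priority from
userspace roughly as follows:

  /* Hedged sketch: mark the "offline" memcg as low OOM priority. */
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          /* Path and group name are assumptions, not defined by this patch. */
          const char *path = "/sys/fs/cgroup/memory/offline/memory.qos_level";
          int fd = open(path, O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* Writing -1 marks the memcg low priority; 0 (the default) restores it. */
          if (write(fd, "-1", 2) != 2)
                  perror("write");
          close(fd);
          return 0;
  }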
Signed-off-by: Jing Xiangfeng <jingxiangfeng(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
include/linux/memcontrol.h | 12 ++++
mm/Kconfig | 12 ++++
mm/memcontrol.c | 127 +++++++++++++++++++++++++++++++++++++
mm/oom_kill.c | 49 +++++++++++++-
4 files changed, 199 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0a55493f41da0..d59ce25206de4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -296,6 +296,12 @@ struct mem_cgroup {
bool tcpmem_active;
int tcpmem_pressure;
+#ifdef CONFIG_MEMCG_QOS
+ /* Currently only 0 and -1 are supported;
+ * other values may be added in the future.
+ */
+ int memcg_priority;
+#endif
#ifdef CONFIG_MEMCG_KMEM
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
@@ -327,6 +333,12 @@ struct mem_cgroup {
/* WARNING: nodeinfo must be the last member here */
};
+#ifdef CONFIG_MEMCG_QOS
+bool memcg_low_priority_scan_tasks(int (*)(struct task_struct *, void *),
+ void *);
+void memcg_print_bad_task(void *arg, int ret);
+#endif
+
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
diff --git a/mm/Kconfig b/mm/Kconfig
index f622df92712e2..709006c195eee 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -516,6 +516,18 @@ config USERSWAP
Support for User Swap. This is based on userfaultfd. We can implement
our own swapout and swapin functions in usersapce.
+config MEMCG_QOS
+ bool "Enable Memory Cgroup Priority"
+ depends on MEMCG
+ depends on X86 || ARM64
+ default y
+ help
+ MEMCG_QOS means that on OOM the killer first picks a victim from a
+ low-priority memcg. If no suitable process is found there, it falls
+ back to the normal OOM handling.
+
+ If unsure, say "n".
+
config CMA
bool "Contiguous Memory Allocator"
depends on HAVE_MEMBLOCK && MMU
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 88ab44a5696fd..ce3346010b480 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1235,6 +1235,9 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
break;
}
}
+#ifdef CONFIG_MEMCG_QOS
+ memcg_print_bad_task(arg, ret);
+#endif
return ret;
}
@@ -3636,6 +3639,119 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
}
#endif
+#ifdef CONFIG_MEMCG_QOS
+static void memcg_qos_init(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+
+ if (!parent)
+ return;
+
+ if (parent->memcg_priority && parent->use_hierarchy)
+ memcg->memcg_priority = parent->memcg_priority;
+}
+
+static s64 memcg_qos_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return mem_cgroup_from_css(css)->memcg_priority;
+}
+
+static int memcg_qos_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ if (val >= 0)
+ memcg->memcg_priority = 0;
+ else
+ memcg->memcg_priority = -1;
+
+ return 0;
+}
+
+static struct mem_cgroup *memcg_find_max_usage(struct mem_cgroup *last)
+{
+ struct mem_cgroup *iter, *max_memcg = NULL;
+ struct cgroup_subsys_state *css;
+ unsigned long usage, max_usage = 0;
+
+ rcu_read_lock();
+ css_for_each_descendant_pre(css, &root_mem_cgroup->css) {
+ iter = mem_cgroup_from_css(css);
+
+ if (!iter->memcg_priority || iter == root_mem_cgroup ||
+ iter == last)
+ continue;
+
+ usage = mem_cgroup_usage(iter, false);
+ if (usage > max_usage) {
+ max_usage = usage;
+ max_memcg = iter;
+ }
+ }
+ rcu_read_unlock();
+
+ return max_memcg;
+}
+
+bool memcg_low_priority_scan_tasks(int (*fn)(struct task_struct *, void *),
+ void *arg)
+{
+ struct mem_cgroup *max, *last = NULL;
+ struct oom_control *oc = arg;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret = 0;
+ bool retry = true;
+
+retry:
+ max = memcg_find_max_usage(last);
+ if (!max)
+ return false;
+
+ css_task_iter_start(&max->css, CSS_TASK_ITER_PROCS, &it);
+ while (!ret && (task = css_task_iter_next(&it))) {
+ if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
+ pr_info("task %s is dying.\n", task->comm);
+ continue;
+ }
+
+ ret = fn(task, arg);
+ }
+ css_task_iter_end(&it);
+
+ if (ret)
+ return false;
+
+ if (!oc->chosen && retry) {
+ last = max;
+ retry = false;
+ goto retry;
+ }
+
+ if (oc->chosen)
+ pr_info("The bad task [%d:%s] is from low-priority memcg.\n",
+ oc->chosen->pid, oc->chosen->comm);
+
+ return oc->chosen ? true : false;
+}
+
+void memcg_print_bad_task(void *arg, int ret)
+{
+ struct oom_control *oc = arg;
+
+ if (!ret && oc->chosen) {
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_from_task(oc->chosen);
+ if (memcg->memcg_priority)
+ pr_info("The bad task [%d:%s] is from low-priority memcg.\n",
+ oc->chosen->pid, oc->chosen->comm);
+ }
+}
+#endif
+
#ifdef CONFIG_NUMA
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
@@ -4591,6 +4707,13 @@ static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "pressure_level",
},
+#ifdef CONFIG_MEMCG_QOS
+ {
+ .name = "qos_level",
+ .read_s64 = memcg_qos_read,
+ .write_s64 = memcg_qos_write,
+ },
+#endif
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
@@ -4942,6 +5065,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
return -ENOMEM;
}
+#ifdef CONFIG_MEMCG_QOS
+ memcg_qos_init(memcg);
+#endif
+
/* Online state pins memcg ID, memcg ID pins CSS */
atomic_set(&memcg->id.ref, 1);
css_get(css);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0dbd6d2a31733..f46c8fa9a1254 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -318,6 +318,49 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
return CONSTRAINT_NONE;
}
+#ifdef CONFIG_MEMCG_QOS
+/*
+ * Prefer tasks from low-priority memcgs. Among tasks of the same
+ * priority, choose the one with the highest badness 'points'.
+ */
+static bool oom_next_task(struct task_struct *task, struct oom_control *oc,
+ unsigned long points)
+{
+ struct mem_cgroup *cur_memcg;
+ struct mem_cgroup *oc_memcg;
+
+ if (!points)
+ return true;
+
+ if (!oc->chosen)
+ return false;
+
+ oc_memcg = mem_cgroup_from_task(oc->chosen);
+ cur_memcg = mem_cgroup_from_task(task);
+
+ if (cur_memcg->memcg_priority == oc_memcg->memcg_priority) {
+ if (points < oc->chosen_points)
+ return true;
+ return false;
+ }
+ /* the already-chosen task is from a low-priority memcg, so skip this one */
+ if (oc_memcg->memcg_priority)
+ return true;
+
+ return false;
+}
+#else
+static inline bool oom_next_task(struct task_struct *task,
+ struct oom_control *oc, unsigned long points)
+{
+ if (!points || points < oc->chosen_points)
+ return true;
+
+ return false;
+}
+#endif
+
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
@@ -348,7 +391,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
}
points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
- if (!points || points < oc->chosen_points)
+ if (oom_next_task(task, oc, points))
goto next;
select:
@@ -377,6 +420,10 @@ static void select_bad_process(struct oom_control *oc)
else {
struct task_struct *p;
+#ifdef CONFIG_MEMCG_QOS
+ if (memcg_low_priority_scan_tasks(oom_evaluate_task, oc))
+ return;
+#endif
rcu_read_lock();
for_each_process(p)
if (oom_evaluate_task(p, oc))
--
2.25.1

07 May '21
From: Zhao Xuehui <zhaoxuehui1(a)huawei.com>
hulk inclusion
category: bugfix
bugzilla: 51819
CVE: NA
---------------------------
The variable 'flag' in klp_try_disable_patch and klp_try_enable_patch is
assigned the value '1', but the stored value is never read, so delete it.
Signed-off-by: Zhao Xuehui <zhaoxuehui1(a)huawei.com>
Reviewed-by: Yang Jihong <yangjihong1(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
kernel/livepatch/core.c | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c981d400fe552..65f48c468e162 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -402,27 +402,23 @@ static int disable_patch(struct klp_patch *patch)
int klp_try_disable_patch(void *data)
{
int ret = 0;
- int flag = 0;
struct patch_data *pd = (struct patch_data *)data;
if (atomic_inc_return(&pd->cpu_count) == 1) {
struct klp_patch *patch = pd->patch;
if (klp_check_patch_kprobed(patch)) {
- flag = 1;
atomic_inc(&pd->cpu_count);
return -EINVAL;
}
ret = klp_check_calltrace(patch, 0);
if (ret) {
- flag = 1;
atomic_inc(&pd->cpu_count);
return ret;
}
ret = disable_patch(patch);
if (ret) {
- flag = 1;
atomic_inc(&pd->cpu_count);
return ret;
}
@@ -430,9 +426,7 @@ int klp_try_disable_patch(void *data)
} else {
while (atomic_read(&pd->cpu_count) <= num_online_cpus())
cpu_relax();
-
- if (!flag)
- klp_smp_isb();
+ klp_smp_isb();
}
return ret;
@@ -617,27 +611,23 @@ static int enable_patch(struct klp_patch *patch)
int klp_try_enable_patch(void *data)
{
int ret = 0;
- int flag = 0;
struct patch_data *pd = (struct patch_data *)data;
if (atomic_inc_return(&pd->cpu_count) == 1) {
struct klp_patch *patch = pd->patch;
if (klp_check_patch_kprobed(patch)) {
- flag = 1;
atomic_inc(&pd->cpu_count);
return -EINVAL;
}
ret = klp_check_calltrace(patch, 1);
if (ret) {
- flag = 1;
atomic_inc(&pd->cpu_count);
return ret;
}
ret = enable_patch(patch);
if (ret) {
- flag = 1;
atomic_inc(&pd->cpu_count);
return ret;
}
@@ -645,9 +635,7 @@ int klp_try_enable_patch(void *data)
} else {
while (atomic_read(&pd->cpu_count) <= num_online_cpus())
cpu_relax();
-
- if (!flag)
- klp_smp_isb();
+ klp_smp_isb();
}
return ret;
--
2.25.1