From: hejingxian hejingxian@huawei.com Date: Thu, 10 Dec 2020 20:35:00 +0800 Subject: [PATCH v1 1/1 openEuler-1.0-LTS] fork: add pid recover method for checkpoint and recover
We record the pids of the dumped tasks in the reserved memory, and reserve those pids before the init task starts. In the recovery process, the fork_pid of the recovering task is set before it calls fork(). The fork_pid is then used to allocate the pid.
/proc/sys/kernel/ns_last_pid can also be used to fork a child task with an assigned pid. However, when many tasks need to be recovered at the same time, we will fail to recover the pids by using /proc/sys/kernel/ns_last_pid.
Signed-off-by: Jingxian He hejingxian@huawei.com --- drivers/char/pin_memory.c | 24 +++++++++++++++++++++- include/linux/pin_mem.h | 6 ++++++ include/linux/sched.h | 4 ++++ init/init_task.c | 3 +++ kernel/pid.c | 27 ++++++++++++++++++++++++- mm/Kconfig | 7 +++++++ mm/pin_mem.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 120 insertions(+), 2 deletions(-)
diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c index a0464e1..2c65de7 100644 --- a/drivers/char/pin_memory.c +++ b/drivers/char/pin_memory.c @@ -36,9 +36,12 @@ struct pin_mem_area_set { #define _SET_PIN_MEM_AREA 1 #define _CLEAR_PIN_MEM_AREA 2 #define _REMAP_PIN_MEM_AREA 3 +#define _SET_FORK_PID 4 + #define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) #define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) #define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) +#define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int) static int set_pin_mem(struct pin_mem_area_set *pmas) { @@ -136,13 +139,29 @@ static int pin_mem_remap(unsigned long arg) return -EFAULT; } +static int set_fork_pid(unsigned long arg) +{ + int pid; + struct page_map_info * pmi = NULL; + void __user *buf = (void __user *)arg; + + if (!access_ok(buf, sizeof(int))) + goto fault; + if (copy_from_user(&pid, buf, sizeof(int))) + goto fault; + current->fork_pid = pid; + return 0; +fault: + return -EFAULT; +} + static long pin_memory_ioctl(struct file *file, unsigned cmd, unsigned long arg) { long ret = 0; if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC) return -EINVAL; - if (_IOC_NR(cmd) > _REMAP_PIN_MEM_AREA) + if (_IOC_NR(cmd) > _SET_FORK_PID) return -EINVAL; switch (cmd) { @@ -155,6 +174,9 @@ static long pin_memory_ioctl(struct file *file, unsigned cmd, unsigned long arg) case REMAP_PIN_MEM_AREA: ret = pin_mem_remap(arg); break; + case SET_FORK_PID: + ret = set_fork_pid(arg); + break; default: return -EINVAL; } diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h index 0ca44ac..4162043 100644 --- a/include/linux/pin_mem.h +++ b/include/linux/pin_mem.h @@ -58,5 +58,11 @@ extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned l #endif extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size); +#ifdef CONFIG_PID_RECOVER +extern bool is_need_reserve_pids(void); +extern 
void reserve_pids(struct idr *idr, int pid_max); +extern void free_reserved_pid(struct idr *idr, int pid); +#endif + #endif /* CONFIG_PIN_MEMORY */ #endif /* _LINUX_PIN_MEMORY_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 716ad1d..f6b1560 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1281,6 +1281,10 @@ struct task_struct { unsigned long prev_lowest_stack; #endif +#ifdef CONFIG_PID_RECOVER + int fork_pid; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/init/init_task.c b/init/init_task.c index 9e5cbe5..40cd98f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -181,6 +181,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_PID_RECOVER + .fork_pid = 0, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/kernel/pid.c b/kernel/pid.c index 2278e24..dc93f7a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,9 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> +#ifdef CONFIG_PID_RECOVER +#include <linux/pin_mem.h> +#endif struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -208,8 +211,26 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); - +#ifdef CONFIG_PID_RECOVER + if (current->fork_pid) { + int pid_min; + /* Try to free the reserved fork_pid, and then use it to alloc pid. */ + free_reserved_pid(&tmp->idr, current->fork_pid); + pid_min = current->fork_pid; + current->fork_pid = 0; + nr = idr_alloc(&tmp->idr, NULL, pid_min, + pid_min + 1, + GFP_ATOMIC); + /* + * If ENOSPC is returned it means that the PID is + * alreay in use. Return EEXIST in that case. 
+ */ + if (nr == -ENOSPC) + nr = -EEXIST; + } else if (tid) { +#else if (tid) { +#endif nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* @@ -577,4 +598,8 @@ void __init pid_idr_init(void) init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); +#ifdef CONFIG_PID_RECOVER + if (is_need_reserve_pids()) + reserve_pids(&init_pid_ns.idr, pid_max); +#endif } diff --git a/mm/Kconfig b/mm/Kconfig index c2dd088..847e8e4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -745,4 +745,11 @@ config PIN_MEMORY help Say y here to enable the pin memory feature for checkpoint and restore. + +config PID_RECOVER + bool "Support for pid recover" + depends on PIN_MEMORY + help + Say y here to enable the pid recover feature for checkpoint + and restore. endmenu diff --git a/mm/pin_mem.c b/mm/pin_mem.c index ca3f23a..9426874 100644 --- a/mm/pin_mem.c +++ b/mm/pin_mem.c @@ -688,4 +688,55 @@ vm_fault_t reserve_kernel_space_mem(unsigned long start_addr, unsigned int pages } EXPORT_SYMBOL_GPL(reserve_kernel_space_mem); +#ifdef CONFIG_PID_RECOVER +struct idr *reserve_idr; + +/* test if there exist pin memory tasks */ +bool is_need_reserve_pids(void) +{ + return (pin_pid_num > 0); +} + +void free_reserved_pid(struct idr *idr, int pid) +{ + unsigned int index; + struct page_map_info *pmi; + + if (!max_pin_pid_num || idr != reserve_idr) + return; + + for (index = 0; index < pin_pid_num; index++) { + pmi = &(user_space_reserve_start[index]); + if (pmi->pid == pid && pmi->pid_reserved) { + idr_remove(idr, pid); + return; + } + } +} + +/* reserve pids for check point tasks which pinned memory */ +void reserve_pids(struct idr *idr, int pid_max) +{ + int alloc_pid; + unsigned int index; + struct page_map_info *pmi; + + if (!max_pin_pid_num) + return; + reserve_idr = idr; + for (index = 0; index < pin_pid_num; index++) { + pmi = &(user_space_reserve_start[index]); + pmi->pid_reserved = true; + alloc_pid = idr_alloc(idr, NULL, pmi->pid, pid_max, GFP_ATOMIC); + if 
(alloc_pid != pmi->pid) { + if (alloc_pid > 0) + idr_remove(idr, alloc_pid); + pr_warn("Reserve pid (%d) fail, real pid is %d.\n", alloc_pid, pmi->pid); + pmi->pid_reserved = false; + continue; + } + } + return; +} +#endif /* CONFIG_PID_RECOVER */ #endif /* CONFIG_PIN_MEMORY */ -- 1.8.3.1
On 2020/12/31 10:23, hejingxian wrote:
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1281,6 +1281,10 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif
+#ifdef CONFIG_PID_RECOVER
+ int fork_pid;
+#endif
This will cause thousands of KABI changes, so NACK for the LTS kernel.
Thanks Hanjun