From: hejingxian <hejingxian@huawei.com>
Date: Thu, 31 Dec 2020 20:18:56 +0800
Subject: [PATCH v2 1/1 openEuler-1.0-LTS] fork: add pid recover method for checkpoint and recover
We record the pid of dump task in the reserved memory, and reserve the pids before
init task start. In the recover process, set the fork_pid of the recovery task before
call fork(). And then the fork_pid will be used to alloc pid.
/proc/sys/kernel/ns_last_pid can also be used to fork child task with assigned pid.
However, when there exist many tasks need to recover at the same time, we will fail to
recover pids by using /proc/sys/kernel/ns_last_pid.
Signed-off-by: Jingxian He <hejingxian@huawei.com>
---
drivers/char/pin_memory.c | 24 +++++++++++++++++++++-
include/linux/pin_mem.h | 6 ++++++
include/linux/sched.h | 9 +++++++++
init/init_task.c | 3 +++
kernel/pid.c | 33 ++++++++++++++++++++++++++++++
mm/Kconfig | 7 +++++++
mm/pin_mem.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 132 insertions(+), 1 deletion(-)
diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c
index a0464e1..2c65de7 100644
--- a/drivers/char/pin_memory.c
+++ b/drivers/char/pin_memory.c
@@ -36,9 +36,12 @@ struct pin_mem_area_set {
#define _SET_PIN_MEM_AREA 1
#define _CLEAR_PIN_MEM_AREA 2
#define _REMAP_PIN_MEM_AREA 3
+#define _SET_FORK_PID 4
+
#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set)
#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int)
#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int)
+#define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int)
static int set_pin_mem(struct pin_mem_area_set *pmas)
{
@@ -136,13 +139,29 @@ static int pin_mem_remap(unsigned long arg)
return -EFAULT;
}
+static int set_fork_pid(unsigned long arg)
+{
+ int pid;
+ struct page_map_info * pmi = NULL;
+ void __user *buf = (void __user *)arg;
+
+ if (!access_ok(buf, sizeof(int)))
+ goto fault;
+ if (copy_from_user(&pid, buf, sizeof(int)))
+ goto fault;
+ current->fork_pid = pid;
+ return 0;
+fault:
+ return -EFAULT;
+}
+
static long pin_memory_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
long ret = 0;
if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC)
return -EINVAL;
- if (_IOC_NR(cmd) > _REMAP_PIN_MEM_AREA)
+ if (_IOC_NR(cmd) > _SET_FORK_PID)
return -EINVAL;
switch (cmd) {
@@ -155,6 +174,9 @@ static long pin_memory_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case REMAP_PIN_MEM_AREA:
ret = pin_mem_remap(arg);
break;
+ case SET_FORK_PID:
+ ret = set_fork_pid(arg);
+ break;
default:
return -EINVAL;
}
diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h
index 997ddb8..f47dab4 100644
--- a/include/linux/pin_mem.h
+++ b/include/linux/pin_mem.h
@@ -58,5 +58,11 @@ extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned l
#endif
extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size);
+#ifdef CONFIG_PID_RECOVER
+extern bool is_need_reserve_pids(void);
+extern void reserve_pids(struct idr *idr, int pid_max);
+extern void free_reserved_pid(struct idr *idr, int pid);
+#endif
+
#endif /* CONFIG_PIN_MEMORY */
#endif /* _LINUX_PIN_MEMORY_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ced2d7f..3d23e46 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,16 @@ struct task_struct {
KABI_RESERVE(1)
KABI_RESERVE(2)
#endif
+#ifdef CONFIG_PID_RECOVER
+#ifndef __GENKSYMS__
+ int fork_pid;
+ int reserve;
+#else
KABI_RESERVE(3)
+#endif
+#else
+ KABI_RESERVE(3)
+#endif
KABI_RESERVE(4)
KABI_RESERVE(5)
KABI_RESERVE(6)
diff --git a/init/init_task.c b/init/init_task.c
index 5aebe3b..0334ad8 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -179,6 +179,9 @@ struct task_struct init_task
#ifdef CONFIG_SECURITY
.security = NULL,
#endif
+#ifdef CONFIG_PID_RECOVER
+ .fork_pid = 0,
+#endif
};
EXPORT_SYMBOL(init_task);
diff --git a/kernel/pid.c b/kernel/pid.c
index bfdcd16..5d02591 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,6 +41,9 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/kmemleak.h>
+#ifdef CONFIG_PID_RECOVER
+#include <linux/pin_mem.h>
+#endif
struct pid init_struct_pid = {
.count = ATOMIC_INIT(1),
@@ -185,6 +188,31 @@ struct pid *alloc_pid(struct pid_namespace *ns)
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
+#ifdef CONFIG_PID_RECOVER
+ if (!current->fork_pid) {
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ tmp->pid_max,
+ GFP_ATOMIC);
+ } else {
+ /* Try to use fork_pid to alloc pid, and change to use the default way after fail. */
+ test_pid_reserved_and_free(&tmp->idr, current->fork_pid);
+ pid_min = current->fork_pid;
+ current->fork_pid = 0;
+ nr = idr_alloc(&tmp->idr, NULL, pid_min,
+ tmp->pid_max,
+ GFP_ATOMIC);
+ pr_debug("Alloc pid by fork_pid ret: %d.", nr);
+ if (nr < 0)
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ tmp->pid_max,
+ GFP_ATOMIC);
+ }
+#else
+
/*
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
@@ -192,6 +220,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
tmp->pid_max,
GFP_ATOMIC);
+#endif
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -500,6 +529,10 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+#ifdef CONFIG_PID_RECOVER
+ if (is_need_reserve_pids())
+ reserve_pids(&init_pid_ns.idr, pid_max);
+#endif
hdr = register_sysctl_paths(pid_kern_path, pid_ctl_table);
kmemleak_not_leak(hdr);
diff --git a/mm/Kconfig b/mm/Kconfig
index 31fa670..9ac20da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -794,4 +794,11 @@ config PIN_MEMORY
help
Say y here to enable the pin memory feature for checkpoint
and restore.
+
+config PID_RECOVER
+ bool "Support for pid recover"
+ depends on PIN_MEMORY
+ help
+ Say y here to enable the pid recover feature for checkpoint
+ and restore.
endmenu
diff --git a/mm/pin_mem.c b/mm/pin_mem.c
index 89af938..2f77e86 100644
--- a/mm/pin_mem.c
+++ b/mm/pin_mem.c
@@ -688,4 +688,55 @@ vm_fault_t reserve_kernel_space_mem(unsigned long start_addr, unsigned int pages
}
EXPORT_SYMBOL_GPL(reserve_kernel_space_mem);
+#ifdef CONFIG_PID_RECOVER
+struct idr *reserve_idr;
+
+/* test if there exist pin memory tasks */
+bool is_need_reserve_pids(void)
+{
+ return (pin_pid_num > 0);
+}
+
+void free_reserved_pid(struct idr *idr, int pid)
+{
+ unsigned int index;
+ struct page_map_info *pmi;
+
+ if (!max_pin_pid_num || idr != reserve_idr)
+ return;
+
+ for (index = 0; index < pin_pid_num; index++) {
+ pmi = &(user_space_reserve_start[index]);
+ if (pmi->pid == pid && pmi->pid_reserved) {
+ idr_remove(idr, pid);
+ return;
+ }
+ }
+}
+
+/* reserve pids for check point tasks which pinned memory */
+void reserve_pids(struct idr *idr, int pid_max)
+{
+ int alloc_pid;
+ unsigned int index;
+ struct page_map_info *pmi;
+
+ if (!max_pin_pid_num)
+ return;
+ reserve_idr = idr;
+ for (index = 0; index < pin_pid_num; index++) {
+ pmi = &(user_space_reserve_start[index]);
+ pmi->pid_reserved = true;
+ alloc_pid = idr_alloc(idr, NULL, pmi->pid, pid_max, GFP_ATOMIC);
+ if (alloc_pid != pmi->pid) {
+ if (alloc_pid > 0)
+ idr_remove(idr, alloc_pid);
+ pr_warn("Reserve pid (%d) fail, real pid is %d.\n", alloc_pid, pmi->pid);
+ pmi->pid_reserved = false;
+ continue;
+ }
+ }
+ return;
+}
+#endif /* CONFIG_PID_RECOVER */
#endif /* CONFIG_PIN_MEMORY */
--
1.8.3.1