From: Jingxian He <hejingxian@huawei.com>
Add a pin-memory method to CRIU to speed up memory recovery and to avoid saving users' private data to files.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He <hejingxian@huawei.com> --- criu/config.c | 1 + criu/cr-dump.c | 5 ++ criu/cr-restore.c | 5 ++ criu/crtools.c | 3 +- criu/include/cr_options.h | 1 + criu/include/mem.h | 2 + criu/include/restorer.h | 28 ++++++++ criu/mem.c | 130 +++++++++++++++++++++++++++++++++++++- criu/pie/restorer.c | 25 +++++++- criu/seize.c | 6 ++ 10 files changed, 203 insertions(+), 3 deletions(-)
diff --git a/criu/config.c b/criu/config.c index 5a53256..61b81fa 100644 --- a/criu/config.c +++ b/criu/config.c @@ -542,6 +542,7 @@ int parse_options(int argc, char **argv, bool *usage_error, { "pre-dump-mode", required_argument, 0, 1097}, { "file-validation", required_argument, 0, 1098 }, BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), + BOOL_OPT("pin-memory", &opts.pin_memory), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f078c27..8575516 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1778,6 +1778,11 @@ static int cr_dump_finish(int ret) close_service_fd(CR_PROC_FD_OFF); close_image_dir();
+ if (ret == 0 && opts.pin_memory) { + pr_info("start restore_task_special_pages\n"); + restore_task_special_pages(0); + } + if (ret) { pr_err("Dumping FAILED.\n"); } else { diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1374a69..27f3c54 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3869,6 +3869,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns task_args->clone_restore_fn, task_args->thread_args);
+ if (opts.pin_memory) + task_args->pin_memory = true; + else + task_args->pin_memory = false; + /* * An indirect call to task_restore, note it never returns * and restoring core is extremely destructive. diff --git a/criu/crtools.c b/criu/crtools.c index 949dc9f..7bda86d 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -441,8 +441,9 @@ usage: " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" -" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" +" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" " same cpu quantity.\n" +" --pin-memory Use pin memory method for checkpoint and restore.\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index fda54a4..a4dc5b8 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -176,6 +176,7 @@ struct cr_options { int file_validation_method; /* restore cpu affinity */ int with_cpu_affinity; + int pin_memory; };
extern struct cr_options opts; diff --git a/criu/include/mem.h b/criu/include/mem.h index 251cb1a..3b3fdf8 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -50,4 +50,6 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); bool should_dump_page(VmaEntry *vmae, u64 pme); +int dump_task_special_pages(int pid); +int restore_task_special_pages(int pid); #endif /* __CR_MEM_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index bd6ef6a..9614720 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -225,6 +225,7 @@ struct task_restore_args { int lsm_type; int child_subreaper; bool has_clone3_set_tid; + bool pin_memory; } __aligned(64);
/* @@ -317,4 +318,31 @@ enum { #define __r_sym(name) restorer_sym ## name #define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name))
+#define PIN_MEM_FILE "/dev/pinmem" +#define PIN_MEM_MAGIC 0x59 +#define _SET_PIN_MEM_AREA 1 +#define _CLEAR_PIN_MEM_AREA 2 +#define _REMAP_PIN_MEM_AREA 3 +#define _DUMP_SEPCIAL_PAGES 6 +#define _RETORE_SEPCIAL_PAGES 7 +#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) +#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) +#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) +#define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) +#define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) + +#define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 +#define MAX_PIN_MEM_AREA_NUM 16 + +struct pin_mem_area { + unsigned long virt_start; + unsigned long virt_end; +}; + +struct pin_mem_area_set { + unsigned int pid; + unsigned int area_num; + struct pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; +}; + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/mem.c b/criu/mem.c index 167838b..2eabb8d 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -438,6 +438,119 @@ again: return ret; }
+bool should_pin_vmae(VmaEntry *vmae) +{ + /* + * vDSO area must be always dumped because on restore + * we might need to generate a proxy. + */ + if (vma_entry_is(vmae, VMA_AREA_VDSO)) + return false; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. + */ + if (vma_entry_is(vmae, VMA_AREA_VVAR)) + return false; + + if (vma_entry_is(vmae, VMA_AREA_AIORING)) + return false; + if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) + return true; + + return false; +} + +static int pin_one_pmas(int fd, unsigned long start, + unsigned long *pend, struct pstree_item *item) +{ + int ret; + unsigned int index = 0; + unsigned long end; + unsigned long next = start; + struct pin_mem_area_set pmas; + struct pin_mem_area *pma; + + end = *pend; + while (start < end) { + next = (start + ONCE_PIN_MEM_SIZE_LIMIT > end) ? 
end : (start + ONCE_PIN_MEM_SIZE_LIMIT); + pma = &(pmas.mem_area[index]); + pma->virt_start = start; + pma->virt_end = next; + index++; + start += ONCE_PIN_MEM_SIZE_LIMIT; + if (index >= MAX_PIN_MEM_AREA_NUM) + break; + } + *pend = next; + pmas.area_num = index; + pmas.pid = vpid(item); + ret = ioctl(fd, SET_PIN_MEM_AREA, &pmas); + if (ret < 0) + pr_err("pin mem fail, errno: %s\n", strerror(errno)); + return ret; +} +static int pin_vmae(VmaEntry *vmae, struct pstree_item *item) +{ + int fd; + int ret = 0; + unsigned long start, end; + + fd = open(PIN_MEM_FILE, O_RDWR); + if (fd < 0) { + pr_err("open file: %s fail.\n", PIN_MEM_FILE); + return -1; + } + start = vmae->start; + while (start < vmae->end) { + end = vmae->end; + ret = pin_one_pmas(fd, start, &end, item); + if (ret < 0) + break; + start = end; + } + close(fd); + return ret; +} + +int dump_task_special_pages(int pid) +{ + int fd, ret; + + fd = open(PIN_MEM_FILE, O_RDWR, 0); + if (fd < 0) { + pr_warn("error open file: %s\n", PIN_MEM_FILE); + return -1; + } + ret = ioctl(fd, DUMP_SEPCIAL_PAGES, (unsigned long) &pid); + if (ret < 0) { + pr_warn("No need DUMP_SEPCIAL_PAGES for %d\n", pid); + } + close(fd); + return ret; +} + +int restore_task_special_pages(int pid) +{ + int fd, ret; + + fd = open(PIN_MEM_FILE, O_RDWR, 0); + if (fd < 0) { + pr_warn("error open file: %s\n", PIN_MEM_FILE); + return -1; + } + ret = ioctl(fd, RETORE_SEPCIAL_PAGES, (unsigned long) &pid); + if (ret < 0) { + pr_warn("No need RETORE_SEPCIAL_PAGES for %d\n", pid); + } + close(fd); + return ret; +} + + static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasite_dump_pages_args *args, struct vm_area_list *vma_area_list, @@ -513,7 +626,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, if (possible_pid_reuse == -1) goto out_xfer; } - + if (opts.pin_memory) { + /* pin memory before dump pages */ + list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (should_pin_vmae(vma_area->e)) { + ret = 
pin_vmae(vma_area->e, item); + if (ret) { + exit_code = -1; + goto out_xfer; + } + } + } + }
/* * Step 1 -- generate the pagemap @@ -524,6 +648,10 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, parent_predump_mode = mdc->parent_ie->pre_dump_mode;
list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (opts.pin_memory && should_pin_vmae(vma_area->e)) { + continue; + } + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index c63f96b..1565e3c 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1414,6 +1414,24 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) return 0; }
+int remap_vmas(int pid) +{ + int fd, ret = 0; + + fd = sys_open(PIN_MEM_FILE, O_RDWR, 0); + if (fd == -1) { + pr_err("open file: %s fail.\n", PIN_MEM_FILE); + return -1;; + } + + ret = sys_ioctl(fd, REMAP_PIN_MEM_AREA, (unsigned long) &pid); + if (ret < 0) + pr_err("remap pin mem fail for pid: %d\n", pid); + sys_close(fd); + return ret; +} + + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -1585,7 +1603,12 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } } - + if (args->pin_memory) { + if (remap_vmas(my_pid) < 0) { + pr_err("Remap vmas fail\n"); + goto core_restore_end; + } + } /* * Now read the contents (if any) */ diff --git a/criu/seize.c b/criu/seize.c index f973806..a661097 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -23,6 +23,7 @@ #include "string.h" #include "xmalloc.h" #include "util.h" +#include "mem.h"
#define NR_ATTEMPTS 5
@@ -655,6 +656,11 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) if (item->pid->state == TASK_DEAD) return;
+ if (opts.pin_memory) { + for (i = 0; i < item->nr_threads; i++) + dump_task_special_pages(item->threads[i].real); + } + /* * The st is the state we want to switch tasks into, * the item->state is the state task was in when we seized one.