Integrate CRIU with the kernel's rmfork feature (syscall 454): - Add --enable-rmfork CLI option to enable kernel-side memory save/restore - Dump path: skip parasite_dump_pages_seized, call rmfork_dump_mm_iov which delegates page table walking and page copy to the kernel - Restore path: call rmfork_restore_iov after VMA setup, the kernel inserts saved pages from ubmem via vm_insert_page - Meta offset persisted to imgs_dir/rmfork.meta between dump and restore Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- criu/Makefile.crtools | 1 + criu/config.c | 26 +++++++ criu/cr-dump.c | 16 +++- criu/cr-restore.c | 13 ++++ criu/include/cr_options.h | 2 + criu/include/rmfork.h | 28 +++++++ criu/rmfork.c | 156 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 239 insertions(+), 3 deletions(-) create mode 100644 criu/include/rmfork.h create mode 100644 criu/rmfork.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index ba6132d2f..56581f781 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -59,6 +59,7 @@ obj-y += protobuf-desc.o obj-y += protobuf.o obj-y += pstree.o obj-y += rbtree.o +obj-y += rmfork.o obj-y += rst-malloc.o obj-y += seccomp.o obj-y += seize.o diff --git a/criu/config.c b/criu/config.c index 1322a490a..007ee127b 100644 --- a/criu/config.c +++ b/criu/config.c @@ -671,6 +671,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "external", required_argument, 0, 1073 }, { "empty-ns", required_argument, 0, 1074 }, { "lazy-pages", no_argument, 0, 1076 }, + BOOL_OPT("skip-pages", &opts.skip_pages), + BOOL_OPT("enable-rmfork", &opts.enable_rmfork), BOOL_OPT("extra", &opts.check_extra_features), BOOL_OPT("experimental", &opts.check_experimental_features), { "all", no_argument, 0, 1079 }, @@ -1129,6 +1131,30 @@ int check_options(void) return 1; } + if (opts.skip_pages) { + if (opts.mode != CR_DUMP && opts.mode != CR_PRE_DUMP) { + pr_err("--skip-pages is only valid for dump and pre-dump\n"); + 
return 1; + } + if (opts.lazy_pages) { + pr_err("--skip-pages and --lazy-pages are mutually exclusive\n"); + return 1; + } + if (opts.use_page_server) { + pr_err("--skip-pages cannot be used with page server\n"); + return 1; + } + pr_info("Will skip pages.img generation (lightweight dump)\n"); + } + + if (opts.enable_rmfork) { + if (opts.mode != CR_DUMP && opts.mode != CR_RESTORE) { + pr_err("--enable-rmfork is only valid for dump and restore\n"); + return 1; + } + pr_info("RMFork enabled: kernel-side memory checkpoint/restore\n"); + } + if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index b8cf7d64d..57cb129aa 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -87,6 +87,7 @@ #include "apparmor.h" #include "asm/dump.h" #include "timer.h" +#include "rmfork.h" #include "sigact.h" /* @@ -1693,9 +1694,18 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) mdc.stat = &pps_buf; mdc.parent_ie = parent_ie; - ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); - if (ret) - goto err_cure; + if (opts.enable_rmfork) { + pr_info("rmfork: dumping pages via kernel for pid=%d\n", pid); + ret = rmfork_dump_mm_iov(pid, vpid(item), NULL, 0); + if (ret < 0) { + pr_err("rmfork: kernel dump failed for pid=%d\n", pid); + goto err_cure; + } + } else { + ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); + if (ret) + goto err_cure; + } ret = parasite_dump_sigacts_seized(parasite_ctl, item); if (ret) { diff --git a/criu/cr-restore.c b/criu/cr-restore.c index c1d1f4b9d..691595ff7 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -102,6 +102,7 @@ #include "cr-errno.h" #include "timer.h" #include "sigact.h" +#include "rmfork.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -705,6 +706,18 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (prepare_vmas(current, ta)) return -1; + /* 
+ * RMFork: restore memory from kernel ubmem instead of pages.img. + * Must be called after VMAs are set up, before normal pages restore. + */ + if (opts.enable_rmfork) { + pid_t my_pid = getpid(); + if (rmfork_restore_iov(my_pid, 0, NULL, 0) < 0) { + pr_err("rmfork: kernel restore failed for pid=%d\n", my_pid); + return -1; + } + } + /* * Sockets have to be restored in their network namespaces, * so a task namespace has to be restored after sockets. diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 4df8056b7..e03f3d07f 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -193,6 +193,8 @@ struct cr_options { unsigned int empty_ns; int tcp_skip_in_flight; bool lazy_pages; + bool skip_pages; /* Skip pages.img generation during dump */ + bool enable_rmfork; /* Enable RMFork (kernel-side checkpoint/restore) */ char *work_dir; int network_lock_method; int skip_file_rwx_check; diff --git a/criu/include/rmfork.h b/criu/include/rmfork.h new file mode 100644 index 000000000..bd71dffae --- /dev/null +++ b/criu/include/rmfork.h @@ -0,0 +1,28 @@ +#ifndef __CR_RMFORK_H__ +#define __CR_RMFORK_H__ + +#include <sys/uio.h> +#include <sys/types.h> + +/* + * Dump task memory via kernel rmfork syscall. + * Returns meta_off (positive) on success, -1 on error. + */ +extern long rmfork_dump_mm_iov(pid_t pid, pid_t vpid, + struct iovec *iovs, int nr); + +/* + * Get the meta offset previously saved during dump. + * Reads from the image directory. + */ +extern unsigned long rmfork_get_meta_off(void); + +/* + * Restore task memory via kernel rmfork syscall. + * If meta_off is 0, it will be read from the image directory. + * Returns 0 on success, -1 on error. 
+ */
+extern int rmfork_restore_iov(pid_t pid, unsigned long meta_off,
+			      struct iovec *iovs, int nr);
+
+#endif /* __CR_RMFORK_H__ */
diff --git a/criu/rmfork.c b/criu/rmfork.c
new file mode 100644
index 000000000..86836e3ee
--- /dev/null
+++ b/criu/rmfork.c
@@ -0,0 +1,219 @@
+/*
+ * RMFork - CRIU kernel-side checkpoint/restore helpers
+ *
+ * When --enable-rmfork is specified, CRIU delegates memory dump/restore
+ * to the kernel via the remote_fork syscall (syscall 454), using kernel-
+ * reserved pmem (ubmem). Userspace pages.img is not needed.
+ */
+
+#include <unistd.h>
+#include <limits.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+#include "crtools.h"
+#include "cr_options.h"
+#include "log.h"
+#include "mem.h"
+#include "common/list.h"
+#include "pstree.h"
+#include "image.h"
+#include "rmfork.h"
+
+/* Syscall number must match kernel __NR_remote_fork */
+#ifndef __NR_remote_fork
+#define __NR_remote_fork 454
+#endif
+
+/*
+ * Struct matching kernel's struct rmfork_kargs.
+ */
+struct rmfork_kargs {
+	unsigned long rmfork_opt;
+	unsigned long pid;
+	unsigned long va;
+	unsigned long pa;
+	unsigned long iovs;
+	unsigned long nr_iovs;
+};
+
+#define RMFORK_OPT_RESTORE_ONE 1
+#define RMFORK_OPT_RESTORE_ALL 2
+#define RMFORK_OPT_DUMP 3
+
+/* Meta offset file name stored in the image directory */
+#define RMFORK_META_FILE "rmfork.meta"
+
+/*
+ * Build "<imgs_dir>/rmfork.meta" in @buf. The caller must have checked
+ * that opts.imgs_dir is set.
+ * Returns 0 on success, -1 if the path does not fit into @size bytes.
+ */
+static int rmfork_meta_path(char *buf, size_t size)
+{
+	int n;
+
+	n = snprintf(buf, size, "%s/%s", opts.imgs_dir, RMFORK_META_FILE);
+	if (n < 0 || (size_t)n >= size) {
+		pr_err("rmfork: path to %s is too long\n", RMFORK_META_FILE);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Write @meta_off as "0x<hex>\n" to the meta file in the image directory.
+ * The caller must have checked that opts.imgs_dir is set.
+ * Returns 0 on success, -1 on error.
+ */
+static int rmfork_save_meta_off(unsigned long meta_off)
+{
+	char metapath[PATH_MAX];
+	FILE *f;
+	int ret = 0;
+
+	if (rmfork_meta_path(metapath, sizeof(metapath)))
+		return -1;
+
+	f = fopen(metapath, "w");
+	if (!f) {
+		pr_perror("rmfork: cannot write %s", metapath);
+		return -1;
+	}
+
+	if (fprintf(f, "0x%lx\n", meta_off) < 0)
+		ret = -1;
+	/* fclose() flushes the stream, so its result matters too */
+	if (fclose(f))
+		ret = -1;
+
+	if (ret)
+		pr_perror("rmfork: cannot write %s", metapath);
+	else
+		pr_info("rmfork: meta_off saved to %s\n", metapath);
+
+	return ret;
+}
+
+/*
+ * rmfork_dump_mm_iov - dump task memory via kernel syscall
+ *
+ * @pid is handed to the kernel as the task to dump, @iovs/@nr are passed
+ * through unchanged and @vpid is currently unused.
+ *
+ * On success the kernel-reported meta offset (kargs.pa) is saved to the
+ * image directory, if one is set, and returned. Returns -1 on error,
+ * including failure to save the offset. The result is a long and may not
+ * fit into an int.
+ *
+ * NOTE(review): the meta file name does not depend on the pid, so dumping
+ * several tasks into one image directory keeps only the last offset -
+ * confirm this is what restore expects.
+ */
+long rmfork_dump_mm_iov(pid_t pid, pid_t vpid, struct iovec *iovs, int nr)
+{
+	struct rmfork_kargs kargs = {
+		.rmfork_opt = RMFORK_OPT_DUMP,
+		.pid = (unsigned long)pid,
+		.iovs = (unsigned long)iovs,
+		.nr_iovs = (unsigned long)nr,
+	};
+
+	if (syscall(__NR_remote_fork, &kargs) < 0) {
+		pr_perror("rmfork: dump syscall failed for pid=%d", pid);
+		return -1;
+	}
+
+	/*
+	 * rmfork_restore_iov() treats 0 as "no offset given" and a value
+	 * above LONG_MAX cannot be returned as a positive long, so neither
+	 * can be reported as a successful dump.
+	 */
+	if (kargs.pa == 0 || kargs.pa > (unsigned long)LONG_MAX) {
+		pr_err("rmfork: unusable meta_off=0x%lx for pid=%d\n",
+		       kargs.pa, pid);
+		return -1;
+	}
+
+	pr_info("rmfork: dump done for pid=%d, meta_off=0x%lx\n",
+		pid, kargs.pa);
+
+	/* Restore reads the offset back from the image directory */
+	if (opts.imgs_dir && rmfork_save_meta_off(kargs.pa))
+		return -1;
+
+	return (long)kargs.pa;
+}
+
+/*
+ * rmfork_get_meta_off - read meta offset from image directory
+ * Returns meta_off, or 0 if unavailable.
+ */
+unsigned long rmfork_get_meta_off(void)
+{
+	char metapath[PATH_MAX];
+	unsigned long meta_off = 0;
+	FILE *f;
+
+	if (!opts.imgs_dir)
+		return 0;
+
+	if (rmfork_meta_path(metapath, sizeof(metapath)))
+		return 0;
+
+	f = fopen(metapath, "r");
+	if (!f) {
+		pr_debug("rmfork: %s not found\n", metapath);
+		return 0;
+	}
+
+	/* The file holds the "0x<hex>" line written at dump time */
+	if (fscanf(f, "0x%lx", &meta_off) != 1) {
+		pr_err("rmfork: failed to parse %s\n", metapath);
+		meta_off = 0;
+	}
+
+	fclose(f);
+	return meta_off;
+}
+
+/*
+ * rmfork_restore_iov - restore task memory via kernel syscall
+ *
+ * A @meta_off of 0 means "look the offset up in the image directory".
+ * Returns 0 on success, -1 on error.
+ */
+int rmfork_restore_iov(pid_t pid, unsigned long meta_off,
+		       struct iovec *iovs, int nr)
+{
+	struct rmfork_kargs kargs = {
+		.rmfork_opt = RMFORK_OPT_RESTORE_ONE,
+		.pid = (unsigned long)pid,
+		.iovs = (unsigned long)iovs,
+		.nr_iovs = (unsigned long)nr,
+	};
+
+	if (meta_off == 0)
+		meta_off = rmfork_get_meta_off();
+
+	if (meta_off == 0) {
+		pr_err("rmfork: no meta offset available for restore (pid=%d)\n",
+		       pid);
+		return -1;
+	}
+
+	kargs.pa = meta_off;
+
+	if (syscall(__NR_remote_fork, &kargs) < 0) {
+		pr_perror("rmfork: restore syscall failed for pid=%d", pid);
+		return -1;
+	}
+
+	pr_info("rmfork: restore done for pid=%d (meta_off=0x%lx)\n",
+		pid, meta_off);
+	return 0;
+}
-- 
2.53.0