Qi Xi (4):
  skip-pages: add --skip-pages option to omit pages.img generation
  rmfork: add --enable-rmfork for kernel-side checkpoint/restore
  rmfork: add restorer blob support for kernel-side memory restore
  rmfork: integrate into dump/restore infrastructure

 criu/Makefile.crtools     |   1 +
 criu/config.c             |  26 +++++
 criu/cr-dump.c            |  18 +++-
 criu/cr-restore.c         |  14 +++
 criu/include/cr_options.h |   2 +
 criu/include/linux/rseq.h |   2 +-
 criu/include/restorer.h   |   3 +
 criu/include/rmfork.h     |  28 ++++++
 criu/mem.c                |  21 +++-
 criu/page-xfer.c          |  44 ++++++--
 criu/pie/restorer.c       | 204 +++++++++++++++++++++++++++++---------
 criu/rmfork.c             | 156 +++++++++++++++++++++++++++++
 criu/shmem.c              |  19 ++--
 criu/tty.c                |   2 +-
 14 files changed, 471 insertions(+), 69 deletions(-)
 create mode 100644 criu/include/rmfork.h
 create mode 100644 criu/rmfork.c
--
2.53.0
Add a new --skip-pages flag for lightweight dumps that only generate pagemap.img without the actual pages.img. All pages are marked as PE_LAZY so that external restore mechanisms can provide page data. This is useful for RMFork integration where kernel-side pmem handles the actual page data. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- criu/mem.c | 16 +++++++++++++++- criu/page-xfer.c | 44 ++++++++++++++++++++++++++++++++++++-------- criu/shmem.c | 19 +++++++++++++------ 3 files changed, 64 insertions(+), 15 deletions(-) diff --git a/criu/mem.c b/criu/mem.c index 803cb545b..b06a0d850 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -543,7 +543,15 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit if (ret < 0) goto out_pp; - xfer.transfer_lazy = !mdc->lazy; + /* + * With --skip-pages, we don't write actual page data. + * Set transfer_lazy to false so pages are only recorded + * in pagemap without being written to pages.img. + */ + if (opts.skip_pages) + xfer.transfer_lazy = false; + else + xfer.transfer_lazy = !mdc->lazy; } else { ret = check_parent_page_xfer(CR_FD_PAGEMAP, vpid(item)); if (ret < 0) @@ -582,9 +590,15 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit * will happen after task unfreezing in cr_pre_dump_finish(). This is * actual optimization which reduces time for which process was frozen * during pre-dump. + * + * With --skip-pages, we don't need to actually drain page data + * since pages.img won't be generated. This is a significant + * optimization for lightweight snapshots. 
*/ if (mdc->pre_dump && opts.pre_dump_mode == PRE_DUMP_READ) ret = 0; + else if (opts.skip_pages) + ret = 0; /* Skip draining pages for lightweight dump */ else ret = drain_pages(pp, ctl, args); diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 0314963e6..f59983782 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -257,6 +257,10 @@ static int write_pages_loc(struct page_xfer *xfer, int p, unsigned long len) ssize_t ret; ssize_t curr = 0; + /* With --skip-pages, pi is NULL, so just return success */ + if (!xfer->pi) + return 0; + while (1) { ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len - curr, SPLICE_F_MOVE); if (ret == -1) { @@ -358,7 +362,8 @@ static void close_page_xfer(struct page_xfer *xfer) xfree(xfer->parent); xfer->parent = NULL; } - close_image(xfer->pi); + if (xfer->pi) + close_image(xfer->pi); close_image(xfer->pmi); } @@ -370,9 +375,22 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, unsigned lo if (!xfer->pmi) return -1; - xfer->pi = open_pages_image(O_DUMP, xfer->pmi, &pages_id); - if (!xfer->pi) - goto err_pmi; + /* + * With --skip-pages, we only create pagemap.img without pages.img. + * Pages will be marked as PE_LAZY for external restore mechanism. + */ + if (opts.skip_pages) { + /* Still write PagemapHead with a dummy pages_id for compatibility */ + PagemapHead h = PAGEMAP_HEAD__INIT; + h.pages_id = 0; /* No actual pages.img will exist */ + if (pb_write_one(xfer->pmi, &h, PB_PAGEMAP_HEAD) < 0) + goto err_pmi; + xfer->pi = NULL; + } else { + xfer->pi = open_pages_image(O_DUMP, xfer->pmi, &pages_id); + if (!xfer->pi) + goto err_pmi; + } /* * Open page-read for parent images (if it exists). 
It will @@ -380,9 +398,10 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, unsigned lo * 1) when writing a page, those from parent will be dedup-ed * 2) when writing a hole, the respective place would be checked * to exist in parent (either pagemap or hole) + * Note: skip-pages mode doesn't support incremental dump, so skip this. */ xfer->parent = NULL; - if (fd_type == CR_FD_PAGEMAP || fd_type == CR_FD_SHMEM_PAGEMAP) { + if (!opts.skip_pages && (fd_type == CR_FD_PAGEMAP || fd_type == CR_FD_SHMEM_PAGEMAP)) { int ret; int pfd; int pr_flags = (fd_type == CR_FD_PAGEMAP) ? PR_TASK : PR_SHMEM; @@ -420,7 +439,8 @@ out: return 0; err_pi: - close_image(xfer->pi); + if (xfer->pi) + close_image(xfer->pi); err_pmi: close_image(xfer->pmi); return -1; @@ -490,8 +510,16 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p * as present as well. */ return (xfer->transfer_lazy ? PE_PRESENT : 0) | PE_LAZY; - else - return PE_PRESENT; + + /* + * With --skip-pages, all pages are marked as PE_LAZY without PE_PRESENT. + * This signals that page data is not in pages.img and should be + * provided by external mechanism during restore. + */ + if (opts.skip_pages) + return PE_LAZY; + + return PE_PRESENT; } /* diff --git a/criu/shmem.c b/criu/shmem.c index 9e3178352..d984d2c17 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -693,12 +693,19 @@ static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer) { struct page_pipe_buf *ppb; - list_for_each_entry(ppb, &pp->bufs, l) - if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs, SPLICE_F_GIFT | SPLICE_F_NONBLOCK) != - ppb->pages_in * PAGE_SIZE) { - pr_perror("Can't get shmem into page-pipe"); - return -1; - } + /* + * With --skip-pages, we don't need to actually splice page data + * since pages.img won't be generated. This is an optimization + * for lightweight snapshots. 
+ */ + if (!opts.skip_pages) { + list_for_each_entry(ppb, &pp->bufs, l) + if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs, SPLICE_F_GIFT | SPLICE_F_NONBLOCK) != + ppb->pages_in * PAGE_SIZE) { + pr_perror("Can't get shmem into page-pipe"); + return -1; + } + } return page_xfer_dump_pages(xfer, pp); } -- 2.53.0
Integrate CRIU with the kernel's rmfork feature (syscall 454): - Add --enable-rmfork CLI option to enable kernel-side memory save/restore - Dump path: skip parasite_dump_pages_seized, call rmfork_dump_mm_iov which delegates page table walking and page copy to the kernel - Restore path: call rmfork_restore_iov after VMA setup, the kernel inserts saved pages from ubmem via vm_insert_page - Meta offset persisted to imgs_dir/rmfork.meta between dump and restore Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- criu/Makefile.crtools | 1 + criu/config.c | 26 +++++++ criu/cr-dump.c | 16 +++- criu/cr-restore.c | 13 ++++ criu/include/cr_options.h | 2 + criu/include/rmfork.h | 28 +++++++ criu/rmfork.c | 156 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 239 insertions(+), 3 deletions(-) create mode 100644 criu/include/rmfork.h create mode 100644 criu/rmfork.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index ba6132d2f..56581f781 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -59,6 +59,7 @@ obj-y += protobuf-desc.o obj-y += protobuf.o obj-y += pstree.o obj-y += rbtree.o +obj-y += rmfork.o obj-y += rst-malloc.o obj-y += seccomp.o obj-y += seize.o diff --git a/criu/config.c b/criu/config.c index 1322a490a..007ee127b 100644 --- a/criu/config.c +++ b/criu/config.c @@ -671,6 +671,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "external", required_argument, 0, 1073 }, { "empty-ns", required_argument, 0, 1074 }, { "lazy-pages", no_argument, 0, 1076 }, + BOOL_OPT("skip-pages", &opts.skip_pages), + BOOL_OPT("enable-rmfork", &opts.enable_rmfork), BOOL_OPT("extra", &opts.check_extra_features), BOOL_OPT("experimental", &opts.check_experimental_features), { "all", no_argument, 0, 1079 }, @@ -1129,6 +1131,30 @@ int check_options(void) return 1; } + if (opts.skip_pages) { + if (opts.mode != CR_DUMP && opts.mode != CR_PRE_DUMP) { + pr_err("--skip-pages is only valid for dump and pre-dump\n"); + 
return 1; + } + if (opts.lazy_pages) { + pr_err("--skip-pages and --lazy-pages are mutually exclusive\n"); + return 1; + } + if (opts.use_page_server) { + pr_err("--skip-pages cannot be used with page server\n"); + return 1; + } + pr_info("Will skip pages.img generation (lightweight dump)\n"); + } + + if (opts.enable_rmfork) { + if (opts.mode != CR_DUMP && opts.mode != CR_RESTORE) { + pr_err("--enable-rmfork is only valid for dump and restore\n"); + return 1; + } + pr_info("RMFork enabled: kernel-side memory checkpoint/restore\n"); + } + if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index b8cf7d64d..57cb129aa 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -87,6 +87,7 @@ #include "apparmor.h" #include "asm/dump.h" #include "timer.h" +#include "rmfork.h" #include "sigact.h" /* @@ -1693,9 +1694,18 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) mdc.stat = &pps_buf; mdc.parent_ie = parent_ie; - ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); - if (ret) - goto err_cure; + if (opts.enable_rmfork) { + pr_info("rmfork: dumping pages via kernel for pid=%d\n", pid); + ret = rmfork_dump_mm_iov(pid, vpid(item), NULL, 0); + if (ret < 0) { + pr_err("rmfork: kernel dump failed for pid=%d\n", pid); + goto err_cure; + } + } else { + ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); + if (ret) + goto err_cure; + } ret = parasite_dump_sigacts_seized(parasite_ctl, item); if (ret) { diff --git a/criu/cr-restore.c b/criu/cr-restore.c index c1d1f4b9d..691595ff7 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -102,6 +102,7 @@ #include "cr-errno.h" #include "timer.h" #include "sigact.h" +#include "rmfork.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -705,6 +706,18 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (prepare_vmas(current, ta)) return -1; + /* 
+ * RMFork: restore memory from kernel ubmem instead of pages.img. + * Must be called after VMAs are set up, before normal pages restore. + */ + if (opts.enable_rmfork) { + pid_t my_pid = getpid(); + if (rmfork_restore_iov(my_pid, 0, NULL, 0) < 0) { + pr_err("rmfork: kernel restore failed for pid=%d\n", my_pid); + return -1; + } + } + /* * Sockets have to be restored in their network namespaces, * so a task namespace has to be restored after sockets. diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 4df8056b7..e03f3d07f 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -193,6 +193,8 @@ struct cr_options { unsigned int empty_ns; int tcp_skip_in_flight; bool lazy_pages; + bool skip_pages; /* Skip pages.img generation during dump */ + bool enable_rmfork; /* Enable RMFork (kernel-side checkpoint/restore) */ char *work_dir; int network_lock_method; int skip_file_rwx_check; diff --git a/criu/include/rmfork.h b/criu/include/rmfork.h new file mode 100644 index 000000000..bd71dffae --- /dev/null +++ b/criu/include/rmfork.h @@ -0,0 +1,28 @@ +#ifndef __CR_RMFORK_H__ +#define __CR_RMFORK_H__ + +#include <sys/uio.h> +#include <sys/types.h> + +/* + * Dump task memory via kernel rmfork syscall. + * Returns meta_off (positive) on success, -1 on error. + */ +extern long rmfork_dump_mm_iov(pid_t pid, pid_t vpid, + struct iovec *iovs, int nr); + +/* + * Get the meta offset previously saved during dump. + * Reads from the image directory. + */ +extern unsigned long rmfork_get_meta_off(void); + +/* + * Restore task memory via kernel rmfork syscall. + * If meta_off is 0, it will be read from the image directory. + * Returns 0 on success, -1 on error. 
+ */ +extern int rmfork_restore_iov(pid_t pid, unsigned long meta_off, + struct iovec *iovs, int nr); + +#endif /* __CR_RMFORK_H__ */ diff --git a/criu/rmfork.c b/criu/rmfork.c new file mode 100644 index 000000000..86836e3ee --- /dev/null +++ b/criu/rmfork.c @@ -0,0 +1,156 @@ +/* + * RMFork - CRIU kernel-side checkpoint/restore helpers + * + * When --enable-rmfork is specified, CRIU delegates memory dump/restore + * to the kernel via the remote_fork syscall (syscall 454), using kernel- + * reserved pmem (ubmem). Userspace pages.img is not needed. + */ + +#include <unistd.h> +#include <limits.h> +#include <sys/syscall.h> +#include <sys/uio.h> +#include <errno.h> +#include <stdio.h> +#include <fcntl.h> + +#include "crtools.h" +#include "cr_options.h" +#include "log.h" +#include "mem.h" +#include "common/list.h" +#include "pstree.h" +#include "image.h" + +/* Syscall number must match kernel __NR_remote_fork */ +#ifndef __NR_remote_fork +#define __NR_remote_fork 454 +#endif + +/* + * Struct matching kernel's struct rmfork_kargs. + */ +struct rmfork_kargs { + unsigned long rmfork_opt; + unsigned long pid; + unsigned long va; + unsigned long pa; + unsigned long iovs; + unsigned long nr_iovs; +}; + +#define RMFORK_OPT_RESTORE_ONE 1 +#define RMFORK_OPT_RESTORE_ALL 2 +#define RMFORK_OPT_DUMP 3 + +/* Meta offset file name stored in the image directory */ +#define RMFORK_META_FILE "rmfork.meta" + +/* + * rmfork_dump_mm_iov - dump task memory via kernel syscall + * Returns meta offset in ubmem (positive) on success, or -1 on error. 
+ */ +long rmfork_dump_mm_iov(pid_t pid, pid_t vpid, struct iovec *iovs, int nr) +{ + struct rmfork_kargs kargs; + long ret; + char metapath[PATH_MAX]; + FILE *f; + + kargs.rmfork_opt = RMFORK_OPT_DUMP; + kargs.pid = (unsigned long)pid; + kargs.va = 0; + kargs.pa = 0; + kargs.iovs = (unsigned long)iovs; + kargs.nr_iovs = (unsigned long)nr; + + ret = syscall(__NR_remote_fork, &kargs); + if (ret < 0) { + pr_perror("rmfork: dump syscall failed for pid=%d", pid); + return -1; + } + + pr_info("rmfork: dump done for pid=%d, meta_off=0x%lx\n", + pid, kargs.pa); + + /* Write meta offset to image dir for restore */ + if (opts.imgs_dir) { + snprintf(metapath, sizeof(metapath), "%s/%s", + opts.imgs_dir, RMFORK_META_FILE); + f = fopen(metapath, "w"); + if (f) { + fprintf(f, "0x%lx\n", kargs.pa); + fclose(f); + pr_info("rmfork: meta_off saved to %s\n", metapath); + } else { + pr_perror("rmfork: cannot write %s", metapath); + } + } + + return (long)kargs.pa; +} + +/* + * rmfork_get_meta_off - read meta offset from image directory + * Returns meta_off, or 0 if unavailable. + */ +unsigned long rmfork_get_meta_off(void) +{ + char metapath[PATH_MAX]; + FILE *f; + unsigned long meta_off = 0; + + if (!opts.imgs_dir) + return 0; + + snprintf(metapath, sizeof(metapath), "%s/%s", + opts.imgs_dir, RMFORK_META_FILE); + f = fopen(metapath, "r"); + if (!f) { + pr_debug("rmfork: %s not found\n", metapath); + return 0; + } + + if (fscanf(f, "0x%lx", &meta_off) != 1) + pr_err("rmfork: failed to parse %s\n", metapath); + + fclose(f); + return meta_off; +} + +/* + * rmfork_restore_iov - restore task memory via kernel syscall + * Returns 0 on success, -1 on error. 
+ */ +int rmfork_restore_iov(pid_t pid, unsigned long meta_off, + struct iovec *iovs, int nr) +{ + struct rmfork_kargs kargs; + long ret; + + if (meta_off == 0) + meta_off = rmfork_get_meta_off(); + + if (meta_off == 0) { + pr_err("rmfork: no meta offset available for restore (pid=%d)\n", + pid); + return -1; + } + + kargs.rmfork_opt = RMFORK_OPT_RESTORE_ONE; + kargs.pid = (unsigned long)pid; + kargs.va = 0; + kargs.pa = meta_off; + kargs.iovs = (unsigned long)iovs; + kargs.nr_iovs = (unsigned long)nr; + + ret = syscall(__NR_remote_fork, &kargs); + if (ret < 0) { + pr_perror("rmfork: restore syscall failed for pid=%d", pid); + return -1; + } + + pr_info("rmfork: restore done for pid=%d (meta_off=0x%lx)\n", + pid, meta_off); + return 0; +} -- 2.53.0
In the rmfork restore path, replace the normal preadv-based page restore from pages.img with a sys_remote_fork(RESTORE_ONE) call that tells the kernel to copy pages from its ubmem (persistent memory) back into the process address space. VDSO/VVAR remap: instead of the normal vdso_proxify, mmap anonymous pages at the original VDSO/VVAR addresses, memcpy the parked contents, then mprotect to the original permissions (RX for VDSO, R for VVAR). This avoids mremap(MREMAP_FIXED) which conflicts with the kernel rmfork path. Skip the mprotect downgrade walk, AIO ring restore, and madvise calls when running under rmfork, since kernel-side restore handles all pages at once through the syscall. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- criu/pie/restorer.c | 204 +++++++++++++++++++++++++++++++++----------- 1 file changed, 155 insertions(+), 49 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9867a3ddd..5eddf2ed6 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -362,7 +362,6 @@ skip_xids: } } - if (lsm_type != LSMTYPE__SELINUX) { /* * SELinux does not support setting the process context for @@ -629,6 +628,7 @@ static int restore_robust_futex(struct thread_restore_args *args) } static int restore_thread_common(struct thread_restore_args *args) + { sys_set_tid_address((int *)decode_pointer(args->clear_tid_addr)); @@ -1861,53 +1861,85 @@ __visible long __export_restore_task(struct task_restore_args *args) * Now read the contents (if any) */ - rio = args->vma_ios; - for (i = 0; i < args->vma_ios_n; i++) { - struct iovec *iovs = rio->iovs; - int nr = rio->nr_iovs; - ssize_t r; - - while (nr) { - pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); - /* - * If we're requested to punch holes in the file after reading we do - * it to save memory. Limit the reads then to an arbitrary block size. - */ - r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, - args->auto_dedup ? 
AUTO_DEDUP_OVERHEAD_BYTES : 0); - if (r < 0) { - pr_err("Can't read pages data (%d)\n", (int)r); - goto core_restore_end; - } + if (args->enable_rmfork) { + /* + * RMFork: restore memory from kernel ubmem instead of pages.img. + */ + struct rmfork_kargs { + unsigned long rmfork_opt; + unsigned long pid; + unsigned long va; + unsigned long pa; + unsigned long iovs; + unsigned long nr_iovs; + }; + struct rmfork_kargs kargs; + long r; + + kargs.rmfork_opt = 1; /* RMFORK_OPT_RESTORE_ONE */ + kargs.pid = sys_getpid(); + kargs.va = 0; + kargs.pa = args->rmfork_meta_off; + kargs.iovs = 0; + kargs.nr_iovs = 0; + + pr_info("rmfork: restoring pages from kernel ubmem (pid=%ld, meta=0x%lx)\n", + kargs.pid, kargs.pa); + + r = sys_remote_fork(&kargs); + if (r < 0) { + pr_err("rmfork: kernel restore failed: %ld\n", r); + goto core_restore_end; + } + } else { + rio = args->vma_ios; + for (i = 0; i < args->vma_ios_n; i++) { + struct iovec *iovs = rio->iovs; + int nr = rio->nr_iovs; + ssize_t r; + + while (nr) { + pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); + /* + * If we're requested to punch holes in the file after reading we do + * it to save memory. Limit the reads then to an arbitrary block size. + */ + r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, + args->auto_dedup ? AUTO_DEDUP_OVERHEAD_BYTES : 0); + if (r < 0) { + pr_err("Can't read pages data (%d)\n", (int)r); + goto core_restore_end; + } - pr_debug("`- returned %ld\n", (long)r); - /* If the file is open for writing, then it means we should punch holes - * in it. */ - if (r > 0 && args->auto_dedup) { - int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, - rio->off, r); - if (fr < 0) { - pr_debug("Failed to punch holes with fallocate: %d\n", fr); + pr_debug("`- returned %ld\n", (long)r); + /* If the file is open for writing, then it means we should punch holes + * in it. 
*/ + if (r > 0 && args->auto_dedup) { + int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + rio->off, r); + if (fr < 0) { + pr_debug("Failed to punch holes with fallocate: %d\n", fr); + } } + rio->off += r; + /* Advance the iovecs */ + do { + if (iovs->iov_len <= r) { + pr_debug(" `- skip pagemap\n"); + r -= iovs->iov_len; + iovs++; + nr--; + continue; + } + + iovs->iov_base += r; + iovs->iov_len -= r; + break; + } while (nr > 0); } - rio->off += r; - /* Advance the iovecs */ - do { - if (iovs->iov_len <= r) { - pr_debug(" `- skip pagemap\n"); - r -= iovs->iov_len; - iovs++; - nr--; - continue; - } - iovs->iov_base += r; - iovs->iov_len -= r; - break; - } while (nr > 0); + rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs); } - - rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs); } if (args->vma_ios_fd != -1) @@ -1916,19 +1948,91 @@ __visible long __export_restore_task(struct task_restore_args *args) /* * Proxify vDSO. */ - if (vdso_proxify(&args->vdso_maps_rt, &has_vdso_proxy, args->vmas, args->vmas_n, args->compatible_mode, - fault_injected(FI_VDSO_TRAMPOLINES))) + if (args->enable_rmfork && vdso_is_present(&args->vdso_maps_rt)) { + unsigned long orig_vdso = 0, orig_vvar = 0; + unsigned long vdso_sz = args->vdso_maps_rt.sym.vdso_size; + unsigned long vvar_sz = args->vdso_maps_rt.sym.vvar_size; + VmaEntry *vdso_vma = NULL, *vvar_vma = NULL; + int vdso_ret; + long mmap_ret; + + /* Find original VDSO/VVAR addresses from VMA entries */ + for (i = 0; i < args->vmas_n; i++) { + vma_entry = args->vmas + i; + if (vma_entry_is(vma_entry, VMA_AREA_VDSO)) + vdso_vma = vma_entry; + if (vma_entry_is(vma_entry, VMA_AREA_VVAR)) + vvar_vma = vma_entry; + } + + vdso_ret = 0; + + /* Find VDSO VMA entry for original address */ + if (vdso_vma) { + orig_vdso = (unsigned long)decode_pointer(vdso_vma->start); + + mmap_ret = sys_mmap((void *)orig_vdso, vdso_sz, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, + 0, 0); + if 
(mmap_ret < 0) { + pr_err("rmfork: mmap VDSO at %lx failed (%ld)\n", + orig_vdso, mmap_ret); + vdso_ret = -1; + } else { + memcpy((void *)orig_vdso, + (void *)args->vdso_maps_rt.vdso_start, + vdso_sz); + args->vdso_maps_rt.vdso_start = orig_vdso; + sys_mprotect((void *)orig_vdso, vdso_sz, PROT_READ | PROT_EXEC); + } + } + + /* Remap VVAR from parked location to original address */ + if (vvar_vma && vdso_ret == 0 && + args->vdso_maps_rt.vvar_start != VVAR_BAD_ADDR && + vvar_sz != VVAR_BAD_SIZE) { + orig_vvar = (unsigned long)decode_pointer(vvar_vma->start); + + mmap_ret = sys_mmap((void *)orig_vvar, vvar_sz, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, + 0, 0); + if (mmap_ret < 0) { + pr_err("rmfork: mmap VVAR at %lx failed (%ld)\n", + orig_vvar, mmap_ret); + } else { + memcpy((void *)orig_vvar, + (void *)args->vdso_maps_rt.vvar_start, + vvar_sz); + args->vdso_maps_rt.vvar_start = orig_vvar; + sys_mprotect((void *)orig_vvar, vvar_sz, PROT_READ); + } + } + + vdso_update_gtod_addr(&args->vdso_maps_rt); + + if (vdso_ret) { + pr_err("rmfork: VDSO remap failed, continuing without\n"); + vdso_rt_size = 0; + } + } else if (vdso_proxify(&args->vdso_maps_rt, &has_vdso_proxy, args->vmas, args->vmas_n, args->compatible_mode, + fault_injected(FI_VDSO_TRAMPOLINES))) goto core_restore_end; - /* unmap rt-vdso with restorer blob after restore's finished */ + /* unmap rt-vdso with restorer blob after restore */ if (!has_vdso_proxy) vdso_rt_size = 0; + if (args->enable_rmfork) { + /* skip AIO, madvise, mprotect walk for RMFork */ + goto skip_restore_middle; + } /* * Walk though all VMAs again to drop PROT_WRITE * if it was not there. 
*/ - for (i = 0; i < args->vmas_n; i++) { + if (!args->enable_rmfork) for (i = 0; i < args->vmas_n; i++) { vma_entry = args->vmas + i; if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR))) @@ -1971,6 +2075,7 @@ __visible long __export_restore_task(struct task_restore_args *args) } } } + skip_restore_middle: /* * Tune up the task fields. @@ -2064,9 +2169,9 @@ __visible long __export_restore_task(struct task_restore_args *args) */ rt_sigframe = (void *)&args->t->mz->rt_sigframe; + pr_info("rmfork: before restore_thread_common\n"); if (restore_thread_common(args->t)) goto core_restore_end; - /* * Threads restoration. This requires some more comments. This * restorer routine and thread restorer routine has the following @@ -2290,7 +2395,8 @@ __visible long __export_restore_task(struct task_restore_args *args) * pure assembly since we don't need any additional * code insns from gcc. */ - rst_sigreturn(new_sp, rt_sigframe); + pr_info("rmfork: before rst_sigreturn\n"); + rst_sigreturn(new_sp, rt_sigframe); core_restore_end: futex_abort_and_wake(&task_entries_local->nr_in_progress); -- 2.53.0
- cr-dump.c: call rmfork_dump_mm_iov for kernel-side page dump - cr-restore.c: pass rmfork_meta_off to restorer task args - config.c: fix BOOL_OPT cast for skip_pages/enable_rmfork - restorer.h: add enable_rmfork and rmfork_meta_off fields - mem.c: skip page restore in prepare_mappings for rmfork - rseq.h: fix double-include guard - tty.c: const-correctness fix for strrchr result Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- criu/config.c | 4 ++-- criu/cr-dump.c | 10 ++++++---- criu/cr-restore.c | 11 ++++++----- criu/include/linux/rseq.h | 2 +- criu/include/restorer.h | 3 +++ criu/mem.c | 5 +++++ criu/tty.c | 2 +- 7 files changed, 24 insertions(+), 13 deletions(-) diff --git a/criu/config.c b/criu/config.c index 007ee127b..fa428e0d9 100644 --- a/criu/config.c +++ b/criu/config.c @@ -671,8 +671,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "external", required_argument, 0, 1073 }, { "empty-ns", required_argument, 0, 1074 }, { "lazy-pages", no_argument, 0, 1076 }, - BOOL_OPT("skip-pages", &opts.skip_pages), - BOOL_OPT("enable-rmfork", &opts.enable_rmfork), + BOOL_OPT("skip-pages", (int *)&opts.skip_pages), + BOOL_OPT("enable-rmfork", (int *)&opts.enable_rmfork), BOOL_OPT("extra", &opts.check_extra_features), BOOL_OPT("experimental", &opts.check_experimental_features), { "all", no_argument, 0, 1079 }, diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 57cb129aa..4be2a9115 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1696,10 +1696,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) if (opts.enable_rmfork) { pr_info("rmfork: dumping pages via kernel for pid=%d\n", pid); - ret = rmfork_dump_mm_iov(pid, vpid(item), NULL, 0); - if (ret < 0) { - pr_err("rmfork: kernel dump failed for pid=%d\n", pid); - goto err_cure; + { + long rm_ret = rmfork_dump_mm_iov(pid, vpid(item), NULL, 0); + if (rm_ret < 0) { + pr_err("rmfork: kernel dump failed for pid=%d\n", pid); + goto err_cure; + } } 
} else { ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 691595ff7..72046efa7 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -707,13 +707,14 @@ static int restore_one_alive_task(int pid, CoreEntry *core) return -1; /* - * RMFork: restore memory from kernel ubmem instead of pages.img. - * Must be called after VMAs are set up, before normal pages restore. + * RMFork: pass meta offset to restorer blob so it can call + * the kernel rmfork syscall instead of reading pages.img. */ if (opts.enable_rmfork) { - pid_t my_pid = getpid(); - if (rmfork_restore_iov(my_pid, 0, NULL, 0) < 0) { - pr_err("rmfork: kernel restore failed for pid=%d\n", my_pid); + ta->enable_rmfork = true; + ta->rmfork_meta_off = rmfork_get_meta_off(); + if (ta->rmfork_meta_off == 0) { + pr_err("rmfork: no meta offset available for restore\n"); return -1; } } diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h index 5ceefbf8e..668967f98 100644 --- a/criu/include/linux/rseq.h +++ b/criu/include/linux/rseq.h @@ -14,7 +14,7 @@ #include "common/config.h" -#ifdef CONFIG_HAS_NO_LIBC_RSEQ_DEFS +#if defined(CONFIG_HAS_NO_LIBC_RSEQ_DEFS) && !defined(_LINUX_RSEQ_H) /* * linux/rseq.h * diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 56bea0fcc..a6290724a 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -168,6 +168,9 @@ struct task_restore_args { struct restore_vma_io *vma_ios; unsigned int vma_ios_n; + bool enable_rmfork; + unsigned long rmfork_meta_off; + struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; bool posix_timer_cr_ids; diff --git a/criu/mem.c b/criu/mem.c index b06a0d850..07425b501 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1352,6 +1352,10 @@ int prepare_mappings(struct pstree_item *t) rsti(t)->premmapped_addr = addr; rsti(t)->premmapped_len = vmas->rst_priv_size; + /* RMFork: memory restored by kernel via rmfork syscall */ + if 
(opts.enable_rmfork) + goto skip_page_restore; + ret = open_page_read(vpid(t), &pr, PR_TASK); if (ret <= 0) return -1; @@ -1393,6 +1397,7 @@ int prepare_mappings(struct pstree_item *t) pr_info("Shrunk premap area to %p(%lx)\n", rsti(t)->premmapped_addr, rsti(t)->premmapped_len); } +skip_page_restore: out: return ret; } diff --git a/criu/tty.c b/criu/tty.c index ae23094b7..9a4520d53 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -259,7 +259,7 @@ static int pts_fd_get_index(int fd, const struct fd_parms *p) { int index; const struct fd_link *link = p->link; - char *pos = strrchr(link->name, '/'); + const char *pos = strrchr(link->name, '/'); if (!pos || pos == (link->name + link->len - 1)) { pr_err("Unexpected format on path %s\n", link->name + 1); -- 2.53.0
participants (1)
-
Qi Xi