In the rmfork restore path, replace the normal preadv-based page restore from pages.img with a sys_remote_fork(RESTORE_ONE) call that tells the kernel to copy pages from its ubmem (persistent memory) back into the process address space. VDSO/VVAR remap: instead of the normal vdso_proxify, mmap anonymous pages at the original VDSO/VVAR addresses, memcpy the parked contents, then mprotect to the original permissions (RX for VDSO, R for VVAR). This avoids mremap(MREMAP_FIXED) which conflicts with the kernel rmfork path. Skip the mprotect downgrade walk, AIO ring restore, and madvise calls when running under rmfork, since kernel-side restore handles all pages at once through the syscall. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- criu/pie/restorer.c | 204 +++++++++++++++++++++++++++++++++----------- 1 file changed, 155 insertions(+), 49 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9867a3ddd..5eddf2ed6 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -362,7 +362,6 @@ skip_xids: } } - if (lsm_type != LSMTYPE__SELINUX) { /* * SELinux does not support setting the process context for @@ -629,6 +628,7 @@ static int restore_robust_futex(struct thread_restore_args *args) } static int restore_thread_common(struct thread_restore_args *args) + { sys_set_tid_address((int *)decode_pointer(args->clear_tid_addr)); @@ -1861,53 +1861,85 @@ __visible long __export_restore_task(struct task_restore_args *args) * Now read the contents (if any) */ - rio = args->vma_ios; - for (i = 0; i < args->vma_ios_n; i++) { - struct iovec *iovs = rio->iovs; - int nr = rio->nr_iovs; - ssize_t r; - - while (nr) { - pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); - /* - * If we're requested to punch holes in the file after reading we do - * it to save memory. Limit the reads then to an arbitrary block size. - */ - r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, - args->auto_dedup ? AUTO_DEDUP_OVERHEAD_BYTES : 0); - if (r < 0) { - pr_err("Can't read pages data (%d)\n", (int)r); - goto core_restore_end; - } + if (args->enable_rmfork) { + /* + * RMFork: restore memory from kernel ubmem instead of pages.img. + */ + struct rmfork_kargs { + unsigned long rmfork_opt; + unsigned long pid; + unsigned long va; + unsigned long pa; + unsigned long iovs; + unsigned long nr_iovs; + }; + struct rmfork_kargs kargs; + long r; + + kargs.rmfork_opt = 1; /* RMFORK_OPT_RESTORE_ONE */ + kargs.pid = sys_getpid(); + kargs.va = 0; + kargs.pa = args->rmfork_meta_off; + kargs.iovs = 0; + kargs.nr_iovs = 0; + + pr_info("rmfork: restoring pages from kernel ubmem (pid=%ld, meta=0x%lx)\n", + kargs.pid, kargs.pa); + + r = sys_remote_fork(&kargs); + if (r < 0) { + pr_err("rmfork: kernel restore failed: %ld\n", r); + goto core_restore_end; + } + } else { + rio = args->vma_ios; + for (i = 0; i < args->vma_ios_n; i++) { + struct iovec *iovs = rio->iovs; + int nr = rio->nr_iovs; + ssize_t r; + + while (nr) { + pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); + /* + * If we're requested to punch holes in the file after reading we do + * it to save memory. Limit the reads then to an arbitrary block size. + */ + r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, + args->auto_dedup ? AUTO_DEDUP_OVERHEAD_BYTES : 0); + if (r < 0) { + pr_err("Can't read pages data (%d)\n", (int)r); + goto core_restore_end; + } - pr_debug("`- returned %ld\n", (long)r); - /* If the file is open for writing, then it means we should punch holes - * in it. */ - if (r > 0 && args->auto_dedup) { - int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, - rio->off, r); - if (fr < 0) { - pr_debug("Failed to punch holes with fallocate: %d\n", fr); + pr_debug("`- returned %ld\n", (long)r); + /* If the file is open for writing, then it means we should punch holes + * in it. */ + if (r > 0 && args->auto_dedup) { + int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + rio->off, r); + if (fr < 0) { + pr_debug("Failed to punch holes with fallocate: %d\n", fr); + } } + rio->off += r; + /* Advance the iovecs */ + do { + if (iovs->iov_len <= r) { + pr_debug(" `- skip pagemap\n"); + r -= iovs->iov_len; + iovs++; + nr--; + continue; + } + + iovs->iov_base += r; + iovs->iov_len -= r; + break; + } while (nr > 0); } - rio->off += r; - /* Advance the iovecs */ - do { - if (iovs->iov_len <= r) { - pr_debug(" `- skip pagemap\n"); - r -= iovs->iov_len; - iovs++; - nr--; - continue; - } - iovs->iov_base += r; - iovs->iov_len -= r; - break; - } while (nr > 0); + rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs); } - - rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs); } if (args->vma_ios_fd != -1) @@ -1916,19 +1948,91 @@ __visible long __export_restore_task(struct task_restore_args *args) /* * Proxify vDSO. */ - if (vdso_proxify(&args->vdso_maps_rt, &has_vdso_proxy, args->vmas, args->vmas_n, args->compatible_mode, - fault_injected(FI_VDSO_TRAMPOLINES))) + if (args->enable_rmfork && vdso_is_present(&args->vdso_maps_rt)) { + unsigned long orig_vdso = 0, orig_vvar = 0; + unsigned long vdso_sz = args->vdso_maps_rt.sym.vdso_size; + unsigned long vvar_sz = args->vdso_maps_rt.sym.vvar_size; + VmaEntry *vdso_vma = NULL, *vvar_vma = NULL; + int vdso_ret; + long mmap_ret; + + /* Find original VDSO/VVAR addresses from VMA entries */ + for (i = 0; i < args->vmas_n; i++) { + vma_entry = args->vmas + i; + if (vma_entry_is(vma_entry, VMA_AREA_VDSO)) + vdso_vma = vma_entry; + if (vma_entry_is(vma_entry, VMA_AREA_VVAR)) + vvar_vma = vma_entry; + } + + vdso_ret = 0; + + /* Find VDSO VMA entry for original address */ + if (vdso_vma) { + orig_vdso = (unsigned long)decode_pointer(vdso_vma->start); + + mmap_ret = sys_mmap((void *)orig_vdso, vdso_sz, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, + 0, 0); + if (mmap_ret < 0) { + pr_err("rmfork: mmap VDSO at %lx failed (%ld)\n", + orig_vdso, mmap_ret); + vdso_ret = -1; + } else { + memcpy((void *)orig_vdso, + (void *)args->vdso_maps_rt.vdso_start, + vdso_sz); + args->vdso_maps_rt.vdso_start = orig_vdso; + sys_mprotect((void *)orig_vdso, vdso_sz, PROT_READ | PROT_EXEC); + } + } + + /* Remap VVAR from parked location to original address */ + if (vvar_vma && vdso_ret == 0 && + args->vdso_maps_rt.vvar_start != VVAR_BAD_ADDR && + vvar_sz != VVAR_BAD_SIZE) { + orig_vvar = (unsigned long)decode_pointer(vvar_vma->start); + + mmap_ret = sys_mmap((void *)orig_vvar, vvar_sz, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, + 0, 0); + if (mmap_ret < 0) { + pr_err("rmfork: mmap VVAR at %lx failed (%ld)\n", + orig_vvar, mmap_ret); + } else { + memcpy((void *)orig_vvar, + (void *)args->vdso_maps_rt.vvar_start, + vvar_sz); + args->vdso_maps_rt.vvar_start = orig_vvar; + sys_mprotect((void *)orig_vvar, vvar_sz, PROT_READ); + } + } + + vdso_update_gtod_addr(&args->vdso_maps_rt); + + if (vdso_ret) { + pr_err("rmfork: VDSO remap failed, continuing without\n"); + vdso_rt_size = 0; + } + } else if (vdso_proxify(&args->vdso_maps_rt, &has_vdso_proxy, args->vmas, args->vmas_n, args->compatible_mode, + fault_injected(FI_VDSO_TRAMPOLINES))) goto core_restore_end; - /* unmap rt-vdso with restorer blob after restore's finished */ + /* unmap rt-vdso with restorer blob after restore */ if (!has_vdso_proxy) vdso_rt_size = 0; + if (args->enable_rmfork) { + /* skip AIO, madvise, mprotect walk for RMFork */ + goto skip_restore_middle; + } /* * Walk though all VMAs again to drop PROT_WRITE * if it was not there. */ - for (i = 0; i < args->vmas_n; i++) { + if (!args->enable_rmfork) for (i = 0; i < args->vmas_n; i++) { vma_entry = args->vmas + i; if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR))) @@ -1971,6 +2075,7 @@ __visible long __export_restore_task(struct task_restore_args *args) } } } + skip_restore_middle: /* * Tune up the task fields. @@ -2064,9 +2169,9 @@ __visible long __export_restore_task(struct task_restore_args *args) */ rt_sigframe = (void *)&args->t->mz->rt_sigframe; + pr_info("rmfork: before restore_thread_common\n"); if (restore_thread_common(args->t)) goto core_restore_end; - /* * Threads restoration. This requires some more comments. This * restorer routine and thread restorer routine has the following @@ -2290,7 +2395,8 @@ __visible long __export_restore_task(struct task_restore_args *args) * pure assembly since we don't need any additional * code insns from gcc. */ - rst_sigreturn(new_sp, rt_sigframe); + pr_info("rmfork: before rst_sigreturn\n"); + rst_sigreturn(new_sp, rt_sigframe); core_restore_end: futex_abort_and_wake(&task_entries_local->nr_in_progress); -- 2.53.0