Replace the old page restoration approach, which used vm_insert_page() to
insert ubmem pages directly into the process page table. The new approach
copies page data from ubmem into the process address space with
copy_to_user(), which triggers normal page faults. This is simpler and
more reliable.

Changes:
- Remove rmfork_handle_one_fault, rmfork_restore_vma, rmfork_restore_iov
- Add rmfork_restore_pages with two modes:
  * IOV-based: only touch pages in the specified iovec ranges
  * All-pages: iterate all saved page table entries; selected when the
    iovec pointer is NULL or nr is 0 (the old rmfork_restore_iov returned
    -EINVAL in that case)
- Pages that fault during the copy (copy_to_user() returns non-zero, e.g.,
  guard pages with prot=0) are skipped and counted, not treated as errors
- Fix typo: rfork_task[i].vpid → rmfork_task[i].vpid

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
--- kernel/rmfork.c | 135 +++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 75 deletions(-) diff --git a/kernel/rmfork.c b/kernel/rmfork.c index d9bf6eae1520..3ac48106652b 100644 --- a/kernel/rmfork.c +++ b/kernel/rmfork.c @@ -94,7 +94,7 @@ static unsigned long rfork_ubmem_off; /* current allocation offset */ static struct rmfork_pstree *rmfork_pstree; static unsigned long rfork_meta_off; - +/* Whether we are on the same machine as dump (simplifies pfn handling) */ /* ------------------------------------------------------------------ */ /* Forward declarations */ /* ------------------------------------------------------------------ */ @@ -104,6 +104,8 @@ static int rmfork_restore_one(struct rmfork_kargs *kargs); static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree, unsigned long vpid, struct iovec __user *iovs, unsigned long nr); +static int rmfork_restore_pages(struct rmfork_task *task, + struct iovec __user *iovs, unsigned long nr); /* ------------------------------------------------------------------ */ /* UBMEM (pmem) allocator */ @@ -586,94 +588,77 @@ static unsigned long rmfork_task_find_pa_bsearch(struct rmfork_task *task, return 0; } -static int rmfork_handle_one_fault(struct vm_area_struct 
*vma, unsigned long addr, - unsigned long size, unsigned long pfn, - int remap, int thp) +/* + * Restore pages using copy_to_user from the kernel ubmem. + * This triggers normal page faults that allocate the correct page type + * (anonymous for anonymous VMAs, COW for private file-backed), + * matching what the normal CRIU preadv restore does. + * + * No mmap lock is needed — the restorer is single-threaded so the + * address space is stable. The page fault handler takes mmap_read_lock + * internally. Pages whose VMA was unmapped in the meantime simply + * get -EFAULT and are skipped. + */ +static int rmfork_restore_pages(struct rmfork_task *task, + struct iovec __user *iovs, unsigned long nr) { - struct page *page; - unsigned long pa; + unsigned long i, find_time = 0, skip_time = 0; + int ret = 0; - pa = rfork_ubmem_phys + (pfn << PAGE_SHIFT); - page = pfn_to_page(pa >> PAGE_SHIFT); - if (!page) - return -EINVAL; + if (iovs && nr > 0) { + /* IOV-based restore — only touch ranges in the list */ + struct iovec *kiovs; - return vm_insert_page(vma, addr, page); -} + kiovs = memdup_user(iovs, nr * sizeof(struct iovec)); + if (IS_ERR(kiovs)) + return PTR_ERR(kiovs); -static int rmfork_restore_vma(struct mm_struct *mm, struct rmfork_task *task, - unsigned long iov_base, unsigned long iov_off, - unsigned long len, unsigned long *saved_idx, - unsigned long *find_time) -{ - unsigned long va, pfn; - unsigned long pos; - struct vm_area_struct *vma; - int ret; + for (i = 0; i < nr; i++) { + unsigned long base = (unsigned long)kiovs[i].iov_base; + unsigned long len = kiovs[i].iov_len; + unsigned long addr; - if (len == 0) - return 0; + for (addr = base; addr < base + len; addr += PAGE_SIZE) { + unsigned long pos; + unsigned long pfn; + void *src; - for (va = iov_base; va < iov_base + len; va += PAGE_SIZE) { - pfn = rmfork_task_find_pa_bsearch(task, va, &pos); - if (pfn == 0) - continue; + pfn = rmfork_task_find_pa_bsearch(task, addr, &pos); + if (pfn == 0) + continue; - vma = 
find_vma(mm, va); - if (!vma) { - pr_warn("rmfork: no vma for va=0x%lx\n", va); - continue; + src = (void *)rmfork_ubmem_base + (pfn << PAGE_SHIFT); + + /* copy_to_user triggers page fault if needed */ + if (copy_to_user((void __user *)addr, src, PAGE_SIZE)) { + skip_time++; + continue; + } + find_time++; + } } + kfree(kiovs); + } else { + /* Restore all saved pages */ + for (i = 0; i < task->pt_cnt; i++) { + unsigned long va = task->pts[i].va << PAGE_SHIFT; + void *src; - *saved_idx = pos; - (*find_time)++; + src = (void *)rmfork_ubmem_base + (task->pts[i].pfn << PAGE_SHIFT); - ret = rmfork_handle_one_fault(vma, va, PAGE_SIZE, pfn, 0, 0); - if (ret) { - pr_err("rmfork: handle_one_fault failed at va=0x%lx: %d\n", - va, ret); - return ret; + if (copy_to_user((void __user *)va, src, PAGE_SIZE)) { + skip_time++; + continue; + } + find_time++; } } + pr_info("rmfork: restored %lu pages (skipped %lu) for task pid=%d\n", + find_time, skip_time, task->pid); return 0; } -static int rmfork_restore_iov(struct mm_struct *mm, struct rmfork_task *task, - struct iovec __user *iovs, unsigned long nr) -{ - struct iovec *kiovs; - unsigned long saved_idx = 0, find_time = 0; - int ret; - unsigned long i; - - if (!iovs || nr == 0) - return -EINVAL; - - kiovs = memdup_user(iovs, nr * sizeof(struct iovec)); - if (IS_ERR(kiovs)) - return PTR_ERR(kiovs); - - for (i = 0; i < nr; i++) { - if (!kiovs[i].iov_base || !kiovs[i].iov_len) - continue; - - ret = rmfork_restore_vma(mm, task, - (unsigned long)kiovs[i].iov_base, 0, - kiovs[i].iov_len, &saved_idx, &find_time); - if (ret) - goto out; - } - - pr_info("rmfork: restored %lu pages for task pid=%d\n", - find_time, task->pid); - ret = 0; - -out: - kfree(kiovs); - return ret; -} - static int rmfork_restore_one(struct rmfork_kargs *kargs) { struct mm_struct *mm; @@ -774,9 +759,9 @@ static int rmfork_restore_one(struct rmfork_kargs *kargs) for (i = 0; i < rmfork_pstree->tsk_cnt; i++) { if (rmfork_task[i].vpid == (pid_t)kargs->pid) { 
pr_info("rmfork: restoring pid=%d (vpid=%d), pt_cnt=%lu\n", - rmfork_task[i].pid, rfork_task[i].vpid, + rmfork_task[i].pid, rmfork_task[i].vpid, rmfork_task[i].pt_cnt); - ret = rmfork_restore_iov(mm, &rmfork_task[i], + ret = rmfork_restore_pages(&rmfork_task[i], (struct iovec __user *)kargs->iovs, kargs->nr_iovs); break; -- 2.53.0