Qi Xi (4): rmfork: add kernel-side memory checkpoint/restore for CRIU rmfork arm64: add pmemmem kernel parameter for persistent memory rmfork: refactor ubmem allocator, add sysfs reset, expand limits rmfork: switch page restore from vm_insert_page to copy_to_user Kconfig | 1 + arch/arm64/include/uapi/asm/unistd.h | 6 + arch/arm64/kernel/setup.c | 5 + arch/arm64/mm/init.c | 75 +++ include/linux/ioport.h | 1 + include/linux/mm.h | 4 + include/uapi/linux/rmfork.h | 26 + kernel/Kconfig.rmfork | 8 + kernel/Makefile | 1 + kernel/rmfork.c | 830 +++++++++++++++++++++++++++ kernel/sys_ni.c | 3 + lib/Kconfig | 6 + 12 files changed, 966 insertions(+) create mode 100644 include/uapi/linux/rmfork.h create mode 100644 kernel/Kconfig.rmfork create mode 100644 kernel/rmfork.c -- 2.53.0
Introduce RMFork (Remote Memory Fork) — a kernel mechanism that allows CRIU to save and restore process memory via kernel-reserved pmem (ubmem), bypassing the need for userspace pages.img. Core changes: - Add syscall 454 (remote_fork) for ARM64 - Create kernel/rmfork.c with dump and restore handlers - Reserve 32MB of pmem at boot via memblock + memremap as ubmem - Dump path (opt=3): walk page table, copy pages to ubmem, record VA->PFN - Restore path (opt=1): insert saved pages into target process's VMAs - Add UAPI header (include/uapi/linux/rmfork.h) for struct rmfork_kargs - Add Kconfig option CONFIG_RMFORK, default y on ARM64 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- Kconfig | 1 + arch/arm64/include/uapi/asm/unistd.h | 6 + include/uapi/linux/rmfork.h | 26 + kernel/Kconfig.rmfork | 8 + kernel/Makefile | 1 + kernel/rmfork.c | 768 +++++++++++++++++++++++++++ kernel/sys_ni.c | 3 + 7 files changed, 813 insertions(+) create mode 100644 include/uapi/linux/rmfork.h create mode 100644 kernel/Kconfig.rmfork create mode 100644 kernel/rmfork.c diff --git a/Kconfig b/Kconfig index 4a96e16e5f31..33b9d4a3d858 100644 --- a/Kconfig +++ b/Kconfig @@ -10,6 +10,7 @@ source "scripts/Kconfig.include" source "init/Kconfig" source "kernel/Kconfig.freezer" +source "kernel/Kconfig.rmfork" source "fs/Kconfig.binfmt" diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h index 079139c04b14..2456d590c123 100644 --- a/arch/arm64/include/uapi/asm/unistd.h +++ b/arch/arm64/include/uapi/asm/unistd.h @@ -36,3 +36,9 @@ #define __ARCH_WANT_MEMFD_SECRET #include <asm-generic/unistd.h> + +/* + * ARM64-specific syscalls + */ +#define __NR_remote_fork 454 +__SYSCALL(__NR_remote_fork, sys_remote_fork) diff --git a/include/uapi/linux/rmfork.h b/include/uapi/linux/rmfork.h new file mode 100644 index 000000000000..48ff75cbb559 --- /dev/null +++ b/include/uapi/linux/rmfork.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 
+#ifndef _UAPI_LINUX_RMFORK_H +#define _UAPI_LINUX_RMFORK_H + +#include <linux/types.h> + +/* + * rmfork_opt values for sys_remote_fork + */ +#define RMFORK_OPT_RESTORE_ONE 1 /* restore one task's mm */ +#define RMFORK_OPT_RESTORE_ALL 2 /* restore all tasks */ +#define RMFORK_OPT_DUMP 3 /* dump one task's mm */ + +/* + * Arguments passed to sys_remote_fork via pointer + */ +struct rmfork_kargs { + unsigned long rmfork_opt; /* dump=3, restore=1/2 */ + unsigned long pid; /* target pid */ + unsigned long va; /* virtual address base */ + unsigned long pa; /* meta offset in ubmem (for restore) */ + unsigned long iovs; /* pointer to iovec array (user-space) */ + unsigned long nr_iovs; /* number of iov entries */ +}; + +#endif /* _UAPI_LINUX_RMFORK_H */ diff --git a/kernel/Kconfig.rmfork b/kernel/Kconfig.rmfork new file mode 100644 index 000000000000..ce120a515657 --- /dev/null +++ b/kernel/Kconfig.rmfork @@ -0,0 +1,8 @@ +config RMFORK + bool "RMFork support for CRIU kernel-side checkpoint/restore" + depends on ARM64 + default y + help + Enable RMFork (Remote Memory Fork) for CRIU integration. This allows + CRIU to dump and restore process memory via kernel-reserved pmem, + bypassing the need for userspace pages.img. 
diff --git a/kernel/Makefile b/kernel/Makefile index fe3559ee90d9..67848a48673e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -127,6 +127,7 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o +obj-$(CONFIG_RMFORK) += rmfork.o obj-$(CONFIG_RSEQ) += rseq.o obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o diff --git a/kernel/rmfork.c b/kernel/rmfork.c new file mode 100644 index 000000000000..675d712b3d7d --- /dev/null +++ b/kernel/rmfork.c @@ -0,0 +1,768 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RMFork - Remote Memory Fork (CRIU + kernel checkpoint/restore) + * + * Copyright (C) 2026 + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/syscalls.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/mm.h> +#include <linux/mm_types.h> +#include <linux/memblock.h> +#include <linux/io.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/fs.h> +#include <linux/fdtable.h> +#include <linux/string.h> +#include <linux/pid.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/uio.h> +#include <linux/ptrace.h> + +#include <uapi/linux/rmfork.h> + +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +/* ------------------------------------------------------------------ */ +/* Config */ +/* ------------------------------------------------------------------ */ +#define RMFORK_MAGIC 0x1234abcdUL +#define RMFORK_UBMEM_START 0x840000000ULL +#define RMFORK_UBMEM_SIZE SZ_32M +#define RMFORK_PTS_MAX 4096 +#define RMFORK_TASK_MAX 512 + +/* ------------------------------------------------------------------ */ +/* Data structures (must match spec) */ +/* 
------------------------------------------------------------------ */ +struct rmfork_pt_entry { + unsigned long va; /* VA >> PAGE_SHIFT */ + unsigned long pfn; /* ubmem offset index after dump */ + unsigned long pfn_saved; /* original PFN (for dedup) */ +}; + +struct rmfork_task { + pid_t pid; /* real pid */ + pid_t vpid; /* virtual pid (from criu) */ + unsigned long pt_cnt; /* number of pages */ + struct rmfork_pt_entry *pts; /* VA->PFN mapping array */ + unsigned long pts_off; /* pts offset in ubmem */ + void *data_ptr; /* page data buffer in ubmem */ + unsigned long data_ptr_off; /* data_ptr offset in ubmem */ +}; + +struct rmfork_pstree { + unsigned long tsk_cnt; /* number of tasks */ + struct rmfork_task *tasks; /* tasks array */ + unsigned long tasks_off; /* tasks offset in ubmem */ + pid_t criu_pid; /* which criu pid did the dump */ +}; + +struct rmfork_metadata { + unsigned long magic; /* = RMFORK_MAGIC */ + struct rmfork_pstree *pstrees; /* pointer to pstrees */ + unsigned long pstrees_pa; /* pstrees offset in ubmem (pa as off) */ + int tsk_cnt; /* number of tasks */ +} __packed; + +/* ------------------------------------------------------------------ */ +/* Global state */ +/* ------------------------------------------------------------------ */ +static DEFINE_MUTEX(rmfork_mutex); + +static void *rmfork_ubmem_base; /* VA of reserved ubmem region */ +static unsigned long rfork_ubmem_phys; /* PA of reserved ubmem region */ +static unsigned long rfork_ubmem_size; /* size */ +static unsigned long rfork_ubmem_off; /* current allocation offset */ + +/* The live pstree used during restore */ +static struct rmfork_pstree *rmfork_pstree; +static unsigned long rfork_meta_off; + +/* Whether we are on the same machine as dump (simplifies pfn handling) */ +static bool rmfork_on_single_machine = true; + +/* ------------------------------------------------------------------ */ +/* Forward declarations */ +/* 
------------------------------------------------------------------ */ +static int rmfork_dump_mem(struct rmfork_kargs *kargs); +static int rmfork_restore_all(struct rmfork_kargs *kargs); +static int rmfork_restore_one(struct rmfork_kargs *kargs); +static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree, + unsigned long vpid, struct iovec __user *iovs, + unsigned long nr); + +/* ------------------------------------------------------------------ */ +/* UBMEM (pmem) allocator */ +/* ------------------------------------------------------------------ */ +static int __init rmfork_ubmem_init(void) +{ + phys_addr_t phys; + + phys = memblock_phys_alloc(RMFORK_UBMEM_SIZE, SZ_2M); + if (!phys) { + pr_err("rmfork: failed to reserve %pa of pmem\n", &RMFORK_UBMEM_SIZE); + return -ENOMEM; + } + + rmfork_ubmem_base = memremap(phys, RMFORK_UBMEM_SIZE, MEMREMAP_WB); + if (!rmfork_ubmem_base) { + pr_err("rmfork: failed to memremap ubmem\n"); + memblock_phys_free(phys, RMFORK_UBMEM_SIZE); + return -ENOMEM; + } + + rfork_ubmem_phys = phys; + rfork_ubmem_size = RMFORK_UBMEM_SIZE; + rfork_ubmem_off = 0; + + memset(rmfork_ubmem_base, 0, RMFORK_UBMEM_SIZE); + + pr_info("rmfork: ubmem reserved at phys=0x%llx, size=0x%lx, va=%px\n", + (unsigned long long)phys, RMFORK_UBMEM_SIZE, rmfork_ubmem_base); + return 0; +} +early_initcall(rmfork_ubmem_init); + +static void *rmfork_alloc_from_ubmem(unsigned long size) +{ + void *ptr; + unsigned long off; + + mutex_lock(&rmfork_mutex); + off = ALIGN(rfork_ubmem_off, 8); + if (off + size > rfork_ubmem_size) { + mutex_unlock(&rmfork_mutex); + pr_err("rmfork: ubmem exhausted (off=0x%lx, size=0x%lx, max=0x%lx)\n", + off, size, rfork_ubmem_size); + return NULL; + } + ptr = rmfork_ubmem_base + off; + rfork_ubmem_off = off + size; + mutex_unlock(&rmfork_mutex); + + memset(ptr, 0, size); + return ptr; +} + +static void *rmfork_off_to_va(unsigned long off) +{ + if (off >= rfork_ubmem_size) { + pr_err("rmfork: invalid ubmem offset 0x%lx (max 
0x%lx)\n", + off, rfork_ubmem_size); + return NULL; + } + return rmfork_ubmem_base + off; +} + +static unsigned long rfork_va_to_off(void *va) +{ + unsigned long off; + + off = (unsigned long)(va - rmfork_ubmem_base); + if (off >= rfork_ubmem_size) { + pr_err("rmfork: va %px not in ubmem range\n", va); + return 0; + } + return off; +} + +/* ------------------------------------------------------------------ */ +/* Page-table walk helpers */ +/* ------------------------------------------------------------------ */ +static int rmfork_get_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl, pte_t **ptep) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + return -ENOENT; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d) || p4d_bad(*p4d)) + return -ENOENT; + + pud = pud_offset(p4d, addr); + if (pud_none(*pud) || pud_bad(*pud)) + return -ENOENT; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return -ENOENT; + + /* Handle THP */ + if (pmd_trans_huge(*pmd)) { + *ptep = (pte_t *)pmd; + *ptl = pmd_lock(mm, pmd); + return 0; + } + + if (pmd_bad(*pmd)) + return -ENOENT; + + pte = pte_offset_map_lock(mm, pmd, addr, ptl); + if (!pte || pte_none(*pte)) { + pte_unmap_unlock(pte, *ptl); + return -ENOENT; + } + + *ptep = pte; + return 0; +} + +/* ------------------------------------------------------------------ */ +/* Dump path (rmfork_opt=3) */ +/* ------------------------------------------------------------------ */ +static int rmfork_prepare_pts(struct rmfork_task *task, unsigned long max_pages) +{ + size_t size; + + size = max_pages * sizeof(struct rmfork_pt_entry); + task->pts = rmfork_alloc_from_ubmem(size); + if (!task->pts) + return -ENOMEM; + + task->pts_off = rfork_va_to_off(task->pts); + task->pt_cnt = 0; + return 0; +} + +static int rmfork_record_vma(struct mm_struct *mm, struct rmfork_task *task, + unsigned long vma_start, unsigned long vma_end) +{ + 
unsigned long addr; + spinlock_t *ptl; + pte_t *ptep; + int ret; + + for (addr = vma_start; addr < vma_end; addr += PAGE_SIZE) { + if (task->pt_cnt >= RMFORK_PTS_MAX) { + pr_warn("rmfork: too many pages (%lu) for task %d\n", + task->pt_cnt, task->pid); + return -E2BIG; + } + + ret = rmfork_get_pte(mm, addr, &ptl, &ptep); + if (ret == -ENOENT) + continue; + if (ret < 0) + continue; + + if (pte_none(*ptep) || !pte_present(*ptep)) { + pte_unmap_unlock(ptep, ptl); + continue; + } + + task->pts[task->pt_cnt].va = addr >> PAGE_SHIFT; + task->pts[task->pt_cnt].pfn_saved = pte_pfn(*ptep); + + pte_unmap_unlock(ptep, ptl); + task->pt_cnt++; + } + + return 0; +} + +static int rmfork_copy_pages(struct rmfork_task *task) +{ + unsigned long i; + struct page *page; + void *src, *dst; + + dst = task->data_ptr; + + for (i = 0; i < task->pt_cnt; i++) { + page = pfn_to_page(task->pts[i].pfn_saved); + if (!page) { + pr_warn("rmfork: invalid pfn 0x%lx at index %lu\n", + task->pts[i].pfn_saved, i); + continue; + } + + src = kmap_local_page(page); + if (!src) + continue; + + memcpy(dst + i * PAGE_SIZE, src, PAGE_SIZE); + kunmap_local(src); + + /* Convert to ubmem offset index */ + task->pts[i].pfn = (rfork_va_to_off(task->data_ptr) >> PAGE_SHIFT) + i; + } + + return 0; +} + +static unsigned long rmfork_init_metadata(struct rmfork_pstree *pstree) +{ + struct rmfork_metadata *meta; + + meta = rmfork_alloc_from_ubmem(sizeof(*meta)); + if (!meta) + return 0; + + meta->magic = RMFORK_MAGIC; + meta->pstrees = pstree; + meta->pstrees_pa = rfork_va_to_off(pstree); + meta->tsk_cnt = pstree->tsk_cnt; + + return rfork_va_to_off(meta); +} + +static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree, + unsigned long vpid, struct iovec __user *iovs, + unsigned long nr) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct *vma; + struct rmfork_task *task; + unsigned long max_pages, data_size; + int ret; + unsigned long idx; + + tsk = get_pid_task(pid, PIDTYPE_PID); 
+ if (!tsk) { + pr_err("rmfork: task not found, pid=%d\n", pid_nr(pid)); + return -ESRCH; + } + + mm = get_task_mm(tsk); + if (!mm) { + pr_err("rmfork: no mm for task %d\n", pid_nr(pid)); + put_task_struct(tsk); + return -EINVAL; + } + + /* Allocate task slot in pstree */ + idx = pstree->tsk_cnt; + if (idx >= RMFORK_TASK_MAX) { + pr_err("rmfork: too many tasks\n"); + mmput(mm); + put_task_struct(tsk); + return -E2BIG; + } + task = &pstree->tasks[idx]; + + task->pid = pid_nr(pid); + task->vpid = (pid_t)vpid; + task->pt_cnt = 0; + + /* Count VMAs to estimate max pages */ + max_pages = 0; + mmap_read_lock(mm); + for (vma = mm->mmap; vma; vma = vma->vm_next) + max_pages += (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + + if (max_pages > RMFORK_PTS_MAX) + max_pages = RMFORK_PTS_MAX; + + /* Allocate pts and data pages in ubmem */ + ret = rmfork_prepare_pts(task, max_pages); + if (ret) { + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(tsk); + return ret; + } + + /* Walk all VMAs and record pages */ + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + rmfork_record_vma(mm, task, vma->vm_start, vma->vm_end); + } + mmap_read_unlock(mm); + + if (task->pt_cnt == 0) { + pr_warn("rmfork: no pages recorded for task %d\n", task->pid); + mmput(mm); + put_task_struct(tsk); + return 0; + } + + /* Allocate data buffer in ubmem */ + data_size = task->pt_cnt * PAGE_SIZE; + task->data_ptr = rmfork_alloc_from_ubmem(data_size); + if (!task->data_ptr) { + mmput(mm); + put_task_struct(tsk); + return -ENOMEM; + } + task->data_ptr_off = rfork_va_to_off(task->data_ptr); + + /* Copy page content */ + ret = rmfork_copy_pages(task); + if (ret) { + mmput(mm); + put_task_struct(tsk); + return ret; + } + + pr_info("rmfork: dumped task pid=%d vpid=%lu pt_cnt=%lu data_off=0x%lx\n", + task->pid, vpid, task->pt_cnt, task->data_ptr_off); + + pstree->tsk_cnt++; + mmput(mm); + put_task_struct(tsk); + return 0; +} + +static int 
rmfork_dump_mem(struct rmfork_kargs *kargs) +{ + struct pid *pid; + struct rmfork_pstree *pstrees = NULL; + unsigned long meta_off; + int ret; + + pid = find_get_pid((pid_t)kargs->pid); + if (!pid) { + pr_err("rmfork: cannot find pid %lu\n", kargs->pid); + return -ESRCH; + } + + mutex_lock(&rmfork_mutex); + + pstrees = rmfork_pstree; + if (!pstrees) { + pstrees = rmfork_alloc_from_ubmem(sizeof(*pstrees)); + if (!pstrees) { + mutex_unlock(&rmfork_mutex); + put_pid(pid); + return -ENOMEM; + } + pstrees->tsk_cnt = 0; + pstrees->criu_pid = (pid_t)kargs->pid; + + pstrees->tasks = rmfork_alloc_from_ubmem( + sizeof(struct rmfork_task) * RMFORK_TASK_MAX); + if (!pstrees->tasks) { + mutex_unlock(&rmfork_mutex); + put_pid(pid); + return -ENOMEM; + } + pstrees->tasks_off = rfork_va_to_off(pstrees->tasks); + + rmfork_pstree = pstrees; + pr_info("rmfork: created new pstree at %px\n", pstrees); + } else { + pr_info("rmfork: reusing existing pstree (tsk_cnt=%lu)\n", + pstrees->tsk_cnt); + } + + mutex_unlock(&rmfork_mutex); + + ret = dump_one_task(pid, pstrees, kargs->pid, + (struct iovec __user *)kargs->iovs, kargs->nr_iovs); + if (ret) { + put_pid(pid); + return ret; + } + + mutex_lock(&rmfork_mutex); + meta_off = rmfork_init_metadata(pstrees); + if (meta_off) { + rfork_meta_off = meta_off; + pr_info("rmfork: metadata written at ubmem offset 0x%lx\n", meta_off); + } + mutex_unlock(&rmfork_mutex); + + kargs->pa = meta_off; + + put_pid(pid); + return 0; +} + +/* ------------------------------------------------------------------ */ +/* Restore path (rmfork_opt=1 / 2) */ +/* ------------------------------------------------------------------ */ +static unsigned long rmfork_task_find_pa_bsearch(struct rmfork_task *task, + unsigned long va, + unsigned long *pos) +{ + int lo = 0, hi = task->pt_cnt - 1, mid; + unsigned long key = va >> PAGE_SHIFT; + + while (lo <= hi) { + mid = (lo + hi) / 2; + if (task->pts[mid].va == key) { + *pos = mid; + return task->pts[mid].pfn; + } + if 
(task->pts[mid].va < key) + lo = mid + 1; + else + hi = mid - 1; + } + *pos = 0; + return 0; +} + +static int rmfork_handle_one_fault(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, unsigned long pfn, + int remap, int thp) +{ + struct page *page; + unsigned long pa; + + pa = rfork_ubmem_phys + (pfn << PAGE_SHIFT); + page = pfn_to_page(pa >> PAGE_SHIFT); + if (!page) + return -EINVAL; + + return vm_insert_page(vma, addr, page); +} + +static int rmfork_restore_vma(struct mm_struct *mm, struct rmfork_task *task, + unsigned long iov_base, unsigned long iov_off, + unsigned long len, unsigned long *saved_idx, + unsigned long *find_time) +{ + unsigned long va, pfn; + unsigned long pos; + struct vm_area_struct *vma; + int ret; + + if (len == 0) + return 0; + + for (va = iov_base; va < iov_base + len; va += PAGE_SIZE) { + pfn = rmfork_task_find_pa_bsearch(task, va, &pos); + if (pfn == 0) + continue; + + vma = find_vma(mm, va); + if (!vma) { + pr_warn("rmfork: no vma for va=0x%lx\n", va); + continue; + } + + *saved_idx = pos; + (*find_time)++; + + ret = rmfork_handle_one_fault(vma, va, PAGE_SIZE, pfn, 0, 0); + if (ret) { + pr_err("rmfork: handle_one_fault failed at va=0x%lx: %d\n", + va, ret); + return ret; + } + } + + return 0; +} + +static int rmfork_restore_iov(struct mm_struct *mm, struct rmfork_task *task, + struct iovec __user *iovs, unsigned long nr) +{ + struct iovec *kiovs; + unsigned long saved_idx = 0, find_time = 0; + int ret; + unsigned long i; + + if (!iovs || nr == 0) + return -EINVAL; + + kiovs = memdup_user(iovs, array_size(nr, sizeof(struct iovec))); + if (IS_ERR(kiovs)) + return PTR_ERR(kiovs); + + for (i = 0; i < nr; i++) { + if (!kiovs[i].iov_base || !kiovs[i].iov_len) + continue; + + ret = rmfork_restore_vma(mm, task, + (unsigned long)kiovs[i].iov_base, 0, + kiovs[i].iov_len, &saved_idx, &find_time); + if (ret) + goto out; + } + + pr_info("rmfork: restored %lu pages for task pid=%d\n", + find_time, task->pid); + ret = 0; + +out: +
kfree(kiovs); + return ret; +} + +static int rmfork_restore_one(struct rmfork_kargs *kargs) +{ + struct mm_struct *mm; + struct task_struct *tsk; + struct pid *pid; + struct rmfork_metadata *meta; + struct rmfork_pstree *old_tree; + struct rmfork_task *rmfork_task, *old_task; + int ret = 0; + unsigned long i; + + pid = find_get_pid((pid_t)kargs->pid); + if (!pid) + return -ESRCH; + + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) { + put_pid(pid); + return -ESRCH; + } + + mm = get_task_mm(tsk); + if (!mm) { + put_task_struct(tsk); + put_pid(pid); + return -EINVAL; + } + + mutex_lock(&rmfork_mutex); + + meta = rmfork_off_to_va(kargs->pa); + if (!meta || meta->magic != RMFORK_MAGIC) { + pr_err("rmfork: bad metadata at off=0x%lx (magic=0x%lx)\n", + kargs->pa, meta ? meta->magic : 0); + ret = -EINVAL; + goto out_unlock; + } + + pr_info("rmfork: restore meta at %px, magic=0x%lx, tsk_cnt=%d\n", + meta, meta->magic, meta->tsk_cnt); + + old_tree = rmfork_off_to_va(meta->pstrees_pa); + if (!old_tree) { + ret = -EINVAL; + goto out_unlock; + } + + /* Either reuse existing pstree or build new one */ + if (rmfork_pstree && rfork_meta_off == kargs->pa) { + pr_info("rmfork: pstree already created, reusing\n"); + rmfork_task = rmfork_pstree->tasks; + } else { + /* First restore: copy task info from old tree */ + if (!rmfork_pstree) { + rmfork_pstree = rmfork_alloc_from_ubmem(sizeof(*rmfork_pstree)); + if (!rmfork_pstree) { + ret = -ENOMEM; + goto out_unlock; + } + } + + old_task = rmfork_off_to_va(old_tree->tasks_off); + if (!old_task) { + ret = -EINVAL; + goto out_unlock; + } + + rmfork_pstree->tsk_cnt = old_tree->tsk_cnt; + rmfork_pstree->tasks_off = old_tree->tasks_off; + + rmfork_pstree->tasks = rmfork_alloc_from_ubmem( + sizeof(struct rmfork_task) * old_tree->tsk_cnt); + if (!rmfork_pstree->tasks) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(rmfork_pstree->tasks, old_task, + sizeof(struct rmfork_task) * old_tree->tsk_cnt); + + /* Wire up pts: they're already in 
ubmem, reuse directly */ + for (i = 0; i < old_tree->tsk_cnt; i++) { + rmfork_pstree->tasks[i].pts = + rmfork_off_to_va(old_task[i].pts_off); + rmfork_pstree->tasks[i].data_ptr = + rmfork_off_to_va(old_task[i].data_ptr_off); + } + + rmfork_pstree->criu_pid = (pid_t)kargs->pid; + rfork_meta_off = kargs->pa; + pr_info("rmfork: new pstree created, tsk_cnt=%lu\n", + rmfork_pstree->tsk_cnt); + } + + rmfork_task = rmfork_pstree->tasks; + + /* Find the matching task by vpid and restore */ + for (i = 0; i < rmfork_pstree->tsk_cnt; i++) { + if (rmfork_task[i].vpid == (pid_t)kargs->pid) { + pr_info("rmfork: restoring pid=%d (vpid=%d), pt_cnt=%lu\n", + rmfork_task[i].pid, rmfork_task[i].vpid, + rmfork_task[i].pt_cnt); + ret = rmfork_restore_iov(mm, &rmfork_task[i], + (struct iovec __user *)kargs->iovs, + kargs->nr_iovs); + break; + } + } + + if (i == rmfork_pstree->tsk_cnt) { + pr_warn("rmfork: no matching task for pid=%lu\n", kargs->pid); + ret = -ENOENT; + } + +out_unlock: + mutex_unlock(&rmfork_mutex); + mmput(mm); + put_task_struct(tsk); + put_pid(pid); + return ret; +} + +static int rmfork_restore_all(struct rmfork_kargs *kargs) +{ + return rmfork_restore_one(kargs); +} + +/* ------------------------------------------------------------------ */ +/* Syscall entry point */ +/* ------------------------------------------------------------------ */ +SYSCALL_DEFINE1(remote_fork, struct rmfork_kargs __user *, uargs) +{ + struct rmfork_kargs kargs; + int ret; + + if (!uargs) + return -EINVAL; + + if (copy_from_user(&kargs, uargs, sizeof(kargs))) + return -EFAULT; + + pr_info("rmfork: syscall pid=%lu opt=%lu\n", kargs.pid, kargs.rmfork_opt); + + switch (kargs.rmfork_opt) { + case RMFORK_OPT_DUMP: + ret = rmfork_dump_mem(&kargs); + break; + case RMFORK_OPT_RESTORE_ALL: + ret = rmfork_restore_all(&kargs); + break; + case RMFORK_OPT_RESTORE_ONE: + ret = rmfork_restore_one(&kargs); + break; + default: + pr_err("rmfork: unknown opt %lu\n", kargs.rmfork_opt); + return -EINVAL; + } + +
return ret; +} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e8e117787377..6f4125e32da8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -271,6 +271,9 @@ COND_SYSCALL(memfd_secret); * Architecture specific weak syscall entries. */ +/* rmfork: arm64 (syscall 454) */ +COND_SYSCALL(remote_fork); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ COND_SYSCALL(pciconfig_read); COND_SYSCALL(pciconfig_write); -- 2.53.0
Add a kernel parameter pmemmem=<size>:<phys_addr> to reserve a region of physical memory as persistent memory (pmem) used by rmfork's ubmem allocator for storing process memory pages during checkpoint. Changes: - lib/Kconfig: add CONFIG_KUP_PMEM_MEMORY option - arch/arm64/mm/init.c: parse_pmem() early_param handler, reserve_pmem() called from arm64_memblock_init(), pmem_res struct definition - arch/arm64/kernel/setup.c: insert pmem_res into iomem_resource tree - include/linux/ioport.h: add IORES_DESC_KPMEM_DEV resource descriptor - include/linux/mm.h: expose pmem_res extern under CONFIG_KUP_PMEM_MEMORY Example: pmemmem=8G:0x40000000 reserves 8GB at physical 0x40000000. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- arch/arm64/kernel/setup.c | 5 +++ arch/arm64/mm/init.c | 75 +++++++++++++++++++++++++++++++++++++++ include/linux/ioport.h | 1 + include/linux/mm.h | 4 +++ lib/Kconfig | 6 ++++ 5 files changed, 91 insertions(+) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 09a15a9a8b2b..6e85500b098f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -281,6 +281,11 @@ static void __init request_standard_resources(void) insert_resource(&iomem_resource, &kernel_code); insert_resource(&iomem_resource, &kernel_data); +#ifdef CONFIG_KUP_PMEM_MEMORY + if (pmem_res.end) + insert_resource(&iomem_resource, &pmem_res); +#endif + num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index da75dd9d964b..c60a7fbc3ea5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -72,6 +72,18 @@ EXPORT_SYMBOL(memstart_addr); */ phys_addr_t __ro_after_init arm64_dma_phys_limit; +#ifdef CONFIG_KUP_PMEM_MEMORY +static unsigned long long pmem_size, pmem_phystart; + +struct resource pmem_res = { + .name = "Kpmem Dev", + .start = 0, + .end = 0, + .flags 
= IORESOURCE_MEM, + .desc = IORES_DESC_KPMEM_DEV +}; +#endif + /* Current arm64 boot protocol requires 2MB alignment */ #define CRASH_ALIGN SZ_2M @@ -128,6 +140,65 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) return 0; } +#ifdef CONFIG_KUP_PMEM_MEMORY +static int __init parse_pmem(char *par) +{ + char *cur = par; + + if (!par) + return 0; + + pmem_size = 0; + pmem_phystart = 0; + + pmem_size = memparse(par, &cur); + if (par == cur) { + pr_warn("pmem: memory value expected\n"); + return -EINVAL; + } + + if (*cur == ':') + pmem_phystart = memparse(cur + 1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("pmem: unrecognized char %c\n", *cur); + return -EINVAL; + } + + return 0; +} +early_param("pmemmem", parse_pmem); + +static void __init reserve_pmem(void) +{ + if (!pmem_size || !pmem_phystart) + return; + + pmem_size = PAGE_ALIGN(pmem_size); + + if (!memblock_is_region_memory(pmem_phystart, pmem_size)) { + pr_warn("cannot reserve pmem: region is not memory!\n"); + return; + } + + if (memblock_is_region_reserved(pmem_phystart, pmem_size)) { + pr_warn("cannot reserve pmem: region overlaps reserved memory!\n"); + return; + } + + if (!IS_ALIGNED(pmem_phystart, SZ_2M)) { + pr_warn("cannot reserve pmem: base address is not 2MB aligned\n"); + return; + } + memblock_reserve(pmem_phystart, pmem_size); + memblock_remove(pmem_phystart, pmem_size); + pr_info("pmem reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + pmem_phystart, pmem_phystart + pmem_size, pmem_size >> 20); + + pmem_res.start = pmem_phystart; + pmem_res.end = pmem_phystart + pmem_size - 1; +} +#endif /* CONFIG_KUP_PMEM_MEMORY */ + /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -551,6 +622,10 @@ void __init arm64_memblock_init(void) early_init_fdt_scan_reserved_mem(); +#ifdef CONFIG_KUP_PMEM_MEMORY + reserve_pmem(); +#endif + high_memory = __va(memblock_end_of_DRAM() - 1) + 1; } diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 
25d768d48970..69139a9000e6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -143,6 +143,7 @@ enum { IORES_DESC_RESERVED = 7, IORES_DESC_SOFT_RESERVED = 8, IORES_DESC_CXL = 9, + IORES_DESC_KPMEM_DEV = 10, }; /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 75d32b512cb4..f269dca11bef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,6 +44,10 @@ extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); +#ifdef CONFIG_KUP_PMEM_MEMORY +extern struct resource pmem_res; +#endif + #ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; diff --git a/lib/Kconfig b/lib/Kconfig index 43d69669465a..d574c240b977 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -754,6 +754,12 @@ config PLDMFW select CRC32 default n +config KUP_PMEM_MEMORY + bool "reserve memory for kup pmem to store image" + default y + help + Reserve the physical range given by the pmemmem=<size>:<phys_addr> boot parameter as persistent memory for rmfork's ubmem. + config ASN1_ENCODER tristate -- 2.53.0
- Add /sys/kernel/rmfork/ubmem_reset (write-only) to reset ubmem state between dump cycles without rebooting. Clears metadata, zeroes pmem. - Split rmfork_alloc_from_ubmem into _locked variant for callers that already hold rmfork_mutex, and ALIGN to PAGE_SIZE instead of 8 bytes. - Support CONFIG_KUP_PMEM_MEMORY in rmfork_ubmem_init: use the kernel pmemmem parameter region if available, fall back to memblock_phys_alloc. - Expand RMFORK_PTS_MAX from 4096 to SZ_8G>>PAGE_SHIFT (2,097,152), enabling checkpoint of processes with up to 8GB of virtual memory. - Switch VMA iteration from legacy mm->mmap linked list to for_each_vma with VMA_ITERATOR (maple tree). Replace VMA filter VM_IO|VM_PFNMAP with VM_SPECIAL, a superset that also skips VM_DONTEXPAND and VM_MIXEDMAP mappings. - Remove unused rmfork_on_single_machine flag. - Add copy_to_user back to userspace after syscall completes so CRIU can read the result, and pr_err on failure for diagnostics. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- kernel/rmfork.c | 133 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 105 insertions(+), 28 deletions(-) diff --git a/kernel/rmfork.c b/kernel/rmfork.c index 675d712b3d7d..d9bf6eae1520 100644 --- a/kernel/rmfork.c +++ b/kernel/rmfork.c @@ -44,7 +44,7 @@ #define RMFORK_MAGIC 0x1234abcdUL #define RMFORK_UBMEM_START 0x840000000ULL #define RMFORK_UBMEM_SIZE SZ_32M -#define RMFORK_PTS_MAX 4096 +#define RMFORK_PTS_MAX (SZ_8G >> PAGE_SHIFT) #define RMFORK_TASK_MAX 512 /* ------------------------------------------------------------------ */ @@ -94,8 +94,6 @@ static unsigned long rfork_ubmem_off; /* current allocation offset */ static struct rmfork_pstree *rmfork_pstree; static unsigned long rfork_meta_off; -/* Whether we are on the same machine as dump (simplifies pfn handling) */ -static bool rmfork_on_single_machine = true; /* ------------------------------------------------------------------ */ /* Forward declarations */ @@ -112,54 +110,121 @@ static int dump_one_task(struct pid *pid,
struct rmfork_pstree *pstree, /* ------------------------------------------------------------------ */ static int __init rmfork_ubmem_init(void) { - phys_addr_t phys; - - phys = memblock_phys_alloc(RMFORK_UBMEM_SIZE, SZ_2M); - if (!phys) { - pr_err("rmfork: failed to reserve %pa of pmem\n", &RMFORK_UBMEM_SIZE); - return -ENOMEM; + phys_addr_t phys = 0; + unsigned long size = RMFORK_UBMEM_SIZE; + +#ifdef CONFIG_KUP_PMEM_MEMORY + if (pmem_res.end) { + phys = pmem_res.start; + size = pmem_res.end - pmem_res.start + 1; + pr_info("rmfork: using pmemmem region at phys=0x%llx, size=0x%lx\n", + (unsigned long long)phys, size); + } else +#endif + { + phys = memblock_phys_alloc(size, SZ_2M); + if (!phys) { + pr_err("rmfork: failed to reserve %zu of pmem\n", size); + return -ENOMEM; + } } - rmfork_ubmem_base = memremap(phys, RMFORK_UBMEM_SIZE, MEMREMAP_WB); + rmfork_ubmem_base = memremap(phys, size, MEMREMAP_WB); if (!rmfork_ubmem_base) { pr_err("rmfork: failed to memremap ubmem\n"); - memblock_phys_free(phys, RMFORK_UBMEM_SIZE); +#ifndef CONFIG_KUP_PMEM_MEMORY + memblock_phys_free(phys, size); +#endif return -ENOMEM; } rfork_ubmem_phys = phys; - rfork_ubmem_size = RMFORK_UBMEM_SIZE; + rfork_ubmem_size = size; rfork_ubmem_off = 0; - memset(rmfork_ubmem_base, 0, RMFORK_UBMEM_SIZE); + memset(rmfork_ubmem_base, 0, size); - pr_info("rmfork: ubmem reserved at phys=0x%llx, size=0x%lx, va=%px\n", - (unsigned long long)phys, RMFORK_UBMEM_SIZE, rmfork_ubmem_base); + pr_info("rmfork: ubmem active at phys=0x%llx, size=0x%lx, va=%px\n", + (unsigned long long)phys, size, rmfork_ubmem_base); return 0; } early_initcall(rmfork_ubmem_init); -static void *rmfork_alloc_from_ubmem(unsigned long size) +/* ------------------------------------------------------------------ */ +/* Sysfs interface */ +/* ------------------------------------------------------------------ */ +#include <linux/kobject.h> + +static struct kobject *rmfork_kobj; + +static ssize_t ubmem_reset_store(struct kobject *kobj, + 
struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int val; + + if (kstrtoint(buf, 0, &val)) + return -EINVAL; + + if (val == 1) { + mutex_lock(&rmfork_mutex); + rfork_ubmem_off = 0; + memset(rmfork_ubmem_base, 0, rfork_ubmem_size); + rmfork_pstree = NULL; + rfork_meta_off = 0; + mutex_unlock(&rmfork_mutex); + pr_info("rmfork: ubmem reset by sysfs\n"); + } + + return count; +} + +static struct kobj_attribute ubmem_reset_attr = __ATTR_WO(ubmem_reset); + +static int __init rmfork_sysfs_init(void) +{ + int ret; + + rmfork_kobj = kobject_create_and_add("rmfork", kernel_kobj); + if (!rmfork_kobj) + return -ENOMEM; + + ret = sysfs_create_file(rmfork_kobj, &ubmem_reset_attr.attr); + if (ret) + kobject_put(rmfork_kobj); + + return ret; +} +device_initcall(rmfork_sysfs_init); + +static void *rmfork_alloc_from_ubmem_locked(unsigned long size) { void *ptr; unsigned long off; - mutex_lock(&rmfork_mutex); - off = ALIGN(rfork_ubmem_off, 8); + off = ALIGN(rfork_ubmem_off, PAGE_SIZE); if (off + size > rfork_ubmem_size) { - mutex_unlock(&rmfork_mutex); pr_err("rmfork: ubmem exhausted (off=0x%lx, size=0x%lx, max=0x%lx)\n", off, size, rfork_ubmem_size); return NULL; } ptr = rmfork_ubmem_base + off; rfork_ubmem_off = off + size; - mutex_unlock(&rmfork_mutex); memset(ptr, 0, size); return ptr; } +static void *rmfork_alloc_from_ubmem(unsigned long size) +{ + void *ptr; + + mutex_lock(&rmfork_mutex); + ptr = rmfork_alloc_from_ubmem_locked(size); + mutex_unlock(&rmfork_mutex); + return ptr; +} + static void *rmfork_off_to_va(unsigned long off) { if (off >= rfork_ubmem_size) { @@ -317,7 +382,7 @@ static unsigned long rmfork_init_metadata(struct rmfork_pstree *pstree) { struct rmfork_metadata *meta; - meta = rmfork_alloc_from_ubmem(sizeof(*meta)); + meta = rmfork_alloc_from_ubmem_locked(sizeof(*meta)); if (!meta) return 0; @@ -371,8 +436,10 @@ static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree, /* Count VMAs to estimate max pages */ max_pages = 0; 
mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + VMA_ITERATOR(vmi, mm, 0); + for_each_vma(vmi, vma) { max_pages += (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + } if (max_pages > RMFORK_PTS_MAX) max_pages = RMFORK_PTS_MAX; @@ -387,8 +454,9 @@ static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree, } /* Walk all VMAs and record pages */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + mas_set(&vmi.mas, 0); + for_each_vma(vmi, vma) { + if (vma->vm_flags & VM_SPECIAL) continue; rmfork_record_vma(mm, task, vma->vm_start, vma->vm_end); } @@ -445,7 +513,7 @@ static int rmfork_dump_mem(struct rmfork_kargs *kargs) pstrees = rmfork_pstree; if (!pstrees) { - pstrees = rmfork_alloc_from_ubmem(sizeof(*pstrees)); + pstrees = rmfork_alloc_from_ubmem_locked(sizeof(*pstrees)); if (!pstrees) { mutex_unlock(&rmfork_mutex); put_pid(pid); @@ -454,7 +522,7 @@ static int rmfork_dump_mem(struct rmfork_kargs *kargs) pstrees->tsk_cnt = 0; pstrees->criu_pid = (pid_t)kargs->pid; - pstrees->tasks = rmfork_alloc_from_ubmem( + pstrees->tasks = rmfork_alloc_from_ubmem_locked( sizeof(struct rmfork_task) * RMFORK_TASK_MAX); if (!pstrees->tasks) { mutex_unlock(&rmfork_mutex); @@ -660,7 +728,7 @@ static int rmfork_restore_one(struct rmfork_kargs *kargs) } else { /* First restore: copy task info from old tree */ if (!rmfork_pstree) { - rmfork_pstree = rmfork_alloc_from_ubmem(sizeof(*rmfork_pstree)); + rmfork_pstree = rmfork_alloc_from_ubmem_locked(sizeof(*rmfork_pstree)); if (!rmfork_pstree) { ret = -ENOMEM; goto out_unlock; @@ -676,7 +744,7 @@ static int rmfork_restore_one(struct rmfork_kargs *kargs) rmfork_pstree->tsk_cnt = old_tree->tsk_cnt; rmfork_pstree->tasks_off = old_tree->tasks_off; - rmfork_pstree->tasks = rmfork_alloc_from_ubmem( + rmfork_pstree->tasks = rmfork_alloc_from_ubmem_locked( sizeof(struct rmfork_task) * old_tree->tsk_cnt); if (!rmfork_pstree->tasks) { ret = -ENOMEM; @@ -764,5 +832,14 @@ 
SYSCALL_DEFINE1(remote_fork, struct rmfork_kargs __user *, uargs) return -EINVAL; } + if (copy_to_user(uargs, &kargs, sizeof(kargs))) { + pr_err("rmfork: copy_to_user failed after opt=%lu ret=%d\n", + kargs.rmfork_opt, ret); + return -EFAULT; + } + + if (ret) + pr_err("rmfork: opt=%lu failed: %d\n", kargs.rmfork_opt, ret); + return ret; } -- 2.53.0
Replace the old page restoration approach that used vm_insert_page to insert ubmem pages directly into the process page table. The new approach copies page data from ubmem to the process address space using copy_to_user, which triggers normal page faults. This is simpler and more reliable: - Remove rmfork_handle_one_fault, rmfork_restore_vma, rmfork_restore_iov - Add rmfork_restore_pages with two modes: * IOV-based: only touch pages in the specified iovec ranges * All-pages: iterate all saved page table entries - Pages that hit -EFAULT (e.g., guard pages with prot=0) are skipped and counted, not treated as errors - Fix typo: rfork_task[i].vpid → rmfork_task[i].vpid Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- kernel/rmfork.c | 135 +++++++++++++++++++++--------------------------- 1 file changed, 60 insertions(+), 75 deletions(-) diff --git a/kernel/rmfork.c b/kernel/rmfork.c index d9bf6eae1520..3ac48106652b 100644 --- a/kernel/rmfork.c +++ b/kernel/rmfork.c @@ -94,7 +94,7 @@ static unsigned long rfork_ubmem_off; /* current allocation offset */ static struct rmfork_pstree *rmfork_pstree; static unsigned long rfork_meta_off; - +/* Whether we are on the same machine as dump (simplifies pfn handling) */ /* ------------------------------------------------------------------ */ /* Forward declarations */ /* ------------------------------------------------------------------ */ @@ -104,6 +104,8 @@ static int rmfork_restore_one(struct rmfork_kargs *kargs); static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree, unsigned long vpid, struct iovec __user *iovs, unsigned long nr); +static int rmfork_restore_pages(struct rmfork_task *task, + struct iovec __user *iovs, unsigned long nr); /* ------------------------------------------------------------------ */ /* UBMEM (pmem) allocator */ @@ -586,94 +588,77 @@ static unsigned long rmfork_task_find_pa_bsearch(struct rmfork_task *task, return 0; } -static int rmfork_handle_one_fault(struct vm_area_struct 
*vma, unsigned long addr, - unsigned long size, unsigned long pfn, - int remap, int thp) +/* + * Restore pages using copy_to_user from the kernel ubmem. + * This triggers normal page faults that allocate the correct page type + * (anonymous for anonymous VMAs, COW for private file-backed), + * matching what the normal CRIU preadv restore does. + * + * No mmap lock is needed — the restorer is single-threaded so the + * address space is stable. The page fault handler takes mmap_read_lock + * internally. Pages whose VMA was unmapped in the meantime simply + * get -EFAULT and are skipped. + */ +static int rmfork_restore_pages(struct rmfork_task *task, + struct iovec __user *iovs, unsigned long nr) { - struct page *page; - unsigned long pa; + unsigned long i, find_time = 0, skip_time = 0; + int ret = 0; - pa = rfork_ubmem_phys + (pfn << PAGE_SHIFT); - page = pfn_to_page(pa >> PAGE_SHIFT); - if (!page) - return -EINVAL; + if (iovs && nr > 0) { + /* IOV-based restore — only touch ranges in the list */ + struct iovec *kiovs; - return vm_insert_page(vma, addr, page); -} + kiovs = memdup_user(iovs, nr * sizeof(struct iovec)); + if (IS_ERR(kiovs)) + return PTR_ERR(kiovs); -static int rmfork_restore_vma(struct mm_struct *mm, struct rmfork_task *task, - unsigned long iov_base, unsigned long iov_off, - unsigned long len, unsigned long *saved_idx, - unsigned long *find_time) -{ - unsigned long va, pfn; - unsigned long pos; - struct vm_area_struct *vma; - int ret; + for (i = 0; i < nr; i++) { + unsigned long base = (unsigned long)kiovs[i].iov_base; + unsigned long len = kiovs[i].iov_len; + unsigned long addr; - if (len == 0) - return 0; + for (addr = base; addr < base + len; addr += PAGE_SIZE) { + unsigned long pos; + unsigned long pfn; + void *src; - for (va = iov_base; va < iov_base + len; va += PAGE_SIZE) { - pfn = rmfork_task_find_pa_bsearch(task, va, &pos); - if (pfn == 0) - continue; + pfn = rmfork_task_find_pa_bsearch(task, addr, &pos); + if (pfn == 0) + continue; - vma = 
find_vma(mm, va); - if (!vma) { - pr_warn("rmfork: no vma for va=0x%lx\n", va); - continue; + src = (void *)rmfork_ubmem_base + (pfn << PAGE_SHIFT); + + /* copy_to_user triggers page fault if needed */ + if (copy_to_user((void __user *)addr, src, PAGE_SIZE)) { + skip_time++; + continue; + } + find_time++; + } } + kfree(kiovs); + } else { + /* Restore all saved pages */ + for (i = 0; i < task->pt_cnt; i++) { + unsigned long va = task->pts[i].va << PAGE_SHIFT; + void *src; - *saved_idx = pos; - (*find_time)++; + src = (void *)rmfork_ubmem_base + (task->pts[i].pfn << PAGE_SHIFT); - ret = rmfork_handle_one_fault(vma, va, PAGE_SIZE, pfn, 0, 0); - if (ret) { - pr_err("rmfork: handle_one_fault failed at va=0x%lx: %d\n", - va, ret); - return ret; + if (copy_to_user((void __user *)va, src, PAGE_SIZE)) { + skip_time++; + continue; + } + find_time++; } } + pr_info("rmfork: restored %lu pages (skipped %lu) for task pid=%d\n", + find_time, skip_time, task->pid); return 0; } -static int rmfork_restore_iov(struct mm_struct *mm, struct rmfork_task *task, - struct iovec __user *iovs, unsigned long nr) -{ - struct iovec *kiovs; - unsigned long saved_idx = 0, find_time = 0; - int ret; - unsigned long i; - - if (!iovs || nr == 0) - return -EINVAL; - - kiovs = memdup_user(iovs, nr * sizeof(struct iovec)); - if (IS_ERR(kiovs)) - return PTR_ERR(kiovs); - - for (i = 0; i < nr; i++) { - if (!kiovs[i].iov_base || !kiovs[i].iov_len) - continue; - - ret = rmfork_restore_vma(mm, task, - (unsigned long)kiovs[i].iov_base, 0, - kiovs[i].iov_len, &saved_idx, &find_time); - if (ret) - goto out; - } - - pr_info("rmfork: restored %lu pages for task pid=%d\n", - find_time, task->pid); - ret = 0; - -out: - kfree(kiovs); - return ret; -} - static int rmfork_restore_one(struct rmfork_kargs *kargs) { struct mm_struct *mm; @@ -774,9 +759,9 @@ static int rmfork_restore_one(struct rmfork_kargs *kargs) for (i = 0; i < rmfork_pstree->tsk_cnt; i++) { if (rmfork_task[i].vpid == (pid_t)kargs->pid) { 
pr_info("rmfork: restoring pid=%d (vpid=%d), pt_cnt=%lu\n", - rmfork_task[i].pid, rfork_task[i].vpid, + rmfork_task[i].pid, rmfork_task[i].vpid, rmfork_task[i].pt_cnt); - ret = rmfork_restore_iov(mm, &rmfork_task[i], + ret = rmfork_restore_pages(&rmfork_task[i], (struct iovec __user *)kargs->iovs, kargs->nr_iovs); break; -- 2.53.0
participants (1)
-
Qi Xi