Introduce RMFork (Remote Memory Fork) — a kernel mechanism that allows
CRIU to save and restore process memory via kernel-reserved pmem (ubmem),
bypassing the need for userspace pages.img.

Core changes:
- Add syscall 454 (remote_fork) for ARM64 and raise __NR_syscalls so the
  syscall table covers it
- Create kernel/rmfork.c with dump and restore handlers
- Reserve 32MB of pmem at boot via memblock + memremap as ubmem
- Dump path (opt=3): walk page table, copy pages to ubmem, record VA->PFN
- Restore path (opt=1): insert saved pages into target process's VMAs
- Add UAPI header (include/uapi/linux/rmfork.h) for struct rmfork_kargs
- Add Kconfig option CONFIG_RMFORK, default y on ARM64

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 Kconfig                              |   1 +
 arch/arm64/include/uapi/asm/unistd.h |  14 +
 include/uapi/linux/rmfork.h          |  26 +
 kernel/Kconfig.rmfork                |   8 +
 kernel/Makefile                      |   1 +
 kernel/rmfork.c                      | 768 +++++++++++++++++++++++++++
 kernel/sys_ni.c                      |   3 +
 7 files changed, 821 insertions(+)
 create mode 100644 include/uapi/linux/rmfork.h
 create mode 100644 kernel/Kconfig.rmfork
 create mode 100644 kernel/rmfork.c

diff --git a/Kconfig b/Kconfig
index 4a96e16e5f31..33b9d4a3d858 100644
--- a/Kconfig
+++ b/Kconfig
@@ -10,6 +10,7 @@ source "scripts/Kconfig.include"
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
+source "kernel/Kconfig.rmfork"
 
 source "fs/Kconfig.binfmt"
 
diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h
index 079139c04b14..2456d590c123 100644
--- a/arch/arm64/include/uapi/asm/unistd.h
+++ b/arch/arm64/include/uapi/asm/unistd.h
@@ -36,3 +36,17 @@
 #define __ARCH_WANT_MEMFD_SECRET
 
 #include <asm-generic/unistd.h>
+
+/*
+ * ARM64-specific syscalls
+ *
+ * __NR_syscalls (set by asm-generic/unistd.h) is the size of the syscall
+ * table, so it has to be raised to cover the number defined here.
+ */
+#define __NR_remote_fork 454
+__SYSCALL(__NR_remote_fork, sys_remote_fork)
+
+#if __NR_syscalls <= __NR_remote_fork
+#undef __NR_syscalls
+#define __NR_syscalls (__NR_remote_fork + 1)
+#endif
diff --git a/include/uapi/linux/rmfork.h b/include/uapi/linux/rmfork.h
new file mode 100644
index 000000000000..48ff75cbb559
--- /dev/null
+++ b/include/uapi/linux/rmfork.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RMFORK_H
+#define _UAPI_LINUX_RMFORK_H
+
+#include <linux/types.h>
+
+/*
+ * rmfork_opt values for sys_remote_fork (struct rmfork_kargs.rmfork_opt)
+ */
+#define RMFORK_OPT_RESTORE_ONE 1 /* restore one task's mm */
+#define RMFORK_OPT_RESTORE_ALL 2 /* currently handled like RESTORE_ONE */
+#define RMFORK_OPT_DUMP 3 /* dump one task's mm */
+
+/*
+ * Arguments passed to sys_remote_fork via pointer
+ */
+struct rmfork_kargs {
+	unsigned long rmfork_opt; /* one of RMFORK_OPT_*: dump=3, restore=1/2 */
+	unsigned long pid; /* target pid; restore also matches it against the dumped vpid */
+	unsigned long va; /* virtual address base (not read by kernel/rmfork.c) */
+	unsigned long pa; /* metadata offset in ubmem (input for restore) */
+	unsigned long iovs; /* user-space pointer to a struct iovec array */
+	unsigned long nr_iovs; /* number of entries in the iovs array */
+};
+
+#endif /* _UAPI_LINUX_RMFORK_H */
diff --git a/kernel/Kconfig.rmfork b/kernel/Kconfig.rmfork
new file mode 100644
index 000000000000..ce120a515657
--- /dev/null
+++ b/kernel/Kconfig.rmfork
@@ -0,0 +1,8 @@
+config RMFORK
+	bool "RMFork support for CRIU kernel-side checkpoint/restore"
+	depends on ARM64
+	default y
+	help
+	  Enable RMFork (Remote Memory Fork) for CRIU integration. This allows
+	  CRIU to dump and restore process memory via kernel-reserved pmem,
+	  bypassing the need for userspace pages.img.
diff --git a/kernel/Makefile b/kernel/Makefile
index fe3559ee90d9..67848a48673e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -127,6 +127,7 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
 
 obj-$(CONFIG_HAS_IOMEM) += iomem.o
+obj-$(CONFIG_RMFORK) += rmfork.o
 obj-$(CONFIG_RSEQ) += rseq.o
 
 obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
diff --git a/kernel/rmfork.c b/kernel/rmfork.c
new file mode 100644
index 000000000000..675d712b3d7d
--- /dev/null
+++ b/kernel/rmfork.c
@@ -0,0 +1,774 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RMFork - Remote Memory Fork (CRIU + kernel checkpoint/restore)
+ *
+ * Copyright (C) 2026
+ *
+ * A dump copies the resident pages of a task into a kernel-owned region
+ * ("ubmem") and records a VA -> ubmem-page table next to them.  A restore
+ * maps those pages into the VMAs of a target task.  Everything stored in
+ * ubmem is located by byte offset from the start of the region.
+ *
+ * Locking: rmfork_mutex serialises dump and restore and protects the ubmem
+ * bump allocator.  Lock order is rmfork_mutex -> mmap_lock.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/err.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/memblock.h>
+#include <linux/io.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/pid.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/uio.h>
+#include <linux/ptrace.h>
+
+#include <uapi/linux/rmfork.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+/* ------------------------------------------------------------------ */
+/* Config                                                             */
+/* ------------------------------------------------------------------ */
+#define RMFORK_MAGIC		0x1234abcdUL
+#define RMFORK_UBMEM_START	0x840000000ULL	/* not referenced yet */
+#define RMFORK_UBMEM_SIZE	SZ_32M
+#define RMFORK_PTS_MAX		4096	/* max recorded pages per task */
+#define RMFORK_TASK_MAX		512	/* max tasks per pstree */
+
+/* ------------------------------------------------------------------ */
+/* Data structures (must match spec)                                  */
+/* ------------------------------------------------------------------ */
+struct rmfork_pt_entry {
+	unsigned long va;		/* VA >> PAGE_SHIFT */
+	unsigned long pfn;		/* ubmem offset index after dump */
+	unsigned long pfn_saved;	/* original PFN (for dedup) */
+};
+
+struct rmfork_task {
+	pid_t pid;			/* real pid */
+	pid_t vpid;			/* virtual pid (from criu) */
+	unsigned long pt_cnt;		/* number of pages */
+	struct rmfork_pt_entry *pts;	/* VA->PFN mapping array */
+	unsigned long pts_off;		/* pts offset in ubmem */
+	void *data_ptr;			/* page data buffer in ubmem */
+	unsigned long data_ptr_off;	/* data_ptr offset in ubmem */
+};
+
+struct rmfork_pstree {
+	unsigned long tsk_cnt;		/* number of tasks */
+	struct rmfork_task *tasks;	/* tasks array */
+	unsigned long tasks_off;	/* tasks offset in ubmem */
+	pid_t criu_pid;			/* which criu pid did the dump */
+};
+
+struct rmfork_metadata {
+	unsigned long magic;		/* = RMFORK_MAGIC */
+	struct rmfork_pstree *pstrees;	/* pointer to pstrees */
+	unsigned long pstrees_pa;	/* pstrees offset in ubmem (pa as off) */
+	int tsk_cnt;			/* number of tasks */
+} __packed;
+
+/* ------------------------------------------------------------------ */
+/* Global state                                                       */
+/* ------------------------------------------------------------------ */
+static DEFINE_MUTEX(rmfork_mutex);
+
+static void *rmfork_ubmem_base;		/* VA of reserved ubmem region */
+static unsigned long rfork_ubmem_phys;	/* PA of reserved ubmem region */
+static unsigned long rfork_ubmem_size;	/* size */
+static unsigned long rfork_ubmem_off;	/* current allocation offset */
+
+/* Process tree that dumps append to, and the offset of its metadata */
+static struct rmfork_pstree *rmfork_pstree;
+static unsigned long rfork_meta_off;
+
+/* ------------------------------------------------------------------ */
+/* Forward declarations                                               */
+/* ------------------------------------------------------------------ */
+static int rmfork_dump_mem(struct rmfork_kargs *kargs);
+static int rmfork_restore_all(struct rmfork_kargs *kargs);
+static int rmfork_restore_one(struct rmfork_kargs *kargs);
+static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree,
+			 unsigned long vpid, struct iovec __user *iovs,
+			 unsigned long nr);
+
+/* ------------------------------------------------------------------ */
+/* UBMEM (pmem) allocator                                             */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Reserve the ubmem region and map it into the kernel.
+ *
+ * FIXME(review): memblock is only usable until memblock_free_all() hands
+ * memory over to the page allocator, which happens before initcalls run.
+ * Allocating from it here can return pages the page allocator already
+ * owns.  The reservation has to move to early arch setup (or a
+ * reserved-memory node); that is outside this file, so it is only flagged.
+ */
+static int __init rmfork_ubmem_init(void)
+{
+	phys_addr_t size = RMFORK_UBMEM_SIZE;
+	phys_addr_t phys;
+
+	phys = memblock_phys_alloc(size, SZ_2M);
+	if (!phys) {
+		/* %pa takes the address of a phys_addr_t object */
+		pr_err("rmfork: failed to reserve %pa of pmem\n", &size);
+		return -ENOMEM;
+	}
+
+	rmfork_ubmem_base = memremap(phys, size, MEMREMAP_WB);
+	if (!rmfork_ubmem_base) {
+		pr_err("rmfork: failed to memremap ubmem\n");
+		memblock_phys_free(phys, size);
+		return -ENOMEM;
+	}
+
+	rfork_ubmem_phys = phys;
+	rfork_ubmem_size = size;
+	rfork_ubmem_off = 0;
+
+	memset(rmfork_ubmem_base, 0, size);
+
+	pr_info("rmfork: ubmem reserved at phys=0x%llx, size=0x%lx, va=%px\n",
+		(unsigned long long)phys, (unsigned long)size,
+		rmfork_ubmem_base);
+	return 0;
+}
+early_initcall(rmfork_ubmem_init);
+
+/*
+ * Carve @size bytes, aligned to @align, off the end of the used part of
+ * ubmem.  The memory is not cleared.  Returns NULL when the region is full.
+ * Caller holds rmfork_mutex.
+ */
+static void *rmfork_ubmem_alloc(unsigned long size, unsigned long align)
+{
+	unsigned long off;
+
+	lockdep_assert_held(&rmfork_mutex);
+
+	off = ALIGN(rfork_ubmem_off, align);
+	/* written as a subtraction so that a huge @size cannot wrap */
+	if (off > rfork_ubmem_size || size > rfork_ubmem_size - off) {
+		pr_err("rmfork: ubmem exhausted (off=0x%lx, size=0x%lx, max=0x%lx)\n",
+		       off, size, rfork_ubmem_size);
+		return NULL;
+	}
+
+	rfork_ubmem_off = off + size;
+	return rmfork_ubmem_base + off;
+}
+
+/* Zeroed, 8-byte aligned allocation from ubmem.  Caller holds rmfork_mutex. */
+static void *rmfork_alloc_from_ubmem(unsigned long size)
+{
+	void *ptr = rmfork_ubmem_alloc(size, 8);
+
+	if (ptr)
+		memset(ptr, 0, size);
+	return ptr;
+}
+
+/*
+ * Translate a ubmem offset to a kernel address, or NULL unless the whole
+ * range [@off, @off + @size) lies inside the region.
+ */
+static void *rmfork_off_to_va(unsigned long off, unsigned long size)
+{
+	if (off >= rfork_ubmem_size || size > rfork_ubmem_size - off) {
+		pr_err("rmfork: invalid ubmem offset 0x%lx (max 0x%lx)\n",
+		       off, rfork_ubmem_size);
+		return NULL;
+	}
+	return rmfork_ubmem_base + off;
+}
+
+/* Inverse of rmfork_off_to_va(); returns 0 for addresses outside ubmem. */
+static unsigned long rfork_va_to_off(void *va)
+{
+	unsigned long off;
+
+	off = (unsigned long)(va - rmfork_ubmem_base);
+	if (off >= rfork_ubmem_size) {
+		pr_err("rmfork: va %px not in ubmem range\n", va);
+		return 0;
+	}
+	return off;
+}
+
+/* ------------------------------------------------------------------ */
+/* Dump path (rmfork_opt=3)                                           */
+/* ------------------------------------------------------------------ */
+
+/* Allocate the VA -> page table of @task with room for @max_pages entries. */
+static int rmfork_prepare_pts(struct rmfork_task *task, unsigned long max_pages)
+{
+	size_t size;
+
+	size = max_pages * sizeof(struct rmfork_pt_entry);
+	task->pts = rmfork_alloc_from_ubmem(size);
+	if (!task->pts)
+		return -ENOMEM;
+
+	task->pts_off = rfork_va_to_off(task->pts);
+	task->pt_cnt = 0;
+	return 0;
+}
+
+/*
+ * Copy every resident page of @vma into ubmem and append it to @task->pts.
+ *
+ * follow_page(FOLL_GET) does the lookup: it takes the page table locks
+ * itself, resolves huge mappings to the sub-page for @addr and returns the
+ * page with a reference held, so it cannot be freed while it is copied.
+ * Entries are appended in ascending address order, which the restore-side
+ * binary search relies on.  Caller holds rmfork_mutex and mmap_lock.
+ *
+ * Returns 0, -E2BIG past RMFORK_PTS_MAX pages, or -ENOMEM if ubmem is full.
+ */
+static int rmfork_record_vma(struct vm_area_struct *vma,
+			     struct rmfork_task *task)
+{
+	unsigned long addr;
+
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+		struct rmfork_pt_entry *ent;
+		struct page *page;
+		void *src, *dst;
+
+		page = follow_page(vma, addr, FOLL_GET);
+		if (IS_ERR_OR_NULL(page))
+			continue;	/* not resident, or not a normal page */
+
+		if (task->pt_cnt >= RMFORK_PTS_MAX) {
+			pr_warn("rmfork: too many pages (%lu) for task %d\n",
+				task->pt_cnt, task->pid);
+			put_page(page);
+			return -E2BIG;
+		}
+
+		/* Page aligned, so that restore can map it on its own */
+		dst = rmfork_ubmem_alloc(PAGE_SIZE, PAGE_SIZE);
+		if (!dst) {
+			put_page(page);
+			return -ENOMEM;
+		}
+
+		src = kmap_local_page(page);
+		memcpy(dst, src, PAGE_SIZE);
+		kunmap_local(src);
+
+		if (!task->pt_cnt) {
+			task->data_ptr = dst;
+			task->data_ptr_off = rfork_va_to_off(dst);
+		}
+
+		ent = &task->pts[task->pt_cnt++];
+		ent->va = addr >> PAGE_SHIFT;
+		ent->pfn_saved = page_to_pfn(page);
+		/* Convert to ubmem offset index */
+		ent->pfn = rfork_va_to_off(dst) >> PAGE_SHIFT;
+
+		put_page(page);
+		cond_resched();
+	}
+
+	return 0;
+}
+
+/*
+ * Return the ubmem offset of the metadata record describing @pstree,
+ * allocating it on first use.  0 means ubmem is full; a real record can
+ * never sit at offset 0 because the pstree is always allocated before it.
+ * Caller holds rmfork_mutex.
+ */
+static unsigned long rmfork_init_metadata(struct rmfork_pstree *pstree)
+{
+	struct rmfork_metadata *meta = NULL;
+
+	/* Reuse the record of an earlier dump instead of leaking a new one */
+	if (rfork_meta_off)
+		meta = rmfork_off_to_va(rfork_meta_off, sizeof(*meta));
+	if (!meta)
+		meta = rmfork_alloc_from_ubmem(sizeof(*meta));
+	if (!meta)
+		return 0;
+
+	meta->magic = RMFORK_MAGIC;
+	meta->pstrees = pstree;
+	meta->pstrees_pa = rfork_va_to_off(pstree);
+	meta->tsk_cnt = pstree->tsk_cnt;
+
+	rfork_meta_off = rfork_va_to_off(meta);
+	return rfork_meta_off;
+}
+
+/*
+ * Dump the memory of the task identified by @pid into the next free slot
+ * of @pstree.  @iovs and @nr are accepted but currently unused.  Anything
+ * taken from ubmem is given back when the dump fails or finds no resident
+ * page.  Caller holds rmfork_mutex.
+ *
+ * Returns 0 on success (also when there was nothing to dump), -ESRCH or
+ * -EINVAL when the task or its mm is gone, -EACCES when the caller may not
+ * attach to it, -E2BIG or -ENOMEM when a limit or ubmem is exhausted.
+ */
+static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree,
+			 unsigned long vpid, struct iovec __user *iovs,
+			 unsigned long nr)
+{
+	struct vm_area_struct *vma;
+	struct rmfork_task *task;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	unsigned long max_pages = 0;
+	unsigned long mark;
+	int ret = 0;
+
+	lockdep_assert_held(&rmfork_mutex);
+
+	if (pstree->tsk_cnt >= RMFORK_TASK_MAX) {
+		pr_err("rmfork: too many tasks\n");
+		return -E2BIG;
+	}
+
+	tsk = get_pid_task(pid, PIDTYPE_PID);
+	if (!tsk) {
+		pr_err("rmfork: task not found, pid=%d\n", pid_nr(pid));
+		return -ESRCH;
+	}
+
+	/* Like get_task_mm(), but also checks ptrace-attach permission */
+	mm = mm_access(tsk, PTRACE_MODE_ATTACH_REALCREDS);
+	put_task_struct(tsk);
+	if (IS_ERR_OR_NULL(mm)) {
+		pr_err("rmfork: no mm for task %d\n", pid_nr(pid));
+		return mm ? PTR_ERR(mm) : -EINVAL;
+	}
+
+	task = &pstree->tasks[pstree->tsk_cnt];
+	memset(task, 0, sizeof(*task));
+	task->pid = pid_nr(pid);
+	task->vpid = (pid_t)vpid;
+
+	mark = rfork_ubmem_off;
+	mmap_read_lock(mm);
+
+	/* Size the table from the VMAs; it can never need more entries */
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		max_pages += vma_pages(vma);
+	if (max_pages > RMFORK_PTS_MAX)
+		max_pages = RMFORK_PTS_MAX;
+
+	ret = rmfork_prepare_pts(task, max_pages);
+	if (ret)
+		goto out_unlock;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+			continue;
+		ret = rmfork_record_vma(vma, task);
+		if (ret)
+			break;
+	}
+
+out_unlock:
+	mmap_read_unlock(mm);
+	mmput(mm);
+
+	if (!ret && task->pt_cnt == 0)
+		pr_warn("rmfork: no pages recorded for task %d\n", task->pid);
+
+	if (ret || task->pt_cnt == 0) {
+		/* Nothing usable was stored: return the space to ubmem */
+		rfork_ubmem_off = mark;
+		return ret;
+	}
+
+	pr_info("rmfork: dumped task pid=%d vpid=%lu pt_cnt=%lu data_off=0x%lx\n",
+		task->pid, vpid, task->pt_cnt, task->data_ptr_off);
+
+	pstree->tsk_cnt++;
+	return 0;
+}
+
+/*
+ * Return the pstree that dumps append to, creating it (and its task array
+ * of RMFORK_TASK_MAX slots) on first use.  Caller holds rmfork_mutex.
+ */
+static struct rmfork_pstree *rmfork_get_dump_pstree(pid_t criu_pid)
+{
+	unsigned long mark = rfork_ubmem_off;
+	struct rmfork_pstree *pstree;
+
+	if (rmfork_pstree) {
+		pr_info("rmfork: reusing existing pstree (tsk_cnt=%lu)\n",
+			rmfork_pstree->tsk_cnt);
+		return rmfork_pstree;
+	}
+
+	pstree = rmfork_alloc_from_ubmem(sizeof(*pstree));
+	if (!pstree)
+		return NULL;
+
+	pstree->tasks = rmfork_alloc_from_ubmem(
+			sizeof(struct rmfork_task) * RMFORK_TASK_MAX);
+	if (!pstree->tasks) {
+		rfork_ubmem_off = mark;
+		return NULL;
+	}
+
+	pstree->tsk_cnt = 0;
+	pstree->criu_pid = criu_pid;
+	pstree->tasks_off = rfork_va_to_off(pstree->tasks);
+
+	rmfork_pstree = pstree;
+	pr_info("rmfork: created new pstree at %px\n", pstree);
+	return pstree;
+}
+
+/*
+ * RMFORK_OPT_DUMP: dump the task kargs->pid and report the offset of the
+ * metadata record in kargs->pa (copied back to user space by the caller).
+ */
+static int rmfork_dump_mem(struct rmfork_kargs *kargs)
+{
+	struct rmfork_pstree *pstree;
+	struct pid *pid;
+	int ret;
+
+	pid = find_get_pid((pid_t)kargs->pid);
+	if (!pid) {
+		pr_err("rmfork: cannot find pid %lu\n", kargs->pid);
+		return -ESRCH;
+	}
+
+	/*
+	 * One critical section for the whole dump: the allocator, the
+	 * pstree and the metadata all change and must change together.
+	 */
+	mutex_lock(&rmfork_mutex);
+
+	/* Get the metadata record first so a full ubmem fails early */
+	pstree = rmfork_get_dump_pstree((pid_t)kargs->pid);
+	if (!pstree || !rmfork_init_metadata(pstree)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = dump_one_task(pid, pstree, kargs->pid,
+			    (struct iovec __user *)kargs->iovs, kargs->nr_iovs);
+	if (ret)
+		goto out;
+
+	/* Refresh the task count; the record exists, so this cannot fail */
+	kargs->pa = rmfork_init_metadata(pstree);
+	pr_info("rmfork: metadata written at ubmem offset 0x%lx\n", kargs->pa);
+
+out:
+	mutex_unlock(&rmfork_mutex);
+	put_pid(pid);
+	return ret;
+}
+
+/* ------------------------------------------------------------------ */
+/* Restore path (rmfork_opt=1 / 2)                                    */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Index of the first entry of @task->pts whose page number is >= @key, or
+ * task->pt_cnt when there is none.  The table is sorted by page number.
+ */
+static unsigned long rmfork_task_lower_bound(const struct rmfork_task *task,
+					     unsigned long key)
+{
+	unsigned long lo = 0, hi = task->pt_cnt;
+
+	while (lo < hi) {
+		unsigned long mid = lo + (hi - lo) / 2;
+
+		if (task->pts[mid].va < key)
+			lo = mid + 1;
+		else
+			hi = mid;
+	}
+	return lo;
+}
+
+/*
+ * Map the ubmem page with index @pfn at @addr in @vma.  @pfn is an index
+ * into ubmem, not a machine PFN, and is range checked because it is read
+ * back from ubmem.  Caller holds the mmap_lock of the mm for writing,
+ * which vm_insert_page() needs outside of an ->mmap() handler.
+ */
+static int rmfork_handle_one_fault(struct vm_area_struct *vma,
+				   unsigned long addr, unsigned long pfn)
+{
+	if (pfn >= (rfork_ubmem_size >> PAGE_SHIFT))
+		return -EINVAL;
+
+	return vm_insert_page(vma, addr,
+			      pfn_to_page(PHYS_PFN(rfork_ubmem_phys) + pfn));
+}
+
+/*
+ * Map every dumped page of @task that lies in [@iov_base, @iov_base + @len)
+ * into @mm and add the number of mapped pages to *@restored.
+ *
+ * The walk follows the sorted page table instead of stepping through the
+ * address range, so its cost is bounded by task->pt_cnt whatever @len is.
+ * Caller holds mmap_lock of @mm for writing.
+ */
+static int rmfork_restore_vma(struct mm_struct *mm, struct rmfork_task *task,
+			      unsigned long iov_base, unsigned long len,
+			      unsigned long *restored)
+{
+	unsigned long first = iov_base >> PAGE_SHIFT;
+	/* PAGE_SIZE steps needed to cover @len, without overflowing on it */
+	unsigned long nr_pages = (len >> PAGE_SHIFT) + !!(len & ~PAGE_MASK);
+	unsigned long i;
+
+	mmap_assert_write_locked(mm);
+
+	for (i = rmfork_task_lower_bound(task, first);
+	     i < task->pt_cnt && task->pts[i].va - first < nr_pages; i++) {
+		unsigned long va = task->pts[i].va << PAGE_SHIFT;
+		struct vm_area_struct *vma;
+		int ret;
+
+		/* find_vma() may return a VMA that starts above @va */
+		vma = find_vma(mm, va);
+		if (!vma || vma->vm_start > va ||
+		    (vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+			pr_warn("rmfork: no vma for va=0x%lx\n", va);
+			continue;
+		}
+
+		ret = rmfork_handle_one_fault(vma, va, task->pts[i].pfn);
+		if (ret) {
+			pr_err("rmfork: handle_one_fault failed at va=0x%lx: %d\n",
+			       va, ret);
+			return ret;
+		}
+		(*restored)++;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore @task into @mm for each of the @nr user supplied iovecs at @iovs.
+ * Returns 0, -EINVAL for an empty or oversized array, -EFAULT/-ENOMEM from
+ * copying it, -EINTR if interrupted while waiting for mmap_lock, or the
+ * first mapping error.
+ */
+static int rmfork_restore_iov(struct mm_struct *mm, struct rmfork_task *task,
+			      struct iovec __user *iovs, unsigned long nr)
+{
+	unsigned long restored = 0;
+	struct iovec *kiovs;
+	unsigned long i;
+	int ret;
+
+	if (!iovs || nr == 0)
+		return -EINVAL;
+	/* @nr comes from user space: keep the byte count from wrapping */
+	if (nr > SIZE_MAX / sizeof(*kiovs))
+		return -EINVAL;
+
+	kiovs = memdup_user(iovs, nr * sizeof(*kiovs));
+	if (IS_ERR(kiovs))
+		return PTR_ERR(kiovs);
+
+	ret = mmap_write_lock_killable(mm);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < nr; i++) {
+		if (!kiovs[i].iov_base || !kiovs[i].iov_len)
+			continue;
+
+		ret = rmfork_restore_vma(mm, task,
+					 (unsigned long)kiovs[i].iov_base,
+					 kiovs[i].iov_len, &restored);
+		if (ret)
+			break;
+	}
+	mmap_write_unlock(mm);
+
+	if (!ret)
+		pr_info("rmfork: restored %lu pages for task pid=%d\n",
+			restored, task->pid);
+out:
+	kfree(kiovs);
+	return ret;
+}
+
+/*
+ * RMFORK_OPT_RESTORE_ONE: map the pages dumped for vpid kargs->pid into the
+ * task that currently has pid kargs->pid.  kargs->pa is the metadata offset
+ * a dump reported.
+ *
+ * Everything is resolved from ubmem offsets and range checked; pointers
+ * stored in ubmem are not followed, and no global state is changed.
+ */
+static int rmfork_restore_one(struct rmfork_kargs *kargs)
+{
+	struct rmfork_metadata *meta;
+	struct rmfork_pstree *tree;
+	struct rmfork_task *tasks;
+	struct rmfork_task task;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct pid *pid;
+	unsigned long i;
+	int ret = -EINVAL;
+
+	pid = find_get_pid((pid_t)kargs->pid);
+	if (!pid)
+		return -ESRCH;
+
+	tsk = get_pid_task(pid, PIDTYPE_PID);
+	put_pid(pid);
+	if (!tsk)
+		return -ESRCH;
+
+	mm = mm_access(tsk, PTRACE_MODE_ATTACH_REALCREDS);
+	put_task_struct(tsk);
+	if (IS_ERR_OR_NULL(mm))
+		return mm ? PTR_ERR(mm) : -EINVAL;
+
+	mutex_lock(&rmfork_mutex);
+
+	meta = rmfork_off_to_va(kargs->pa, sizeof(*meta));
+	if (!meta || meta->magic != RMFORK_MAGIC) {
+		pr_err("rmfork: bad metadata at off=0x%lx (magic=0x%lx)\n",
+		       kargs->pa, meta ? meta->magic : 0UL);
+		goto out_unlock;
+	}
+
+	tree = rmfork_off_to_va(meta->pstrees_pa, sizeof(*tree));
+	if (!tree || tree->tsk_cnt > RMFORK_TASK_MAX)
+		goto out_unlock;
+
+	tasks = rmfork_off_to_va(tree->tasks_off,
+				 tree->tsk_cnt * sizeof(*tasks));
+	if (!tasks)
+		goto out_unlock;
+
+	/* Find the matching task by vpid */
+	for (i = 0; i < tree->tsk_cnt; i++) {
+		if (tasks[i].vpid == (pid_t)kargs->pid)
+			break;
+	}
+	if (i == tree->tsk_cnt) {
+		pr_warn("rmfork: no matching task for pid=%lu\n", kargs->pid);
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	/* Work on a copy whose table pointer is rebuilt from its offset */
+	task = tasks[i];
+	if (task.pt_cnt > RMFORK_PTS_MAX)
+		goto out_unlock;
+	task.pts = rmfork_off_to_va(task.pts_off,
+				    task.pt_cnt * sizeof(*task.pts));
+	if (!task.pts)
+		goto out_unlock;
+
+	pr_info("rmfork: restoring pid=%d (vpid=%d), pt_cnt=%lu\n",
+		task.pid, task.vpid, task.pt_cnt);
+	ret = rmfork_restore_iov(mm, &task,
+				 (struct iovec __user *)kargs->iovs,
+				 kargs->nr_iovs);
+
+out_unlock:
+	mutex_unlock(&rmfork_mutex);
+	mmput(mm);
+	return ret;
+}
+
+/* RMFORK_OPT_RESTORE_ALL: currently identical to RMFORK_OPT_RESTORE_ONE. */
+static int rmfork_restore_all(struct rmfork_kargs *kargs)
+{
+	return rmfork_restore_one(kargs);
+}
+
+/* ------------------------------------------------------------------ */
+/* Syscall entry point                                                */
+/* ------------------------------------------------------------------ */
+
+/*
+ * remote_fork(uargs): run the operation selected by uargs->rmfork_opt.
+ *
+ * The call reads and maps the memory of other tasks and consumes a global
+ * kernel region, so it needs CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN.
+ * On a successful dump the metadata offset is written to uargs->pa.
+ */
+SYSCALL_DEFINE1(remote_fork, struct rmfork_kargs __user *, uargs)
+{
+	struct rmfork_kargs kargs;
+	int ret;
+
+	if (!capable(CAP_CHECKPOINT_RESTORE) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!uargs)
+		return -EINVAL;
+
+	if (copy_from_user(&kargs, uargs, sizeof(kargs)))
+		return -EFAULT;
+
+	/* ubmem could not be set up at boot */
+	if (!rmfork_ubmem_base)
+		return -ENODEV;
+
+	/* pid is cast to pid_t below; do not let it be truncated */
+	if (kargs.pid > INT_MAX)
+		return -ESRCH;
+
+	pr_debug("rmfork: syscall pid=%lu opt=%lu\n", kargs.pid, kargs.rmfork_opt);
+
+	switch (kargs.rmfork_opt) {
+	case RMFORK_OPT_DUMP:
+		ret = rmfork_dump_mem(&kargs);
+		if (!ret && put_user(kargs.pa, &uargs->pa))
+			ret = -EFAULT;
+		break;
+	case RMFORK_OPT_RESTORE_ALL:
+		ret = rmfork_restore_all(&kargs);
+		break;
+	case RMFORK_OPT_RESTORE_ONE:
+		ret = rmfork_restore_one(&kargs);
+		break;
+	default:
+		pr_err("rmfork: unknown opt %lu\n", kargs.rmfork_opt);
+		return -EINVAL;
+	}
+
+	return ret;
+}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e8e117787377..6f4125e32da8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -271,6 +271,9 @@ COND_SYSCALL(memfd_secret);
  * Architecture specific weak syscall entries.
  */
 
+/* rmfork: arm64 (syscall 454) */
+COND_SYSCALL(remote_fork);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 COND_SYSCALL(pciconfig_read);
 COND_SYSCALL(pciconfig_write);
-- 
2.53.0