- Add /sys/kernel/rmfork/ubmem_reset (write-only) to reset ubmem state between dump cycles without rebooting. Clears metadata, zeroes pmem. - Split rmfork_alloc_from_ubmem into _locked variant for callers that already hold rmfork_mutex, and ALIGN to PAGE_SIZE instead of 8 bytes. - Support CONFIG_KUP_PMEM_MEMORY in rmfork_ubmem_init: use the kernel pmemmem parameter region if available, fall back to memblock_phys_alloc. - Expand RMFORK_PTS_MAX from 4096 to SZ_8G>>PAGE_SHIFT (2,097,152), enabling checkpoint of processes with up to 8GB of virtual memory. - Switch VMA iteration from legacy mm->mmap linked list to for_each_vma with VMA_ITERATOR (maple tree). Replace VMA filter VM_IO|VM_PFNMAP with VM_SPECIAL, which is a superset (it also covers VM_DONTEXPAND and VM_MIXEDMAP), so VMAs carrying those flags are now skipped as well. - Remove unused rmfork_on_single_machine flag. - Add copy_to_user back to userspace after syscall completes so CRIU can read the result, and pr_err on failure for diagnostics. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- kernel/rmfork.c | 133 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 105 insertions(+), 28 deletions(-) diff --git a/kernel/rmfork.c b/kernel/rmfork.c index 675d712b3d7d..d9bf6eae1520 100644 --- a/kernel/rmfork.c +++ b/kernel/rmfork.c @@ -44,7 +44,7 @@ #define RMFORK_MAGIC 0x1234abcdUL #define RMFORK_UBMEM_START 0x840000000ULL #define RMFORK_UBMEM_SIZE SZ_32M -#define RMFORK_PTS_MAX 4096 +#define RMFORK_PTS_MAX (SZ_8G >> PAGE_SHIFT) #define RMFORK_TASK_MAX 512 /* ------------------------------------------------------------------ */ @@ -94,8 +94,6 @@ static unsigned long rfork_ubmem_off; /* current allocation offset */ static struct rmfork_pstree *rmfork_pstree; static unsigned long rfork_meta_off; -/* Whether we are on the same machine as dump (simplifies pfn handling) */ -static bool rmfork_on_single_machine = true; /* ------------------------------------------------------------------ */ /* Forward declarations */ @@ -112,54 +110,121 @@ static int dump_one_task(struct pid *pid, 
struct rmfork_pstree *pstree, /* ------------------------------------------------------------------ */ static int __init rmfork_ubmem_init(void) { - phys_addr_t phys; - - phys = memblock_phys_alloc(RMFORK_UBMEM_SIZE, SZ_2M); - if (!phys) { - pr_err("rmfork: failed to reserve %pa of pmem\n", &RMFORK_UBMEM_SIZE); - return -ENOMEM; + phys_addr_t phys = 0; + unsigned long size = RMFORK_UBMEM_SIZE; + +#ifdef CONFIG_KUP_PMEM_MEMORY + if (pmem_res.end) { + phys = pmem_res.start; + size = pmem_res.end - pmem_res.start + 1; + pr_info("rmfork: using pmemmem region at phys=0x%llx, size=0x%lx\n", + (unsigned long long)phys, size); + } else +#endif + { + phys = memblock_phys_alloc(size, SZ_2M); + if (!phys) { + pr_err("rmfork: failed to reserve %zu of pmem\n", size); + return -ENOMEM; + } } - rmfork_ubmem_base = memremap(phys, RMFORK_UBMEM_SIZE, MEMREMAP_WB); + rmfork_ubmem_base = memremap(phys, size, MEMREMAP_WB); if (!rmfork_ubmem_base) { pr_err("rmfork: failed to memremap ubmem\n"); - memblock_phys_free(phys, RMFORK_UBMEM_SIZE); +#ifndef CONFIG_KUP_PMEM_MEMORY + memblock_phys_free(phys, size); +#endif return -ENOMEM; } rfork_ubmem_phys = phys; - rfork_ubmem_size = RMFORK_UBMEM_SIZE; + rfork_ubmem_size = size; rfork_ubmem_off = 0; - memset(rmfork_ubmem_base, 0, RMFORK_UBMEM_SIZE); + memset(rmfork_ubmem_base, 0, size); - pr_info("rmfork: ubmem reserved at phys=0x%llx, size=0x%lx, va=%px\n", - (unsigned long long)phys, RMFORK_UBMEM_SIZE, rmfork_ubmem_base); + pr_info("rmfork: ubmem active at phys=0x%llx, size=0x%lx, va=%px\n", + (unsigned long long)phys, size, rmfork_ubmem_base); return 0; } early_initcall(rmfork_ubmem_init); -static void *rmfork_alloc_from_ubmem(unsigned long size) +/* ------------------------------------------------------------------ */ +/* Sysfs interface */ +/* ------------------------------------------------------------------ */ +#include <linux/kobject.h> + +static struct kobject *rmfork_kobj; + +static ssize_t ubmem_reset_store(struct kobject *kobj, + 
struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int val; + + if (kstrtoint(buf, 0, &val)) + return -EINVAL; + + if (val == 1) { + mutex_lock(&rmfork_mutex); + rfork_ubmem_off = 0; + memset(rmfork_ubmem_base, 0, rfork_ubmem_size); + rmfork_pstree = NULL; + rfork_meta_off = 0; + mutex_unlock(&rmfork_mutex); + pr_info("rmfork: ubmem reset by sysfs\n"); + } + + return count; +} + +static struct kobj_attribute ubmem_reset_attr = __ATTR_WO(ubmem_reset); + +static int __init rmfork_sysfs_init(void) +{ + int ret; + + rmfork_kobj = kobject_create_and_add("rmfork", kernel_kobj); + if (!rmfork_kobj) + return -ENOMEM; + + ret = sysfs_create_file(rmfork_kobj, &ubmem_reset_attr.attr); + if (ret) + kobject_put(rmfork_kobj); + + return ret; +} +device_initcall(rmfork_sysfs_init); + +static void *rmfork_alloc_from_ubmem_locked(unsigned long size) { void *ptr; unsigned long off; - mutex_lock(&rmfork_mutex); - off = ALIGN(rfork_ubmem_off, 8); + off = ALIGN(rfork_ubmem_off, PAGE_SIZE); if (off + size > rfork_ubmem_size) { - mutex_unlock(&rmfork_mutex); pr_err("rmfork: ubmem exhausted (off=0x%lx, size=0x%lx, max=0x%lx)\n", off, size, rfork_ubmem_size); return NULL; } ptr = rmfork_ubmem_base + off; rfork_ubmem_off = off + size; - mutex_unlock(&rmfork_mutex); memset(ptr, 0, size); return ptr; } +static void *rmfork_alloc_from_ubmem(unsigned long size) +{ + void *ptr; + + mutex_lock(&rmfork_mutex); + ptr = rmfork_alloc_from_ubmem_locked(size); + mutex_unlock(&rmfork_mutex); + return ptr; +} + static void *rmfork_off_to_va(unsigned long off) { if (off >= rfork_ubmem_size) { @@ -317,7 +382,7 @@ static unsigned long rmfork_init_metadata(struct rmfork_pstree *pstree) { struct rmfork_metadata *meta; - meta = rmfork_alloc_from_ubmem(sizeof(*meta)); + meta = rmfork_alloc_from_ubmem_locked(sizeof(*meta)); if (!meta) return 0; @@ -371,8 +436,10 @@ static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree, /* Count VMAs to estimate max pages */ max_pages = 0; 
mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + VMA_ITERATOR(vmi, mm, 0); + for_each_vma(vmi, vma) { max_pages += (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + } if (max_pages > RMFORK_PTS_MAX) max_pages = RMFORK_PTS_MAX; @@ -387,8 +454,9 @@ static int dump_one_task(struct pid *pid, struct rmfork_pstree *pstree, } /* Walk all VMAs and record pages */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + mas_set(&vmi.mas, 0); + for_each_vma(vmi, vma) { + if (vma->vm_flags & VM_SPECIAL) continue; rmfork_record_vma(mm, task, vma->vm_start, vma->vm_end); } @@ -445,7 +513,7 @@ static int rmfork_dump_mem(struct rmfork_kargs *kargs) pstrees = rmfork_pstree; if (!pstrees) { - pstrees = rmfork_alloc_from_ubmem(sizeof(*pstrees)); + pstrees = rmfork_alloc_from_ubmem_locked(sizeof(*pstrees)); if (!pstrees) { mutex_unlock(&rmfork_mutex); put_pid(pid); @@ -454,7 +522,7 @@ static int rmfork_dump_mem(struct rmfork_kargs *kargs) pstrees->tsk_cnt = 0; pstrees->criu_pid = (pid_t)kargs->pid; - pstrees->tasks = rmfork_alloc_from_ubmem( + pstrees->tasks = rmfork_alloc_from_ubmem_locked( sizeof(struct rmfork_task) * RMFORK_TASK_MAX); if (!pstrees->tasks) { mutex_unlock(&rmfork_mutex); @@ -660,7 +728,7 @@ static int rmfork_restore_one(struct rmfork_kargs *kargs) } else { /* First restore: copy task info from old tree */ if (!rmfork_pstree) { - rmfork_pstree = rmfork_alloc_from_ubmem(sizeof(*rmfork_pstree)); + rmfork_pstree = rmfork_alloc_from_ubmem_locked(sizeof(*rmfork_pstree)); if (!rmfork_pstree) { ret = -ENOMEM; goto out_unlock; @@ -676,7 +744,7 @@ static int rmfork_restore_one(struct rmfork_kargs *kargs) rmfork_pstree->tsk_cnt = old_tree->tsk_cnt; rmfork_pstree->tasks_off = old_tree->tasks_off; - rmfork_pstree->tasks = rmfork_alloc_from_ubmem( + rmfork_pstree->tasks = rmfork_alloc_from_ubmem_locked( sizeof(struct rmfork_task) * old_tree->tsk_cnt); if (!rmfork_pstree->tasks) { ret = -ENOMEM; @@ -764,5 +832,14 @@ 
SYSCALL_DEFINE1(remote_fork, struct rmfork_kargs __user *, uargs) return -EINVAL; } + if (copy_to_user(uargs, &kargs, sizeof(kargs))) { + pr_err("rmfork: copy_to_user failed after opt=%lu ret=%d\n", + kargs.rmfork_opt, ret); + return -EFAULT; + } + + if (ret) + pr_err("rmfork: opt=%lu failed: %d\n", kargs.rmfork_opt, ret); + return ret; } -- 2.53.0