[PATCH OLK-6.6 0/3] CVE-2026-53084
Puranjay Mohan (3): bpf: fix mm lifecycle in open-coded task_vma iterator bpf: switch task_vma iterator from mmap_lock to per-VMA locks bpf: return VMA snapshot from task_vma iterator kernel/bpf/task_iter.c | 151 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 136 insertions(+), 15 deletions(-) -- 2.34.1
From: Puranjay Mohan <puranjay@kernel.org> mainline inclusion from mainline-v7.1-rc1 commit d8e27d2d22b6e2df3a0125b8c08e9aace38c954c category: bugfix bugzilla: https://atomgit.com/src-openeuler/kernel/issues/15786 CVE: CVE-2026-53084 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- The open-coded task_vma iterator reads task->mm locklessly and acquires mmap_read_trylock() but never calls mmget(). If the task exits concurrently, the mm_struct can be freed as it is not SLAB_TYPESAFE_BY_RCU, resulting in a use-after-free. Safely read task->mm with a trylock on alloc_lock and acquire an mm reference. Drop the reference via bpf_iter_mmput_async() in _destroy() and error paths. bpf_iter_mmput_async() is a local wrapper around mmput_async() with a fallback to mmput() on !CONFIG_MMU. Reject irqs-disabled contexts (including NMI) up front. Operations used by _next() and _destroy() (mmap_read_unlock, bpf_iter_mmput_async) take spinlocks with IRQs disabled (pool->lock, pi_lock). Running from NMI or from a tracepoint that fires with those locks held could deadlock. A trylock on alloc_lock is used instead of the blocking task_lock() (get_task_mm) to avoid a deadlock when a softirq BPF program iterates a task that already holds its alloc_lock on the same CPU. Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs") Signed-off-by: Puranjay Mohan <puranjay@kernel.org> Link: https://lore.kernel.org/r/20260408154539.3832150-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Pu Lehui <pulehui@huawei.com> --- kernel/bpf/task_iter.c | 54 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 145c506899e4..26284c8a8c72 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -10,6 +10,7 @@ #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> #include <linux/mm_types.h> +#include <linux/sched/mm.h> #include "mmap_unlock_work.h" static const char * const iter_task_type_names[] = { @@ -825,6 +826,15 @@ const struct bpf_func_proto bpf_find_vma_proto = { .arg5_type = ARG_ANYTHING, }; +static inline void bpf_iter_mmput_async(struct mm_struct *mm) +{ +#ifdef CONFIG_MMU + mmput_async(mm); +#else + mmput(mm); +#endif +} + struct bpf_iter_task_vma_kern_data { struct task_struct *task; struct mm_struct *mm; @@ -856,6 +866,24 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma)); BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma)); + /* bpf_iter_mmput_async() needs mmput_async() which requires CONFIG_MMU */ + if (!IS_ENABLED(CONFIG_MMU)) { + kit->data = NULL; + return -EOPNOTSUPP; + } + + /* + * Reject irqs-disabled contexts including NMI. Operations used + * by _next() and _destroy() (mmap_read_unlock, bpf_iter_mmput_async) + * can take spinlocks with IRQs disabled (pi_lock, pool->lock). + * Running from NMI or from a tracepoint that fires with those + * locks held could deadlock. + */ + if (irqs_disabled()) { + kit->data = NULL; + return -EBUSY; + } + /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized * before, so non-NULL kit->data doesn't point to previously * bpf_mem_alloc'd bpf_iter_task_vma_kern_data @@ -865,7 +893,25 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, return -ENOMEM; kit->data->task = get_task_struct(task); + /* + * Safely read task->mm and acquire an mm reference. + * + * Cannot use get_task_mm() because its task_lock() is a + * blocking spin_lock that would deadlock if the target task + * already holds alloc_lock on this CPU (e.g. a softirq BPF + * program iterating a task interrupted while holding its + * alloc_lock). + */ + if (!spin_trylock(&task->alloc_lock)) { + err = -EBUSY; + goto err_cleanup_iter; + } kit->data->mm = task->mm; + if (kit->data->mm && !(task->flags & PF_KTHREAD)) + mmget(kit->data->mm); + else + kit->data->mm = NULL; + spin_unlock(&task->alloc_lock); if (!kit->data->mm) { err = -ENOENT; goto err_cleanup_iter; @@ -875,15 +921,16 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) { err = -EBUSY; - goto err_cleanup_iter; + goto err_cleanup_mmget; } vma_iter_init(&kit->data->vmi, kit->data->mm, addr); return 0; +err_cleanup_mmget: + bpf_iter_mmput_async(kit->data->mm); err_cleanup_iter: - if (kit->data->task) - put_task_struct(kit->data->task); + put_task_struct(kit->data->task); bpf_mem_free(&bpf_global_ma, kit->data); /* NULL kit->data signals failed bpf_iter_task_vma initialization */ kit->data = NULL; @@ -906,6 +953,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) if (kit->data) { bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); put_task_struct(kit->data->task); + bpf_iter_mmput_async(kit->data->mm); bpf_mem_free(&bpf_global_ma, kit->data); } } -- 2.34.1
From: Puranjay Mohan <puranjay@kernel.org> mainline inclusion from mainline-v7.1-rc1 commit bee9ef4a40a277bf401be43d39ba7f7f063cf39c category: bugfix bugzilla: https://atomgit.com/src-openeuler/kernel/issues/15786 CVE: CVE-2026-53084 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- The open-coded task_vma iterator holds mmap_lock for the entire duration of iteration, increasing contention on this highly contended lock. Switch to per-VMA locking. Find the next VMA via an RCU-protected maple tree walk and lock it with lock_vma_under_rcu(). lock_next_vma() is not used because its fallback takes mmap_read_lock(), and the iterator must work in non-sleepable contexts. lock_vma_under_rcu() is a point lookup (mas_walk) that finds the VMA containing a given address but cannot iterate across gaps. An RCU-protected vma_next() walk (mas_find) first locates the next VMA's vm_start to pass to lock_vma_under_rcu(). Between the RCU walk and the lock, the VMA may be removed, shrunk, or write-locked. On failure, advance past it using vm_end from the RCU walk. Because the VMA slab is SLAB_TYPESAFE_BY_RCU, vm_end may be stale; fall back to PAGE_SIZE advancement when it does not make forward progress. Concurrent VMA insertions at addresses already passed by the iterator are not detected. CONFIG_PER_VMA_LOCK is required; return -EOPNOTSUPP without it. Signed-off-by: Puranjay Mohan <puranjay@kernel.org> Link: https://lore.kernel.org/r/20260408154539.3832150-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Pu Lehui <pulehui@huawei.com> --- kernel/bpf/task_iter.c | 91 +++++++++++++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 26284c8a8c72..804e5a9b283f 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -10,6 +10,7 @@ #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> #include <linux/mm_types.h> +#include <linux/mmap_lock.h> #include <linux/sched/mm.h> #include "mmap_unlock_work.h" @@ -838,8 +839,8 @@ static inline void bpf_iter_mmput_async(struct mm_struct *mm) struct bpf_iter_task_vma_kern_data { struct task_struct *task; struct mm_struct *mm; - struct mmap_unlock_irq_work *work; - struct vma_iterator vmi; + struct vm_area_struct *locked_vma; + u64 next_addr; }; struct bpf_iter_task_vma { @@ -860,21 +861,19 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, struct task_struct *task, u64 addr) { struct bpf_iter_task_vma_kern *kit = (void *)it; - bool irq_work_busy = false; int err; BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma)); BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma)); - /* bpf_iter_mmput_async() needs mmput_async() which requires CONFIG_MMU */ - if (!IS_ENABLED(CONFIG_MMU)) { + if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) { kit->data = NULL; return -EOPNOTSUPP; } /* * Reject irqs-disabled contexts including NMI. Operations used - * by _next() and _destroy() (mmap_read_unlock, bpf_iter_mmput_async) + * by _next() and _destroy() (vma_end_read, bpf_iter_mmput_async) * can take spinlocks with IRQs disabled (pi_lock, pool->lock). * Running from NMI or from a tracepoint that fires with those * locks held could deadlock. @@ -917,18 +916,10 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, goto err_cleanup_iter; } - /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */ - irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); - if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) { - err = -EBUSY; - goto err_cleanup_mmget; - } - - vma_iter_init(&kit->data->vmi, kit->data->mm, addr); + kit->data->locked_vma = NULL; + kit->data->next_addr = addr; return 0; -err_cleanup_mmget: - bpf_iter_mmput_async(kit->data->mm); err_cleanup_iter: put_task_struct(kit->data->task); bpf_mem_free(&bpf_global_ma, kit->data); @@ -937,13 +928,76 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, return err; } +/* + * Find and lock the next VMA at or after data->next_addr. + * + * lock_vma_under_rcu() is a point lookup (mas_walk): it finds the VMA + * containing a given address but cannot iterate. An RCU-protected + * maple tree walk with vma_next() (mas_find) is needed first to locate + * the next VMA's vm_start across any gap. + * + * Between the RCU walk and the lock, the VMA may be removed, shrunk, + * or write-locked. On failure, advance past it using vm_end from the + * RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall back + * to PAGE_SIZE advancement to guarantee forward progress. + */ +static struct vm_area_struct * +bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data) +{ + struct vm_area_struct *vma; + struct vma_iterator vmi; + unsigned long start, end; + +retry: + rcu_read_lock(); + vma_iter_init(&vmi, data->mm, data->next_addr); + vma = vma_next(&vmi); + if (!vma) { + rcu_read_unlock(); + return NULL; + } + start = vma->vm_start; + end = vma->vm_end; + rcu_read_unlock(); + + vma = lock_vma_under_rcu(data->mm, start); + if (!vma) { + if (end <= data->next_addr) + data->next_addr += PAGE_SIZE; + else + data->next_addr = end; + goto retry; + } + + if (unlikely(vma->vm_end <= data->next_addr)) { + data->next_addr += PAGE_SIZE; + vma_end_read(vma); + goto retry; + } + + return vma; +} + __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) { struct bpf_iter_task_vma_kern *kit = (void *)it; + struct vm_area_struct *vma; if (!kit->data) /* bpf_iter_task_vma_new failed */ return NULL; - return vma_next(&kit->data->vmi); + + if (kit->data->locked_vma) { + vma_end_read(kit->data->locked_vma); + kit->data->locked_vma = NULL; + } + + vma = bpf_iter_task_vma_find_next(kit->data); + if (!vma) + return NULL; + + kit->data->locked_vma = vma; + kit->data->next_addr = vma->vm_end; + return vma; } __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) @@ -951,7 +1005,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) struct bpf_iter_task_vma_kern *kit = (void *)it; if (kit->data) { - bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); + if (kit->data->locked_vma) + vma_end_read(kit->data->locked_vma); put_task_struct(kit->data->task); bpf_iter_mmput_async(kit->data->mm); bpf_mem_free(&bpf_global_ma, kit->data); -- 2.34.1
From: Puranjay Mohan <puranjay@kernel.org> mainline inclusion from mainline-v7.1-rc1 commit 4cbee026db54cad39c39db4d356100cb133412b3 category: bugfix bugzilla: https://atomgit.com/src-openeuler/kernel/issues/15786 CVE: CVE-2026-53084 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Holding the per-VMA lock across the BPF program body creates a lock ordering problem when helpers acquire locks that depend on mmap_lock: vm_lock -> i_rwsem -> mmap_lock -> vm_lock Snapshot the VMA under the per-VMA lock in _next() via memcpy(), then drop the lock before returning. The BPF program accesses only the snapshot. The verifier only trusts vm_mm and vm_file pointers (see BTF_TYPE_SAFE_TRUSTED_OR_NULL in verifier.c). vm_file is reference- counted with get_file() under the lock and released via fput() on the next iteration or in _destroy(). vm_mm is already correct because lock_vma_under_rcu() verifies vma->vm_mm == mm. All other pointers are left as-is by memcpy() since the verifier treats them as untrusted. Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs") Signed-off-by: Puranjay Mohan <puranjay@kernel.org> Acked-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Mykyta Yatsenko <yatsenko@meta.com> Link: https://lore.kernel.org/r/20260408154539.3832150-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Pu Lehui <pulehui@huawei.com> --- kernel/bpf/task_iter.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 804e5a9b283f..807ef6875880 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -839,7 +839,7 @@ static inline void bpf_iter_mmput_async(struct mm_struct *mm) struct bpf_iter_task_vma_kern_data { struct task_struct *task; struct mm_struct *mm; - struct vm_area_struct *locked_vma; + struct vm_area_struct snapshot; u64 next_addr; }; @@ -873,7 +873,7 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, /* * Reject irqs-disabled contexts including NMI. Operations used - * by _next() and _destroy() (vma_end_read, bpf_iter_mmput_async) + * by _next() and _destroy() (vma_end_read, fput, bpf_iter_mmput_async) * can take spinlocks with IRQs disabled (pi_lock, pool->lock). * Running from NMI or from a tracepoint that fires with those * locks held could deadlock. @@ -916,7 +916,7 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, goto err_cleanup_iter; } - kit->data->locked_vma = NULL; + kit->data->snapshot.vm_file = NULL; kit->data->next_addr = addr; return 0; @@ -978,26 +978,45 @@ bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data) return vma; } +static void bpf_iter_task_vma_snapshot_reset(struct vm_area_struct *snap) +{ + if (snap->vm_file) { + fput(snap->vm_file); + snap->vm_file = NULL; + } +} + __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) { struct bpf_iter_task_vma_kern *kit = (void *)it; - struct vm_area_struct *vma; + struct vm_area_struct *snap, *vma; if (!kit->data) /* bpf_iter_task_vma_new failed */ return NULL; - if (kit->data->locked_vma) { - vma_end_read(kit->data->locked_vma); - kit->data->locked_vma = NULL; - } + snap = &kit->data->snapshot; + + bpf_iter_task_vma_snapshot_reset(snap); vma = bpf_iter_task_vma_find_next(kit->data); if (!vma) return NULL; - kit->data->locked_vma = vma; + memcpy(snap, vma, sizeof(*snap)); + + /* + * The verifier only trusts vm_mm and vm_file (see + * BTF_TYPE_SAFE_TRUSTED_OR_NULL in verifier.c). Take a reference + * on vm_file; vm_mm is already correct because lock_vma_under_rcu() + * verifies vma->vm_mm == mm. All other pointers are untrusted by + * the verifier and left as-is. + */ + if (snap->vm_file) + get_file(snap->vm_file); + kit->data->next_addr = vma->vm_end; - return vma; + vma_end_read(vma); + return snap; } __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) @@ -1005,8 +1024,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) struct bpf_iter_task_vma_kern *kit = (void *)it; if (kit->data) { - if (kit->data->locked_vma) - vma_end_read(kit->data->locked_vma); + bpf_iter_task_vma_snapshot_reset(&kit->data->snapshot); put_task_struct(kit->data->task); bpf_iter_mmput_async(kit->data->mm); bpf_mem_free(&bpf_global_ma, kit->data); -- 2.34.1
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://atomgit.com/openeuler/kernel/merge_requests/24605 邮件列表地址:https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/RXM... FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/24605 Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/RXM...
participants (2)
-
patchwork bot -
Pu Lehui