
From: Pu Lehui <pulehui@huawei.com> mainline inclusion from mainline-v6.16-rc1 commit 2b12d06c37fd3a394376f42f026a7478d826ed63 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICO428 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Patch series "Fix uprobe pte be overwritten when expanding vma". This patch (of 4): We encountered a BUG alert triggered by Syzkaller as follows: BUG: Bad rss-counter state mm:00000000b4a60fca type:MM_ANONPAGES val:1 And we can reproduce it with the following steps: 1. register uprobe on file at zero offset 2. mmap the file at zero offset: addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0); 3. mremap part of vma1 to new vma2: addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE); 4. mremap back to orig addr1: mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1); In step 3, the vma1 range [addr1, addr1 + 4096] will be remap to new vma2 with range [addr2, addr2 + 8192], and remap uprobe anon page from the vma1 to vma2, then unmap the vma1 range [addr1, addr1 + 4096]. In step 4, the vma2 range [addr2, addr2 + 4096] will be remap back to the addr range [addr1, addr1 + 4096]. Since the addr range [addr1 + 4096, addr1 + 8192] still maps the file, it will take vma_merge_new_range to expand the range, and then do uprobe_mmap in vma_complete. Since the merged vma pgoff is also zero offset, it will install uprobe anon page to the merged vma. However, the upcomming move_page_tables step, which use set_pte_at to remap the vma2 uprobe pte to the merged vma, will overwrite the newly uprobe pte in the merged vma, and lead that pte to be orphan. Since the uprobe pte will be remapped to the merged vma, we can remove the unnecessary uprobe_mmap upon merged vma. This problem was first found in linux-6.6.y and also exists in the community syzkaller: https://lore.kernel.org/all/000000000000ada39605a5e71711@google.com/T/ Link: https://lkml.kernel.org/r/20250529155650.4017699-1-pulehui@huaweicloud.com Link: https://lkml.kernel.org/r/20250529155650.4017699-2-pulehui@huaweicloud.com Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints") Signed-off-by: Pu Lehui <pulehui@huawei.com> Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Jann Horn <jannh@google.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Conflicts: fs/userfaultfd.c include/linux/mm.h mm/internal.h mm/madvise.c mm/mempolicy.c mm/mlock.c mm/mmap.c mm/mprotect.c mm/mremap.c [The conflicts were due to refactor commit 2f1c6611b0a8] Signed-off-by: Tengda Wu <wutengda2@huawei.com> --- fs/userfaultfd.c | 6 +++--- include/linux/mm.h | 2 +- mm/internal.h | 2 ++ mm/madvise.c | 2 +- mm/mempolicy.c | 2 +- mm/mlock.c | 2 +- mm/mmap.c | 30 ++++++++++++++++++++++++------ mm/mprotect.c | 2 +- mm/mremap.c | 2 +- 9 files changed, 35 insertions(+), 15 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index c64ad1fca4e4..d616b7777eef 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -976,7 +976,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma), false); if (prev) { vma = prev; } else { @@ -1547,7 +1547,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), - anon_vma_name(vma)); + anon_vma_name(vma), false); if (prev) { /* vma_merge() invalidated the mas */ vma = prev; @@ -1732,7 +1732,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma), false); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index 77a7d7c4c88c..886569d56066 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3350,7 +3350,7 @@ extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, - struct anon_vma_name *); + struct anon_vma_name *, bool skip_vma_uprobe); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, unsigned long addr, int new_below); diff --git a/mm/internal.h b/mm/internal.h index ff96a9f53f01..ba2346a41447 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1481,6 +1481,8 @@ struct vma_prepare { struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; + + bool skip_vma_uprobe; }; void __meminit __init_single_page(struct page *page, unsigned long pfn, diff --git a/mm/madvise.c b/mm/madvise.c index bfbb48519259..b3a1500decca 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -157,7 +157,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_name); + vma->vm_userfaultfd_ctx, anon_name, false); if (*prev) { vma = *prev; goto success; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1587efaf777e..a82aab7ab47a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -844,7 +844,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma), false); if (merged) { *prev = merged; return vma_replace_policy(merged, new_pol); diff --git a/mm/mlock.c b/mm/mlock.c index f9653d30d025..31288fda9dec 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -493,7 +493,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(vmi, mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma), false); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index 32799ed58022..fb54df419ea2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -587,10 +587,13 @@ static inline void vma_complete(struct vma_prepare *vp, if (vp->file) { i_mmap_unlock_write(vp->mapping); - uprobe_mmap(vp->vma); - if (vp->adj_next) - uprobe_mmap(vp->adj_next); + if (!vp->skip_vma_uprobe) { + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } } if (vp->remove) { @@ -908,6 +911,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * NNNN is represented by *next or not represented at all (NULL) * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL + * + * @skip_vma_uprobe: only valid for copy_vma process, others please + * mark it as false. */ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, @@ -915,7 +921,8 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) + struct anon_vma_name *anon_name, + bool skip_vma_uprobe) { struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; @@ -1062,6 +1069,8 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && vp.anon_vma != adjust->anon_vma); + vp.skip_vma_uprobe = skip_vma_uprobe; + vma_prepare(&vp); vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); @@ -2873,7 +2882,7 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, merge = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, NULL, vma->vm_file, vma->vm_pgoff, NULL, - NULL_VM_UFFD_CTX, NULL); + NULL_VM_UFFD_CTX, NULL, false); if (merge) { /* @@ -3457,6 +3466,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; bool faulted_in_anon_vma = true; + bool skip_vma_uprobe = false; VMA_ITERATOR(vmi, mm, addr); /* @@ -3472,9 +3482,17 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ + /* + * If the VMA we are copying might contain a uprobe PTE, ensure + * that we do not establish one upon merge. Otherwise, when mremap() + * moves page tables, it will orphan the newly created PTE. + */ + if (vma->vm_file) + skip_vma_uprobe = true; + new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma), skip_vma_uprobe); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index ed08f87e39c4..e65363eb603e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -633,7 +633,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma), false); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); diff --git a/mm/mremap.c b/mm/mremap.c index e990bb8c8918..a3fdb878d503 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1073,7 +1073,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma = vma_merge(&vmi, mm, vma, extension_start, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma), false); if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; -- 2.34.1