From: David Hildenbrand <david@redhat.com> mainline inclusion from mainline-v6.17-rc1 commit e66d7a4f55f44aca39cc74e8c7b4602faf26b4f7 category: bugfix bugzilla: https://atomgit.com/src-openeuler/kernel/issues/14124 CVE: CVE-2026-31398 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e66d7a4f55f44aca39cc74e8c7b4602faf26b4f7 -------------------------------- Patch series "mm: folio_pte_batch() improvements", v2. Ever since we added folio_pte_batch() for fork() + munmap() purposes, a lot more users appeared (and more are being proposed), and more functionality was added. Most of the users only need basic functionality, and could benefit from a non-inlined version. So let's clean up folio_pte_batch() and split it into a basic folio_pte_batch() (no flags) and a more advanced folio_pte_batch_ext(). Using either variant will now look much cleaner. This series will likely conflict with some changes in some (old+new) folio_pte_batch() users, but conflicts should be trivial to resolve. This patch (of 4): Respecting these PTE bits is the exception, so let's invert the meaning. With this change, most callers don't have to pass any flags. This is a preparation for splitting folio_pte_batch() into a non-inlined variant that doesn't consume any flags. Long-term, we want folio_pte_batch() to probably ignore most common PTE bits (e.g., write/dirty/young/soft-dirty) that are not relevant for most page table walkers: uffd-wp and protnone might be bits to consider in the future. Only walkers that care about them can opt-in to respect them. No functional change intended. 
Link: https://lkml.kernel.org/r/20250702104926.212243-2-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Lance Yang <lance.yang@linux.dev> Reviewed-by: Zi Yan <ziy@nvidia.com> Reviewed-by: Oscar Salvador <osalvador@suse.de> Reviewed-by: Dev Jain <dev.jain@arm.com> Cc: Alistair Popple <apopple@nvidia.com> Cc: Byungchul Park <byungchul@sk.com> Cc: Gregory Price <gourry@gourry.net> Cc: "Huang, Ying" <ying.huang@linux.alibaba.com> Cc: Jann Horn <jannh@google.com> Cc: Joshua Hahn <joshua.hahnjy@gmail.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Matthew Brost <matthew.brost@intel.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Rapoport <rppt@kernel.org> Cc: Rakie Kim <rakie.kim@sk.com> Cc: Rik van Riel <riel@surriel.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Conflicts: mm/mlock.c [Wupeng Ma: conflict because commit f742829d32e2 ("mm/mlock: implement folio_mlock_step() using folio_pte_batch()") is not backported] Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- mm/internal.h | 16 ++++++++-------- mm/madvise.c | 3 +-- mm/memory.c | 11 +++++------ mm/mempolicy.c | 4 +--- mm/mremap.c | 3 +-- mm/rmap.c | 3 +-- 6 files changed, 17 insertions(+), 23 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 0f8d8d8f18959..718bb389eec4e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -104,17 +104,17 @@ static inline void *folio_raw_mapping(struct folio *folio) /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; -/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ -#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) +/* Compare PTEs respecting the dirty bit. */ +#define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0)) -/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. 
*/ -#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) +/* Compare PTEs respecting the soft-dirty bit. */ +#define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { - if (flags & FPB_IGNORE_DIRTY) + if (!(flags & FPB_RESPECT_DIRTY)) pte = pte_mkclean(pte); - if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY))) pte = pte_clear_soft_dirty(pte); return pte_wrprotect(pte_mkold(pte)); } @@ -138,8 +138,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * pages of the same large folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and - * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set) + * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set). * * start_ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single page table. diff --git a/mm/madvise.c b/mm/madvise.c index a5f4caca1ef24..5bd1a9d3dedc6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -358,10 +358,9 @@ static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, pte_t pte, bool *any_young, bool *any_dirty) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, + return folio_pte_batch(folio, addr, ptep, pte, max_nr, 0, NULL, any_young, any_dirty); } diff --git a/mm/memory.c b/mm/memory.c index 794233ef55eb3..b17e9e82d379e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1002,10 +1002,10 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. 
*/ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - if (src_vma->vm_flags & VM_SHARED) - flags |= FPB_IGNORE_DIRTY; - if (!vma_soft_dirty_enabled(src_vma)) - flags |= FPB_IGNORE_SOFT_DIRTY; + if (!(src_vma->vm_flags & VM_SHARED)) + flags |= FPB_RESPECT_DIRTY; + if (vma_soft_dirty_enabled(src_vma)) + flags |= FPB_RESPECT_SOFT_DIRTY; nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, &any_writable, NULL, NULL); @@ -1545,7 +1545,6 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct zap_details *details, int *rss, bool *force_flush, bool *force_break) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; @@ -1573,7 +1572,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, 0, NULL, NULL, NULL); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5c5597b8984f4..1b03ffef106cd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -537,7 +537,6 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; @@ -575,8 +574,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, continue; if (folio_test_large(folio) && max_nr != 1) nr = folio_pte_batch(folio, addr, pte, ptent, - max_nr, fpb_flags, - NULL, NULL, NULL); + max_nr, 0, NULL, NULL, NULL); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a 
VDSO. diff --git a/mm/mremap.c b/mm/mremap.c index ae842afdaea11..44f75657342e8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -138,7 +138,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int max_nr) { - const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct folio *folio; if (max_nr == 1) @@ -152,7 +151,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr if (!folio || !folio_test_large(folio)) return 1; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags, NULL, + return folio_pte_batch(folio, addr, ptep, pte, max_nr, 0, NULL, NULL, NULL); } diff --git a/mm/rmap.c b/mm/rmap.c index 5949fe733818f..4180c5ef953a8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1598,7 +1598,6 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio, struct page_vma_mapped_walk *pvmw, enum ttu_flags flags, pte_t pte) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; unsigned long end_addr, addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; unsigned int max_nr; @@ -1618,7 +1617,7 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio, if (pte_unused(pte)) return 1; - return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags, + return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, 0, NULL, NULL, NULL); } -- 2.43.0