From: David Hildenbrand <david@redhat.com> mainline inclusion from mainline-v6.17-rc1 commit dd80cfd4878bafc74f2a386c51b5398a12ffeb8c category: bugfix bugzilla: https://atomgit.com/src-openeuler/kernel/issues/14124 CVE: CVE-2026-31398 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dd80cfd4878bafc74f2a386c51b5398a12ffeb8c -------------------------------- Many users (including upcoming ones) don't really need the flags etc, and can live with the possible overhead of a function call. So let's provide a basic, non-inlined folio_pte_batch(), to avoid code bloat while still providing a variant that optimizes out all flag checks at runtime. folio_pte_batch_flags() will get inlined into folio_pte_batch(), optimizing out any conditionals that depend on input flags. folio_pte_batch() will behave like folio_pte_batch_flags() when no flags are specified. It's okay to add new users of folio_pte_batch_flags(), but using folio_pte_batch() if applicable is preferred. So, before this change, folio_pte_batch() was inlined into the C file optimized by propagating constants within the resulting object file. With this change, we now also have a folio_pte_batch() that is optimized by propagating all constants. But instead of having one instance per object file, we have a single shared one. In zap_present_ptes(), where we care about performance, the compiler already seem to generate a call to a common inlined folio_pte_batch() variant, shared with fork() code. So calling the new non-inlined variant should not make a difference. While at it, drop the "addr" parameter that is unused. Link: https://lkml.kernel.org/r/20250702104926.212243-4-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Suggested-by: Andrew Morton <akpm@linux-foundation.org> Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org 
Reviewed-by: Oscar Salvador <osalvador@suse.de> Reviewed-by: Zi Yan <ziy@nvidia.com> Reviewed-by: Dev Jain <dev.jain@arm.com> Cc: Alistair Popple <apopple@nvidia.com> Cc: Byungchul Park <byungchul@sk.com> Cc: Gregory Price <gourry@gourry.net> Cc: "Huang, Ying" <ying.huang@linux.alibaba.com> Cc: Jann Horn <jannh@google.com> Cc: Joshua Hahn <joshua.hahnjy@gmail.com> Cc: Lance Yang <lance.yang@linux.dev> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Mathew Brost <matthew.brost@intel.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Rapoport <rppt@kernel.org> Cc: Rakie Kim <rakie.kim@sk.com> Cc: Rik van Riel <riel@surriel.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Conflicts: mm/mlock.c mm/util.c [Context conflicts] Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- mm/internal.h | 11 ++++++++--- mm/madvise.c | 4 ++-- mm/memory.c | 8 +++----- mm/mempolicy.c | 3 +-- mm/mremap.c | 3 +-- mm/rmap.c | 3 +-- mm/util.c | 29 +++++++++++++++++++++++++++++ 7 files changed, 45 insertions(+), 16 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 58cf94a043445..93d87a2aa06c9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -120,9 +120,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) } /** - * folio_pte_batch - detect a PTE batch for a large folio + * folio_pte_batch_flags - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. - * @addr: The user virtual address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. @@ -145,9 +144,12 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * must be limited by the caller so scanning cannot exceed a single VMA and * a single page table. 
* + * This function will be inlined to optimize based on the input parameters; + * consider using folio_pte_batch() instead if applicable. + * * Return: the number of table entries in the batch. */ -static inline unsigned int folio_pte_batch(struct folio *folio, unsigned long addr, +static inline unsigned int folio_pte_batch_flags(struct folio *folio, pte_t *ptep, pte_t pte, unsigned int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { @@ -195,6 +197,9 @@ static inline unsigned int folio_pte_batch(struct folio *folio, unsigned long ad return min(nr, max_nr); } +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr); + /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta diff --git a/mm/madvise.c b/mm/madvise.c index 5bd1a9d3dedc6..3683002cf8448 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -360,8 +360,8 @@ static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, { int max_nr = (end - addr) / PAGE_SIZE; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, 0, NULL, - any_young, any_dirty); + return folio_pte_batch_flags(folio, ptep, pte, max_nr, 0, NULL, + any_young, any_dirty); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, diff --git a/mm/memory.c b/mm/memory.c index b17e9e82d379e..006904c4581d9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1007,8 +1007,8 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (vma_soft_dirty_enabled(src_vma)) flags |= FPB_RESPECT_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable, NULL, NULL); + nr = folio_pte_batch_flags(folio, src_pte, pte, max_nr, flags, + &any_writable, NULL, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1572,9 +1572,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, * by keeping the 
batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, 0, - NULL, NULL, NULL); - + nr = folio_pte_batch(folio, pte, ptent, max_nr); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1b03ffef106cd..59da1bae3a876 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -573,8 +573,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio) && max_nr != 1) - nr = folio_pte_batch(folio, addr, pte, ptent, - max_nr, 0, NULL, NULL, NULL); + nr = folio_pte_batch(folio, pte, ptent, max_nr); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. diff --git a/mm/mremap.c b/mm/mremap.c index 44f75657342e8..e0f5c5fbc49a7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -151,8 +151,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr if (!folio || !folio_test_large(folio)) return 1; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, 0, NULL, - NULL, NULL); + return folio_pte_batch(folio, ptep, pte, max_nr); } static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, diff --git a/mm/rmap.c b/mm/rmap.c index 4180c5ef953a8..91cb4af6ace0b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1617,8 +1617,7 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio, if (pte_unused(pte)) return 1; - return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, 0, - NULL, NULL, NULL); + return folio_pte_batch(folio, pvmw->pte, pte, max_nr); } /* diff --git a/mm/util.c b/mm/util.c index dd5695364bac5..c819d8258420b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1164,3 +1164,32 @@ void flush_dcache_folio(struct folio *folio) } EXPORT_SYMBOL(flush_dcache_folio); #endif + +#ifdef CONFIG_MMU +/** + * 
folio_pte_batch - detect a PTE batch for a large folio + * @folio: The large folio to detect a PTE batch for. + * @ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @max_nr: The maximum number of table entries to consider. + * + * This is a simplified variant of folio_pte_batch_flags(). + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio in a single VMA and a single page table. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, writable bit, dirty bit and soft-dirty bit. + * + * ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single VMA and + * a single page table. + * + * Return: the number of table entries in the batch. + */ +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr) +{ + return folio_pte_batch_flags(folio, ptep, pte, max_nr, 0, NULL, NULL, NULL); +} +#endif /* CONFIG_MMU */ -- 2.43.0