From: Lance Yang ioworker0@gmail.com
mainline inclusion from mainline-v6.10-rc1 commit 96ebdb032096f67e37b582cd2ea2558c402f878b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAIHQO
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
This commit adds the any_dirty pointer as an optional parameter to folio_pte_batch() function. By using both the any_young and any_dirty pointers, madvise_free can make smarter decisions about whether to clear the PTEs when marking large folios as lazyfree.
Link: https://lkml.kernel.org/r/20240418134435.6092-4-ioworker0@gmail.com Signed-off-by: Lance Yang ioworker0@gmail.com Suggested-by: David Hildenbrand david@redhat.com Acked-by: David Hildenbrand david@redhat.com Cc: Barry Song 21cnbao@gmail.com Cc: Jeff Xie xiehuan09@gmail.com Cc: Kefeng Wang wangkefeng.wang@huawei.com Cc: Michal Hocko mhocko@suse.com Cc: Minchan Kim minchan@kernel.org Cc: Muchun Song songmuchun@bytedance.com Cc: Peter Xu peterx@redhat.com Cc: Ryan Roberts ryan.roberts@arm.com Cc: Yang Shi shy828301@gmail.com Cc: Yin Fengwei fengwei.yin@intel.com Cc: Zach O'Keefe zokeefe@google.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com --- mm/internal.h | 12 ++++++++++-- mm/madvise.c | 19 ++++++++++++++----- mm/memory.c | 4 ++-- 3 files changed, 26 insertions(+), 9 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h index 7db2957ef3a0..6479c376ffdf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -131,6 +131,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * first one is writable. * @any_young: Optional pointer to indicate whether any entry except the * first one is young. + * @any_dirty: Optional pointer to indicate whether any entry except the + * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio. @@ -146,18 +148,20 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, - bool *any_writable, bool *any_young) + bool *any_writable, bool *any_young, bool *any_dirty) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; - bool writable, young; + bool writable, young, dirty; int nr;
if (any_writable) *any_writable = false; if (any_young) *any_young = false; + if (any_dirty) + *any_dirty = false;
VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); @@ -173,6 +177,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, writable = !!pte_write(pte); if (any_young) young = !!pte_young(pte); + if (any_dirty) + dirty = !!pte_dirty(pte); pte = __pte_batch_clear_ignored(pte, flags);
if (!pte_same(pte, expected_pte)) @@ -190,6 +196,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, *any_writable |= writable; if (any_young) *any_young |= young; + if (any_dirty) + *any_dirty |= dirty;
nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, nr); diff --git a/mm/madvise.c b/mm/madvise.c index 674f7402a8bb..05f01268ef57 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -365,6 +365,18 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) file_permission(vma->vm_file, MAY_WRITE) == 0; }
+static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, + struct folio *folio, pte_t *ptep, + pte_t pte, bool *any_young, + bool *any_dirty) +{ + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; + int max_nr = (end - addr) / PAGE_SIZE; + + return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, + any_young, any_dirty); +} + static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -500,13 +512,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, * next pte in the range. */ if (folio_test_large(folio)) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | - FPB_IGNORE_SOFT_DIRTY; - int max_nr = (end - addr) / PAGE_SIZE; bool any_young;
- nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, - fpb_flags, NULL, &any_young); + nr = madvise_folio_pte_batch(addr, end, folio, pte, + ptent, &any_young, NULL); if (any_young) ptent = pte_mkyoung(ptent);
diff --git a/mm/memory.c b/mm/memory.c index d2fe33e3352e..7fd1f71cebeb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -996,7 +996,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma flags |= FPB_IGNORE_SOFT_DIRTY;
nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable, NULL); + &any_writable, NULL, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1563,7 +1563,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, - NULL, NULL); + NULL, NULL, NULL);
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush,