From: Barry Song <v-songbaohua@oppo.com>
mainline inclusion
from mainline-v6.9-rc1
commit 2864f3d0f5831a50253befc5d4583868268b7153
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9OCYO
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
While doing MADV_PAGEOUT, the current code clears the PTE young bit so that vmscan will not see the young flag and the reclamation of the madvised folios can go ahead. We can achieve the same result by directly ignoring references instead, which lets us remove the TLB flush in madvise and the rmap overhead in vmscan.
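To make the mechanism concrete: the decision moves from madvise time (clearing young bits) to reclaim time (skipping the reference check). A condensed sketch of the relevant logic in shrink_folio_list(), simplified for illustration rather than quoted verbatim from mm/vmscan.c:

	enum folio_references references = FOLIOREF_RECLAIM;

	/* when ignore_references is true, skip the rmap walk entirely */
	if (!ignore_references)
		references = folio_check_references(folio, sc);

	switch (references) {
	case FOLIOREF_ACTIVATE:
		goto activate_locked;	/* recently referenced: keep it */
	case FOLIOREF_KEEP:
		goto keep_locked;
	case FOLIOREF_RECLAIM:
	case FOLIOREF_RECLAIM_CLEAN:
		break;			/* try to reclaim the folio */
	}

Skipping folio_check_references() here is what eliminates the per-folio rmap walk on the MADV_PAGEOUT path.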
Regarding the side effect: in the original code, if a parallel thread accesses the madvised memory while another thread is doing the madvise, the folios get a chance to be re-activated by vmscan (though the window is quite small, since the PTEs are checked immediately after their young bits are cleared). With this patch, such folios will still be reclaimed. But issuing PAGEOUT and accessing the same memory at the same time is self-defeating, DoS-like behaviour, so we probably do not need to care. Arguably, ignoring a new access that lands within such a small window is even the better choice.
For DAMON's DAMOS_PAGEOUT, which is based on physical address regions, we keep the behaviour as is, since a physical address might be mapped by multiple processes; MADV_PAGEOUT, based on virtual addresses, can afford to be much more aggressive about reclamation. To leave paddr's DAMOS_PAGEOUT untouched, we simply pass ignore_references as false to reclaim_pages().
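Condensed, the call sites after this patch differ only in the new flag (see the hunks below; the flag is threaded through reclaim_folio_list() into shrink_folio_list()):

	/* VA-based MADV_PAGEOUT: reclaim even if recently referenced */
	reclaim_pages(&folio_list, true);

	/* PA-based DAMOS_PAGEOUT and etmem: keep honouring references */
	reclaim_pages(&folio_list, false);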
The microbenchmark below shows about a 6% reduction in MADV_PAGEOUT latency:
#include <sys/mman.h>

#define PGSIZE 4096

int main(void)
{
	int i;
#define SIZE (512 * 1024 * 1024)
	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* touch one long per page to populate 512MB of anon memory */
	for (i = 0; i < SIZE / sizeof(long); i += PGSIZE / sizeof(long))
		p[i] = 0x11;

	madvise((void *)p, SIZE, MADV_PAGEOUT);
	return 0;
}
w/o patch                          w/ patch
root@10:~# time ./a.out            root@10:~# time ./a.out
real    0m49.634s                  real    0m46.334s
user    0m0.637s                   user    0m0.648s
sys     0m47.434s                  sys     0m44.265s
Link: https://lkml.kernel.org/r/20240226005739.24350-1-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Conflicts:
	mm/vmscan.c
	mm/etmem.c
	include/linux/swap.h
	fs/proc/etmem_swap.c
[ Adapt reclaim_pages() and reclaim_folio_list() used in etmem. ]
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 fs/proc/etmem_swap.c |  2 +-
 include/linux/swap.h |  5 +++--
 mm/damon/paddr.c     |  2 +-
 mm/etmem.c           |  2 +-
 mm/madvise.c         |  8 ++++----
 mm/vmscan.c          | 12 +++++++-----
 6 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c
index b4a35da9ac3d..20ac09d67d33 100644
--- a/fs/proc/etmem_swap.c
+++ b/fs/proc/etmem_swap.c
@@ -72,7 +72,7 @@ static ssize_t swap_pages_write(struct file *file, const char __user *buf,
 	}
 
 	if (!list_empty(&pagelist))
-		reclaim_pages(&pagelist);
+		reclaim_pages(&pagelist, false);
 
 	ret = count;
 	kfree(data_ptr_res);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e818f53cbc31..cbcb767b79e3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -420,8 +420,9 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern unsigned int reclaim_folio_list(struct list_head *folio_list,
-					struct pglist_data *pgdat);
-extern unsigned long reclaim_pages(struct list_head *folio_list);
+					struct pglist_data *pgdat,
+					bool ignore_references);
+extern unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references);
 
 #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
 #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 909db25efb35..21d31580d1a4 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -250,7 +250,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
 put_folio:
 		folio_put(folio);
 	}
-	applied = reclaim_pages(&folio_list);
+	applied = reclaim_pages(&folio_list, false);
 	cond_resched();
 	return applied * PAGE_SIZE;
 }
diff --git a/mm/etmem.c b/mm/etmem.c
index 5accf8e0bbdf..a1b2db374fdb 100644
--- a/mm/etmem.c
+++ b/mm/etmem.c
@@ -248,7 +248,7 @@ int do_swapcache_reclaim(unsigned long *swapcache_watermark,
 	/* Reclaim all the swapcache we have scanned */
 	for_each_node_state(nid, N_MEMORY) {
 		cond_resched();
-		reclaim_folio_list(&swapcache_list[nid], NODE_DATA(nid));
+		reclaim_folio_list(&swapcache_list[nid], NODE_DATA(nid), false);
 	}
 
 	/* Put pack all the pages that are not reclaimed by shrink_folio_list */
diff --git a/mm/madvise.c b/mm/madvise.c
index a3c509cf2bc9..2b821024a38e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -429,7 +429,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			return 0;
 		}
 
-		if (pmd_young(orig_pmd)) {
+		if (!pageout && pmd_young(orig_pmd)) {
 			pmdp_invalidate(vma, addr, pmd);
 			orig_pmd = pmd_mkold(orig_pmd);
 
@@ -453,7 +453,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 huge_unlock:
 		spin_unlock(ptl);
 		if (pageout)
-			reclaim_pages(&folio_list);
+			reclaim_pages(&folio_list, true);
 		return 0;
 	}
 
@@ -522,7 +522,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 
 		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 
-		if (pte_young(ptent)) {
+		if (!pageout && pte_young(ptent)) {
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			ptent = pte_mkold(ptent);
@@ -556,7 +556,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		pte_unmap_unlock(start_pte, ptl);
 	}
 	if (pageout)
-		reclaim_pages(&folio_list);
+		reclaim_pages(&folio_list, true);
 	cond_resched();
 
 	return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95a845905624..e1aa8bd796e9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2792,7 +2792,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 }
 
 unsigned int reclaim_folio_list(struct list_head *folio_list,
-				struct pglist_data *pgdat)
+				struct pglist_data *pgdat,
+				bool ignore_references)
 {
 	struct reclaim_stat dummy_stat;
 	unsigned int nr_reclaimed;
@@ -2805,7 +2806,7 @@ unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};
 
-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
@@ -2815,7 +2816,7 @@ unsigned int reclaim_folio_list(struct list_head *folio_list,
 	return nr_reclaimed;
 }
 
-unsigned long reclaim_pages(struct list_head *folio_list)
+unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references)
 {
 	int nid;
 	unsigned int nr_reclaimed = 0;
@@ -2837,11 +2838,12 @@ unsigned long reclaim_pages(struct list_head *folio_list)
 			continue;
 		}
 
-		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
+						   ignore_references);
 		nid = folio_nid(lru_to_folio(folio_list));
 	} while (!list_empty(folio_list));
 
-	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references);
 
 	memalloc_noreclaim_restore(noreclaim_flag);