hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4GC1 -------------------------------- During testing, memory allocation with node_reclaim_mode enabled became very slow when a large allocation targeted a node whose memory was mostly occupied by clean page cache. Node reclaim behaves like direct reclaim and only recycles a small batch at a time, so repeated allocations can spend excessive time in synchronous reclaim. Add a RECLAIM_KSWAPD mode bit so node reclaim can optionally wake kswapd and allow background reclaim to bring the node toward high watermarks. Pass the allocation zone into node_reclaim() so wakeup_kswapd() can use the same zone context as the allocator. Document the new mode bit: echo 8 > /proc/sys/vm/zone_reclaim_mode The bit can be combined with other node reclaim modes; for example, 12 represents RECLAIM_UNMAP | RECLAIM_KSWAPD. Signed-off-by: Ze Zuo <zuoze1@huawei.com> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- Documentation/sysctl/vm.txt | 1 + mm/internal.h | 5 +++-- mm/page_alloc.c | 3 ++- mm/vmscan.c | 8 +++++++- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index b3808f1236f8..ea084a586b31 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -900,6 +900,7 @@ This is value ORed together of 1 = Zone reclaim on 2 = Zone reclaim writes dirty pages out 4 = Zone reclaim swaps pages +8 = Zone reclaim wakes up kswapd zone_reclaim_mode is disabled by default. 
For file servers or workloads that benefit from having their data cached, zone_reclaim_mode should be diff --git a/mm/internal.h b/mm/internal.h index e50363ddb416..84074fe1d20e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -515,10 +515,11 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA -extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order, + struct zone *zone); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, - unsigned int order) + unsigned int order, struct zone *zone) { return NODE_RECLAIM_NOSCAN; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2577d3048d9..cd019f06d747 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3593,7 +3593,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; - ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order, + zone); switch (ret) { case NODE_RECLAIM_NOSCAN: /* did not scan */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 343244e1fe10..b3042f4d529a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4223,6 +4223,7 @@ int node_reclaim_mode __read_mostly; #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ +#define RECLAIM_KSWAPD (1<<3) /* Wake kswapd during reclaim */ /* * Priority for NODE_RECLAIM. 
This determines the fraction of pages @@ -4338,7 +4339,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in return sc.nr_reclaimed >= nr_pages; } -int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order, + struct zone *zone) { int ret; @@ -4371,6 +4373,10 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; + if ((node_reclaim_mode & RECLAIM_KSWAPD) && + (gfp_mask & __GFP_KSWAPD_RECLAIM)) + wakeup_kswapd(zone, gfp_mask, order, gfp_zone(gfp_mask)); + if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; -- 2.25.1