kylin inclusion
category: feature
bugfix: https://gitee.com/openeuler-competition/summer-2021/issues/I3EIMT?from=proje...
CVE: NA

--------------------------------------------------
In some atomic contexts, such as interrupt context, sleeping is not allowed. Memory allocations made there therefore cannot enter the direct reclaim path and may not even wake the kswapd thread. For example, in the softirq handler that receives network packets, the page cache can occupy so much memory that too little free memory remains, the allocation for an incoming packet fails, and the packet is dropped. This is the problem the page cache limit is meant to solve.
The page cache limit works by checking, each time a page is added to the page cache (that is, whenever add_to_page_cache_lru() is called), whether the page cache exceeds the upper limit configured via /proc/sys/vm/pagecache_limit_ratio, and shrinking the page cache if it does.
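The hook itself is small; the mm/filemap.c hunk from this patch is reproduced below as a commented excerpt (kernel context, so it is shown as an excerpt rather than a standalone program):

    /* mm/filemap.c as modified by this patch (excerpt) */
    int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                              pgoff_t offset, gfp_t gfp_mask)
    {
            void *shadow = NULL;
            int ret;

            /* New: if the page cache is over the configured limit, shrink it first. */
            if (unlikely(vm_pagecache_limit_pages) && pagecache_over_limit() > 0)
                    shrink_page_cache(gfp_mask, page);

            __SetPageLocked(page);
            /* ... the original insertion path continues unchanged ... */
    }

shrink_page_cache() then either reclaims synchronously in the allocating context or wakes the kpclimitd kernel thread, depending on pagecache_limit_async (described below).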
Three /proc interfaces are provided:
/proc/sys/vm/pagecache_limit_ratio: writing x with 0 < x < 100 (e.g. echo x > /proc/sys/vm/pagecache_limit_ratio) enables the page cache limit; x is the maximum percentage of total system memory the page cache may occupy (the resulting limit arithmetic is sketched below, after the three interface descriptions).
/proc/sys/vm/pagecache_limit_ignore_dirty: whether to ignore dirty pages when calculating the memory occupied by the page cache. The default is 1 (ignore them), because reclaiming dirty pages is time-consuming.
/proc/sys/vm/pagecache_limit_async: 1 selects asynchronous reclaim by a dedicated kernel thread, 0 selects synchronous reclaim in the allocating context.
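To make the ratio arithmetic concrete, here is a minimal userspace sketch of the calculation performed by setup_pagecache_limit() and pagecache_over_limit() in this patch. The RAM size and page cache occupancy are invented sample values, and the sysctl override and the 100% cap on the reclaim ratio are ignored for brevity:

    #include <stdio.h>

    #define ADDITIONAL_RECLAIM_RATIO 2   /* reclaim 2% of RAM more than the limit */

    int main(void)
    {
            unsigned long totalram_pages = 4UL * 1024 * 1024;  /* 16 GiB of 4 KiB pages */
            unsigned int limit_ratio = 30;   /* as if: echo 30 > /proc/sys/vm/pagecache_limit_ratio */
            unsigned int reclaim_ratio = limit_ratio + ADDITIONAL_RECLAIM_RATIO;

            unsigned long limit_pages = totalram_pages * limit_ratio / 100;
            unsigned long reclaim_pages = totalram_pages * reclaim_ratio / 100;

            /* Pretend the page cache currently holds 35% of RAM. */
            unsigned long pgcache_pages = totalram_pages * 35 / 100;

            if (pgcache_pages > limit_pages) {
                    unsigned long overlimit = pgcache_pages - limit_pages;
                    unsigned long delta = reclaim_pages - limit_pages;
                    /* mirror pagecache_over_limit(): min(delta, limit) + overlimit */
                    unsigned long should_reclaim =
                            (delta < limit_pages ? delta : limit_pages) + overlimit;

                    printf("over limit by %lu pages, reclaim target %lu pages\n",
                           overlimit, should_reclaim);
            }
            return 0;
    }

Reclaiming the computed target brings the page cache roughly ADDITIONAL_RECLAIM_RATIO percent of RAM below the limit, so the limit is not hit again immediately after reclaim.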
Signed-off-by: wenzhiwei <wenzhiwei@kylinos.cn>
---
 include/linux/memcontrol.h    |   7 +-
 include/linux/mmzone.h        |   7 +
 include/linux/swap.h          |  15 +
 include/trace/events/vmscan.h |  28 +-
 kernel/sysctl.c               | 139 ++++++++
 mm/filemap.c                  |   2 +
 mm/page_alloc.c               |  52 +++
 mm/vmscan.c                   | 650 ++++++++++++++++++++++++++++++++--
 mm/workingset.c               |   1 +
 9 files changed, 862 insertions(+), 39 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 71a5b589bddb..731a2cd2ea86 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -50,6 +50,7 @@ enum memcg_memory_event {
struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; + int priority; unsigned int generation; };
@@ -492,8 +493,7 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) * @node combination. This can be the node lruvec, if the memory * controller is disabled. */ -static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, - struct pglist_data *pgdat) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; @@ -1066,8 +1066,7 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) { }
-static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, - struct pglist_data *pgdat) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { return &pgdat->__lruvec; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 82fceef88448..d3c5258e5d0d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -445,6 +445,13 @@ struct zone { * changes. */ long lowmem_reserve[MAX_NR_ZONES]; + /* + * This atomic counter is set when there is pagecache limit + * reclaim going on on this particular zone. Other potential + * reclaiers should back off to prevent from heavy lru_lock + * bouncing. + */ + atomic_t pagecache_reclaim;
#ifdef CONFIG_NEED_MULTIPLE_NODES int node; diff --git a/include/linux/swap.h b/include/linux/swap.h index 9b708c0288bc..b9329e575836 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -377,6 +377,21 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; + +#define ADDITIONAL_RECLAIM_RATIO 2 +extern unsigned long pagecache_over_limit(void); +extern void shrink_page_cache(gfp_t mask, struct page *page); +extern unsigned long vm_pagecache_limit_pages; +extern unsigned long vm_pagecache_limit_reclaim_pages; +extern int unsigned vm_pagecache_limit_ratio; +extern int vm_pagecache_limit_reclaim_ratio; +extern unsigned int vm_pagecache_ignore_dirty; +extern unsigned long pagecache_over_limit(void); +extern unsigned int vm_pagecache_limit_async; +extern int kpagecache_limitd_run(void); +extern void kpagecache_limitd_stop(void); +extern unsigned int vm_pagecache_ignore_slab; + extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long reclaim_pages(struct list_head *page_list); diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 2070df64958e..3bfe47a85f6f 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -183,48 +183,48 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re #endif /* CONFIG_MEMCG */
TRACE_EVENT(mm_shrink_slab_start, - TP_PROTO(struct shrinker *shr, struct shrink_control *sc, - long nr_objects_to_shrink, unsigned long cache_items, - unsigned long long delta, unsigned long total_scan, - int priority), - - TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, - priority), + TP_PROTO(struct shrinker *shr, struct shrink_control *sc, + long nr_objects_to_shrink,unsigned long pgs_scanned, + unsigned long lru_pgs, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan),
+ TP_ARGS(shr, sc, nr_objects_to_shrink,pgs_scanned, lru_pgs, cache_items, delta, total_scan), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) __field(int, nid) __field(long, nr_objects_to_shrink) __field(gfp_t, gfp_flags) + __field(unsigned long, pgs_scanned) + __field(unsigned long, lru_pgs) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) - __field(int, priority) ),
TP_fast_assign( - __entry->shr = shr; + __entry->shr = shr; __entry->shrink = shr->scan_objects; __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = sc->gfp_mask; + __entry->pgs_scanned = pgs_scanned; + __entry->lru_pgs = lru_pgs; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; - __entry->priority = priority; ), - - TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", +TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", __entry->shrink, __entry->shr, __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), + __entry->pgs_scanned, + __entry->lru_pgs, __entry->cache_items, __entry->delta, - __entry->total_scan, - __entry->priority) + __entry->total_scan) );
TRACE_EVENT(mm_shrink_slab_end, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c7ca58de3b1b..4ef436cdfdad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -111,6 +111,7 @@ static int sixty = 60; #endif
+static int zero; static int __maybe_unused neg_one = -1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; @@ -648,6 +649,68 @@ static int do_proc_dointvec(struct ctl_table *table, int write, return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } +int setup_pagecache_limit(void) +{ + /* reclaim $ADDITIONAL_RECLAIM_PAGES more than limit. */ + vm_pagecache_limit_reclaim_ratio = vm_pagecache_limit_ratio + ADDITIONAL_RECLAIM_RATIO; + + if (vm_pagecache_limit_reclaim_ratio > 100) + vm_pagecache_limit_reclaim_ratio = 100; + if (vm_pagecache_limit_ratio == 0) + vm_pagecache_limit_reclaim_ratio = 0; + + vm_pagecache_limit_pages = vm_pagecache_limit_ratio * totalram_pages() / 100; + vm_pagecache_limit_reclaim_pages = vm_pagecache_limit_reclaim_ratio * totalram_pages() / 100; + return 0; +} + +static int pc_limit_proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && !ret) + ret = setup_pagecache_limit(); + return ret; +} +static int pc_reclaim_limit_proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int pre_reclaim_ratio = vm_pagecache_limit_reclaim_ratio; + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (write && vm_pagecache_limit_ratio == 0) + return -EINVAL; + + if (write && !ret) { + if (vm_pagecache_limit_reclaim_ratio - vm_pagecache_limit_ratio < ADDITIONAL_RECLAIM_RATIO) { + vm_pagecache_limit_reclaim_ratio = pre_reclaim_ratio; + return -EINVAL; + } + vm_pagecache_limit_reclaim_pages = vm_pagecache_limit_reclaim_ratio * totalram_pages() / 100; + } + return ret; +} +static int pc_limit_async_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (write && vm_pagecache_limit_ratio == 0) + return -EINVAL; + + if (write && !ret) { + if (vm_pagecache_limit_async > 0) { + if (kpagecache_limitd_run()) { + vm_pagecache_limit_async = 0; + return -EINVAL; + } + } + else { + kpagecache_limitd_stop(); + } + } + return ret; +}
static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, @@ -2711,6 +2774,14 @@ static struct ctl_table kern_table[] = { }, { } }; +static int pc_limit_proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + +static int pc_reclaim_limit_proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + +static int pc_limit_async_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos);
static struct ctl_table vm_table[] = { { @@ -2833,6 +2904,74 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two_hundred, }, + { + .procname = "pagecache_limit_ratio", + .data = &vm_pagecache_limit_ratio, + .maxlen = sizeof(vm_pagecache_limit_ratio), + .mode = 0644, + .proc_handler = &pc_limit_proc_dointvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "pagecache_limit_reclaim_ratio", + .data = &vm_pagecache_limit_reclaim_ratio, + .maxlen = sizeof(vm_pagecache_limit_reclaim_ratio), + .mode = 0644, + .proc_handler = &pc_reclaim_limit_proc_dointvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "pagecache_limit_ignore_dirty", + .data = &vm_pagecache_ignore_dirty, + .maxlen = sizeof(vm_pagecache_ignore_dirty), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_SHRINK_PAGECACHE + { + .procname = "cache_reclaim_s", + .data = &vm_cache_reclaim_s, + .maxlen = sizeof(vm_cache_reclaim_s), + .mode = 0644, + .proc_handler = cache_reclaim_sysctl_handler, + .extra1 = &vm_cache_reclaim_s_min, + .extra2 = &vm_cache_reclaim_s_max, + }, + { + .procname = "cache_reclaim_weight", + .data = &vm_cache_reclaim_weight, + .maxlen = sizeof(vm_cache_reclaim_weight), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &vm_cache_reclaim_weight_min, + .extra2 = &vm_cache_reclaim_weight_max, + }, + { + .procname = "cache_reclaim_enable", + .data = &vm_cache_reclaim_enable, + .maxlen = sizeof(vm_cache_reclaim_enable), + .mode = 0644, + .proc_handler = cache_reclaim_enable_handler, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "pagecache_limit_async", + .data = &vm_pagecache_limit_async, + .maxlen = sizeof(vm_pagecache_limit_async), + .mode = 0644, + .proc_handler = &pc_limit_async_handler, + }, + { + .procname = "pagecache_limit_ignore_slab", + .data = &vm_pagecache_ignore_slab, + .maxlen = sizeof(vm_pagecache_ignore_slab), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", diff --git a/mm/filemap.c b/mm/filemap.c index ef611eb34aa7..808d4f02b5a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -922,6 +922,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, { void *shadow = NULL; int ret; + if (unlikely(vm_pagecache_limit_pages) && pagecache_over_limit() > 0) + shrink_page_cache(gfp_mask, page);
__SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 71afec177233..08feba42d3d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8933,6 +8933,58 @@ void zone_pcp_reset(struct zone *zone) local_irq_restore(flags); }
+/* Returns a number that's positive if the pagecache is above + * the set limit*/ +unsigned long pagecache_over_limit() +{ + unsigned long should_reclaim_pages = 0; + unsigned long overlimit_pages = 0; + unsigned long delta_pages = 0; + unsigned long pgcache_lru_pages = 0; + /* We only want to limit unmapped and non-shmem page cache pages; + * normally all shmem pages are mapped as well*/ + unsigned long pgcache_pages = global_node_page_state(NR_FILE_PAGES) + - max_t(unsigned long, + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM)); + /* We certainly can't free more than what's on the LRU lists + * minus the dirty ones*/ + if (vm_pagecache_ignore_slab) + pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); + else + pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + + global_node_page_state(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state(NR_SLAB_UNRECLAIMABLE_B); + + if (vm_pagecache_ignore_dirty != 0) + pgcache_lru_pages -= global_node_page_state(NR_FILE_DIRTY) / vm_pagecache_ignore_dirty; + /* Paranoia */ + if (unlikely(pgcache_lru_pages > LONG_MAX)) + return 0; + + /* Limit it to 94% of LRU (not all there might be unmapped) */ + pgcache_lru_pages -= pgcache_lru_pages/16; + if (vm_pagecache_ignore_slab) + pgcache_pages = min_t(unsigned long, pgcache_pages, pgcache_lru_pages); + else + pgcache_pages = pgcache_lru_pages; + + /* + *delta_pages: we should reclaim at least 2% more pages than overlimit_page, values get from + * /proc/vm/pagecache_limit_reclaim_pages + *should_reclaim_pages: the real pages we will reclaim, but it should less than pgcache_pages; + */ + if (pgcache_pages > vm_pagecache_limit_pages) { + overlimit_pages = pgcache_pages - vm_pagecache_limit_pages; + delta_pages = vm_pagecache_limit_reclaim_pages - vm_pagecache_limit_pages; + should_reclaim_pages = min_t(unsigned long, delta_pages, vm_pagecache_limit_pages) + overlimit_pages; + return should_reclaim_pages; + } + return 0; +} + #ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be in a single zone, must not contain holes, diff --git a/mm/vmscan.c b/mm/vmscan.c index 23f8a5242de7..1fe2c74a1c10 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -175,6 +175,39 @@ struct scan_control { */ int vm_swappiness = 60;
+/* + * The total number of pages which are beyond the high watermark within all + * zones. + */ +unsigned long vm_pagecache_limit_pages __read_mostly = 0; +unsigned long vm_pagecache_limit_reclaim_pages = 0; +unsigned int vm_pagecache_limit_ratio __read_mostly = 0; +int vm_pagecache_limit_reclaim_ratio __read_mostly = 0; +unsigned int vm_pagecache_ignore_dirty __read_mostly = 1; + +unsigned long vm_total_pages; +static struct task_struct *kpclimitd = NULL; +unsigned int vm_pagecache_ignore_slab __read_mostly = 1; +unsigned int vm_pagecache_limit_async __read_mostly = 0; + +#ifdef CONFIG_SHRINK_PAGECACHE +unsigned long vm_cache_limit_ratio; +unsigned long vm_cache_limit_ratio_min; +unsigned long vm_cache_limit_ratio_max; +unsigned long vm_cache_limit_mbytes __read_mostly; +unsigned long vm_cache_limit_mbytes_min; +unsigned long vm_cache_limit_mbytes_max; +static bool kpclimitd_context = false; +int vm_cache_reclaim_s __read_mostly; +int vm_cache_reclaim_s_min; +int vm_cache_reclaim_s_max; +int vm_cache_reclaim_weight __read_mostly; +int vm_cache_reclaim_weight_min; +int vm_cache_reclaim_weight_max; +int vm_cache_reclaim_enable; +static DEFINE_PER_CPU(struct delayed_work, vmscan_work); +#endif + static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { @@ -187,10 +220,12 @@ static void set_task_reclaim_state(struct task_struct *task, task->reclaim_state = rs; }
+static bool kpclimitd_context = false; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG +static DEFINE_IDR(shrinker_idr); static int shrinker_nr_max;
/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ @@ -346,7 +381,6 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) } }
-static DEFINE_IDR(shrinker_idr);
static int prealloc_memcg_shrinker(struct shrinker *shrinker) { @@ -646,7 +680,9 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, int priority) + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; @@ -670,8 +706,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = xchg_nr_deferred(shrinker, shrinkctl);
if (shrinker->seeks) { - delta = freeable >> priority; - delta *= 4; + //delta = freeable >> priority; + //delta *= 4; + delta = (4 * nr_scanned) / shrinker->seeks; + delta *= freeable; do_div(delta, shrinker->seeks); } else { /* @@ -682,12 +720,12 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable / 2; }
- total_scan = nr >> priority; + total_scan = nr; total_scan += delta; total_scan = min(total_scan, (2 * freeable));
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); + freeable, delta, total_scan, nr_scanned,nr_eligible);
/* * Normally, we should not scan less than batch_size objects in one @@ -744,7 +782,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
#ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) + struct mem_cgroup *memcg, unsigned long nr_scanned, unsigned long nr_eligible) { struct shrinker_info *info; unsigned long ret, freed = 0; @@ -780,7 +818,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, !(shrinker->flags & SHRINKER_NONSLAB)) continue;
- ret = do_shrink_slab(&sc, shrinker, priority); + ret = do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); if (ret == SHRINK_EMPTY) { clear_bit(i, info->map); /* @@ -799,7 +837,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * set_bit() do_shrink_slab() */ smp_mb__after_atomic(); - ret = do_shrink_slab(&sc, shrinker, priority); + ret = do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); if (ret == SHRINK_EMPTY) ret = 0; else @@ -846,7 +884,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, */ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, - int priority) + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long ret, freed = 0; struct shrinker *shrinker; @@ -859,7 +898,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, * oom. */ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) - return shrink_slab_memcg(gfp_mask, nid, memcg, priority); + return 0; + // return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem)) goto out; @@ -871,7 +911,14 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, };
- ret = do_shrink_slab(&sc, shrinker, priority); + if (memcg_kmem_enabled() && + !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) + continue; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + sc.nid = 0; + + ret = do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; @@ -905,7 +952,7 @@ void drop_slab_node(int nid) freed = 0; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); + freed += shrink_slab(GFP_KERNEL, nid, memcg, 1000,1000); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); } while (freed > 10); } @@ -2369,7 +2416,7 @@ unsigned long reclaim_pages(struct list_head *page_list) EXPORT_SYMBOL_GPL(reclaim_pages);
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct scan_control *sc) + struct lruvec *lruvec, struct mem_cgroup *memcg, struct scan_control *sc) { if (is_active_lru(lru)) { if (sc->may_deactivate & (1 << is_file_lru(lru))) @@ -2683,7 +2730,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, sc); + lruvec, NULL, sc); } }
@@ -2836,7 +2883,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; unsigned long scanned; - + unsigned long lru_pages; /* * This loop can become CPU-bound when target memcgs * aren't eligible for reclaim - either because they @@ -2873,7 +2920,8 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) shrink_lruvec(lruvec, sc);
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); + sc->nr_scanned - scanned, + lru_pages);
/* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -3202,6 +3250,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { + struct mem_cgroup *memcg; struct lruvec *target_lruvec; unsigned long refaults;
@@ -3273,8 +3322,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (cgroup_reclaim(sc)) { struct lruvec *lruvec;
- lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, - zone->zone_pgdat); + lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); clear_bit(LRUVEC_CONGESTED, &lruvec->flags); } } @@ -3745,6 +3793,8 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, return sc->nr_scanned >= sc->nr_to_reclaim; }
+static void __shrink_page_cache(gfp_t mask); + /* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is @@ -4208,6 +4258,27 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, wake_up_interruptible(&pgdat->kswapd_wait); }
+/* + * The reclaimable count would be mostly accurate. + * The less reclaimable pages may be + * - mlocked pages, which will be moved to unevictable list when encountered + * - mapped pages, which may require several travels to be reclaimed + * - dirty pages, which is not "instantly" reclaimable + */ + +static unsigned long global_reclaimable_pages(void) +{ + int nr; + + nr = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON); + return nr; +} + #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -4246,6 +4317,498 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ +/* + * Returns non-zero if the lock has been acquired, false if somebody + * else is holding the lock. + */ +static int pagecache_reclaim_lock_zone(struct zone *zone) +{ + return atomic_add_unless(&zone->pagecache_reclaim, 1, 1); +} + +static void pagecache_reclaim_unlock_zone(struct zone *zone) +{ + BUG_ON(atomic_dec_return(&zone->pagecache_reclaim)); +} + +/* + * Potential page cache reclaimers who are not able to take + * reclaim lock on any zone are sleeping on this waitqueue. + * So this is basically a congestion wait queue for them. + */ +DECLARE_WAIT_QUEUE_HEAD(pagecache_reclaim_wq); +DECLARE_WAIT_QUEUE_HEAD(kpagecache_limitd_wq); + +/* + * Similar to shrink_zone but it has a different consumer - pagecache limit + * so we cannot reuse the original function - and we do not want to clobber + * that code path so we have to live with this code duplication. + * + * In short this simply scans through the given lru for all cgroups for the + * give zone. + * + * returns true if we managed to cumulatively reclaim (via nr_reclaimed) + * the given nr_to_reclaim pages, false otherwise. The caller knows that + * it doesn't have to touch other zones if the target was hit already. + * + * DO NOT USE OUTSIDE of shrink_all_zones unless you have a really really + * really good reason. + */ + +static bool shrink_zone_per_memcg(struct zone *zone, enum lru_list lru, + unsigned long nr_to_scan, unsigned long nr_to_reclaim, + unsigned long *nr_reclaimed, struct scan_control *sc) +{ + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup *memcg; + struct mem_cgroup_reclaim_cookie reclaim = { + .pgdat = zone->zone_pgdat, + .priority = sc->priority, + }; + + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, zone->zone_pgdat); + *nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, memcg, sc); + if (*nr_reclaimed >= nr_to_reclaim) { + mem_cgroup_iter_break(root, memcg); + return true; + } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); + + return false; +} +/* + * Tries to reclaim 'nr_pages' pages from LRU lists system-wide, for given + * pass. + * + * For pass > 3 we also try to shrink the LRU lists that contain a few pages + * + * Returns the number of scanned zones. 
+ */ +static int shrink_all_zones(unsigned long nr_pages, int pass, + struct scan_control *sc) +{ + struct zone *zone; + unsigned long nr_reclaimed = 0; + unsigned int nr_locked_zones = 0; + DEFINE_WAIT(wait); + + prepare_to_wait(&pagecache_reclaim_wq, &wait, TASK_INTERRUPTIBLE); + + for_each_populated_zone(zone) { + enum lru_list lru; + + /* + * Back off if somebody is already reclaiming this zone + * for the pagecache reclaim. + */ + if (!pagecache_reclaim_lock_zone(zone)) + continue; + + + /* + * This reclaimer might scan a zone so it will never + * sleep on pagecache_reclaim_wq + */ + finish_wait(&pagecache_reclaim_wq, &wait); + nr_locked_zones++; + + for_each_evictable_lru(lru) { + enum zone_stat_item ls = NR_ZONE_LRU_BASE + lru; + unsigned long lru_pages = zone_page_state(zone, ls); + + /* For pass = 0, we don't shrink the active list */ + if (pass == 0 && (lru == LRU_ACTIVE_ANON || + lru == LRU_ACTIVE_FILE)) + continue; + + /* Original code relied on nr_saved_scan which is no + * longer present so we are just considering LRU pages. + * This means that the zone has to have quite large + * LRU list for default priority and minimum nr_pages + * size (8*SWAP_CLUSTER_MAX). In the end we will tend + * to reclaim more from large zones wrt. small. + * This should be OK because shrink_page_cache is called + * when we are getting to short memory condition so + * LRUs tend to be large. + */ + if (((lru_pages >> sc->priority) + 1) >= nr_pages || pass >= 3) { + unsigned long nr_to_scan; + + nr_to_scan = min(nr_pages, lru_pages); + + /* + * A bit of a hack but the code has always been + * updating sc->nr_reclaimed once per shrink_all_zones + * rather than accumulating it for all calls to shrink + * lru. This costs us an additional argument to + * shrink_zone_per_memcg but well... + * + * Let's stick with this for bug-to-bug compatibility + */ + while (nr_to_scan > 0) { + /* shrink_list takes lru_lock with IRQ off so we + * should be careful about really huge nr_to_scan + */ + unsigned long batch = min_t(unsigned long, nr_to_scan, SWAP_CLUSTER_MAX); + + if (shrink_zone_per_memcg(zone, lru, + batch, nr_pages, &nr_reclaimed, sc)) { + pagecache_reclaim_unlock_zone(zone); + goto out_wakeup; + } + nr_to_scan -= batch; + } + } + } + pagecache_reclaim_unlock_zone(zone); + } + /* + * We have to go to sleep because all the zones are already reclaimed. + * One of the reclaimer will wake us up or __shrink_page_cache will + * do it if there is nothing to be done. + */ + if (!nr_locked_zones) { + if (!kpclimitd_context) + schedule(); + finish_wait(&pagecache_reclaim_wq, &wait); + goto out; + } + +out_wakeup: + wake_up_interruptible(&pagecache_reclaim_wq); + sc->nr_reclaimed += nr_reclaimed; +out: + return nr_locked_zones; +} + +/* + * Function to shrink the page cache + * + * This function calculates the number of pages (nr_pages) the page + * cache is over its limit and shrinks the page cache accordingly. + * + * The maximum number of pages, the page cache shrinks in one call of + * this function is limited to SWAP_CLUSTER_MAX pages. Therefore it may + * require a number of calls to actually reach the vm_pagecache_limit_kb. + * + * This function is similar to shrink_all_memory, except that it may never + * swap out mapped pages and only does four passes. 
+ */ +static void __shrink_page_cache(gfp_t mask) +{ + unsigned long ret = 0; + int pass = 0; + struct reclaim_state reclaim_state; + struct scan_control sc = { + .gfp_mask = mask, + .may_swap = 0, + .may_unmap = 0, + .may_writepage = 0, + .target_mem_cgroup = NULL, + .reclaim_idx = MAX_NR_ZONES, + }; + struct reclaim_state *old_rs = current->reclaim_state; + long nr_pages; + + /* We might sleep during direct reclaim so make atomic context + * is certainly a bug. + */ + BUG_ON(!(mask & __GFP_RECLAIM)); + +retry: + /* How many pages are we over the limit?*/ + nr_pages = pagecache_over_limit(); + + /* + * Return early if there's no work to do. + * Wake up reclaimers that couldn't scan any zone due to congestion. + * There is apparently nothing to do so they do not have to sleep. + * This makes sure that no sleeping reclaimer will stay behind. + * Allow breaching the limit if the task is on the way out. + */ + if (nr_pages <= 0 || fatal_signal_pending(current)) { + wake_up_interruptible(&pagecache_reclaim_wq); + goto out; + } + + /* But do a few at least */ + nr_pages = max_t(unsigned long, nr_pages, 8*SWAP_CLUSTER_MAX); + + current->reclaim_state = &reclaim_state; + + /* + * Shrink the LRU in 4 passes: + * 0 = Reclaim from inactive_list only (fast) + * 1 = Reclaim from active list but don't reclaim mapped and dirtied (not that fast) + * 2 = Reclaim from active list but don't reclaim mapped (2nd pass) + * it may reclaim dirtied if vm_pagecache_ignore_dirty = 0 + * 3 = same as pass 2, but it will reclaim some few pages , detail in shrink_all_zones + */ + for (; pass <= 3; pass++) { + for (sc.priority = DEF_PRIORITY; sc.priority >= 0; sc.priority--) { + unsigned long nr_to_scan = nr_pages - ret; + struct mem_cgroup *memcg = NULL; + int nid; + + sc.nr_scanned = 0; + + /* + * No zone reclaimed because of too many reclaimers. 
Retry whether + * there is still something to do + */ + if (!shrink_all_zones(nr_to_scan, pass, &sc)) + goto retry; + + ret += sc.nr_reclaimed; + if (ret >= nr_pages) + goto out; + + reclaim_state.reclaimed_slab = 0; + for_each_online_node(nid) { + do { + shrink_slab(mask, nid, memcg, sc.nr_scanned, + global_reclaimable_pages()); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + } + ret += reclaim_state.reclaimed_slab; + + if (ret >= nr_pages) + goto out; + + } + if (pass == 1) { + if (vm_pagecache_ignore_dirty == 1 || + (mask & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS) ) + break; + else + sc.may_writepage = 1; + } + } + +out: + current->reclaim_state = old_rs; +} + +#ifdef CONFIG_SHRINK_PAGECACHE +static unsigned long __shrink_page_cache(gfp_t mask) +{ + struct scan_control sc = { + .gfp_mask = current_gfp_context(mask), + .reclaim_idx = gfp_zone(mask), + .may_writepage = !laptop_mode, + .nr_to_reclaim = SWAP_CLUSTER_MAX * + (unsigned long)vm_cache_reclaim_weight, + .may_unmap = 1, + .may_swap = 1, + .order = 0, + .priority = DEF_PRIORITY, + .target_mem_cgroup = NULL, + .nodemask = NULL, + }; + + struct zonelist *zonelist = node_zonelist(numa_node_id(), mask); + + return do_try_to_free_pages(zonelist, &sc); +} + + +static void shrink_page_cache_work(struct work_struct *w); +static void shrink_shepherd(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(shepherd, shrink_shepherd); + +static void shrink_shepherd(struct work_struct *w) +{ + int cpu; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + struct delayed_work *work = &per_cpu(vmscan_work, cpu); + + if (!delayed_work_pending(work) && vm_cache_reclaim_enable) + queue_delayed_work_on(cpu, system_wq, work, 0); + } + + put_online_cpus(); + + /* we want all kernel thread to stop */ + if (vm_cache_reclaim_enable) { + if (vm_cache_reclaim_s == 0) + schedule_delayed_work(&shepherd, + round_jiffies_relative(120 * HZ)); + else + schedule_delayed_work(&shepherd, + round_jiffies_relative((unsigned long) + vm_cache_reclaim_s * HZ)); + } +} +static void shrink_shepherd_timer(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct delayed_work *work = &per_cpu(vmscan_work, cpu); + + INIT_DEFERRABLE_WORK(work, shrink_page_cache_work); + } + + schedule_delayed_work(&shepherd, + round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ)); +} + +unsigned long shrink_page_cache(gfp_t mask) +{ + unsigned long nr_pages; + + /* We reclaim the highmem zone too, it is useful for 32bit arch */ + nr_pages = __shrink_page_cache(mask | __GFP_HIGHMEM); + + return nr_pages; +} +static void shrink_page_cache_work(struct work_struct *w) +{ + struct delayed_work *work = to_delayed_work(w); + unsigned long nr_pages; + + /* + * if vm_cache_reclaim_enable or vm_cache_reclaim_s is zero, + * we do not shrink page cache again. 
+ */ + if (vm_cache_reclaim_s == 0 || !vm_cache_reclaim_enable) + return; + + /* It should wait more time if we hardly reclaim the page cache */ + nr_pages = shrink_page_cache(GFP_KERNEL); + if ((nr_pages < SWAP_CLUSTER_MAX) && vm_cache_reclaim_enable) + queue_delayed_work_on(smp_processor_id(), system_wq, work, + round_jiffies_relative(120 * HZ)); +} + +static void shrink_page_cache_init(void) +{ + vm_cache_limit_ratio = 0; + vm_cache_limit_ratio_min = 0; + vm_cache_limit_ratio_max = 100; + vm_cache_limit_mbytes = 0; + vm_cache_limit_mbytes_min = 0; + vm_cache_limit_mbytes_max = totalram_pages >> (20 - PAGE_SHIFT); + vm_cache_reclaim_s = 0; + vm_cache_reclaim_s_min = 0; + vm_cache_reclaim_s_max = 43200; + vm_cache_reclaim_weight = 1; + vm_cache_reclaim_weight_min = 1; + vm_cache_reclaim_weight_max = 100; + vm_cache_reclaim_enable = 1; + + shrink_shepherd_timer(); +} + +static int kswapd_cpu_down_prep(unsigned int cpu) +{ + cancel_delayed_work_sync(&per_cpu(vmscan_work, cpu)); + + return 0; +} +int cache_reclaim_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; + + if (write) + schedule_delayed_work(&shepherd, round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ)); + + return 0; +} + +int cache_reclaim_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; + + if (write) + mod_delayed_work(system_wq, &shepherd, + round_jiffies_relative( + (unsigned long)vm_cache_reclaim_s * HZ)); + + return ret; +} +#endif + +static int kpagecache_limitd(void *data) +{ + DEFINE_WAIT(wait); + kpclimitd_context = true; + + /* + * make sure all work threads woken up, when switch to async mode + */ + if (waitqueue_active(&pagecache_reclaim_wq)) + wake_up_interruptible(&pagecache_reclaim_wq); + + for ( ; ; ) { + __shrink_page_cache(GFP_KERNEL); + prepare_to_wait(&kpagecache_limitd_wq, &wait, TASK_INTERRUPTIBLE); + + if (!kthread_should_stop()) + schedule(); + else { + finish_wait(&kpagecache_limitd_wq, &wait); + break; + } + finish_wait(&kpagecache_limitd_wq, &wait); + } + kpclimitd_context = false; + return 0; +} + +static void wakeup_kpclimitd(gfp_t mask) +{ + if (!waitqueue_active(&kpagecache_limitd_wq)) + return; + wake_up_interruptible(&kpagecache_limitd_wq); +} + +void shrink_page_cache(gfp_t mask, struct page *page) +{ + if (0 == vm_pagecache_limit_async) + __shrink_page_cache(mask); + else + wakeup_kpclimitd(mask); +} + +/* It's optimal to keep kswapds on the same CPUs as their memory, but + not required for correctness. So if the last cpu in a node goes + away, we get changed to run anywhere: as the first one comes back, + restore their cpu bindings. */ +static int kswapd_cpu_online(unsigned int cpu) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kswapd, mask); + } + return 0; +}
/* * This kswapd start function will be called by init and node-hot-add. @@ -4286,16 +4849,61 @@ void kswapd_stop(int nid)
static int __init kswapd_init(void) { - int nid; + /*int nid;
swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); - return 0; + return 0;*/ + int nid, ret; + + swap_setup(); + for_each_node_state(nid, N_MEMORY) + kswapd_run(nid); +#ifdef CONFIG_SHRINK_PAGECACHE + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "mm/vmscan:online", kswapd_cpu_online, + kswapd_cpu_down_prep); +#else + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "mm/vmscan:online", kswapd_cpu_online, + NULL); +#endif + WARN_ON(ret < 0); +#ifdef CONFIG_SHRINK_PAGECACHE + shrink_page_cache_init(); +#endif + return 0; + }
module_init(kswapd_init)
+int kpagecache_limitd_run(void) +{ + int ret = 0; + + if (kpclimitd) + return 0; + + kpclimitd = kthread_run(kpagecache_limitd, NULL, "kpclimitd"); + if (IS_ERR(kpclimitd)) { + pr_err("Failed to start kpagecache_limitd thread\n"); + ret = PTR_ERR(kpclimitd); + kpclimitd = NULL; + } + return ret; + +} + +void kpagecache_limitd_stop(void) +{ + if (kpclimitd) { + kthread_stop(kpclimitd); + kpclimitd = NULL; + } +} + #ifdef CONFIG_NUMA /* * Node reclaim mode diff --git a/mm/workingset.c b/mm/workingset.c index bba4380405b4..9a5ad145b9bd 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -253,6 +253,7 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); unsigned long eviction; struct lruvec *lruvec; int memcgid;