From: Ma Wupeng <mawupeng1@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
CVE: NA
--------------------------------
Add reliable limits for reliable user tasks, page cache and shmem.
1. For reliable user tasks:
There is an upper limit on all memory allocations that meet both of the
following conditions:
  - gfp_zone(gfp & ~GFP_RELIABLE) == ZONE_MOVABLE
  - gfp & GFP_RELIABLE is true
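As an illustration only (not part of this patch), a kernel-side allocation
such as

  page = alloc_pages(GFP_HIGHUSER_MOVABLE | GFP_RELIABLE, 0);

meets both conditions, because GFP_HIGHUSER_MOVABLE maps to ZONE_MOVABLE once
GFP_RELIABLE is masked off, and is therefore accounted against the limit.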
Init tasks will allocate memory from the non-mirrored region if their
allocations trigger the limit.
The limit can be set or accessed via /proc/sys/vm/task_reliable_limit. Its
default value is ULONG_MAX, and it can be updated to any value between the
reliable memory size currently used by user tasks and the total reliable
memory size.
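For example (the value is illustrative), to cap reliable memory usage of user
tasks at 4 GiB:

  # echo 4294967296 > /proc/sys/vm/task_reliable_limit

Writes above the total reliable memory size, or below the reliable memory
already used by tasks, are rejected with -EINVAL.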
2. For page cache:
Add the interface /proc/sys/vm/reliable_pagecache_max_bytes to set the
maximum amount of reliable memory that can be used by the page cache. Its
default value is ULONG_MAX, and it cannot exceed the total reliable RAM.
The whole reliable memory feature depends on kernelcore=mirror, which in turn
depends on NUMA, so remove the redundant code for UMA.
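For example (the value is illustrative), to limit reliable page cache to
1 GiB:

  # echo 1073741824 > /proc/sys/vm/reliable_pagecache_max_bytes

Writes larger than the total reliable memory are rejected with -EINVAL.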
3. For shmem:
This limit restricts the amount of mirrored memory that can be used by shmem.
Once it is reached, a shmem allocation fails with no memory if reliable
fallback is off, or falls back to the non-mirrored region if reliable
fallback is on.
The limit can be set or accessed via /proc/sys/vm/shmem_reliable_bytes_limit.
Its default value is ULONG_MAX, and it can be set to any value from 0 to the
total size of mirrored memory.
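For example, writing 0 (the minimum allowed value) prevents new shmem
allocations from being served from mirrored memory:

  # echo 0 > /proc/sys/vm/shmem_reliable_bytes_limit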
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
---
 include/linux/mem_reliable.h |  77 +++++++++++++++++++++---
 mm/mem_reliable.c            | 113 +++++++++++++++++++++++++++++++++++
 mm/page_alloc.c              |  87 +++++++++++++++++++++++++++
 mm/shmem.c                   |   4 +-
 4 files changed, 271 insertions(+), 10 deletions(-)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
index 8dba07dc6983..22a62deb8274 100644
--- a/include/linux/mem_reliable.h
+++ b/include/linux/mem_reliable.h
@@ -7,8 +7,10 @@
 #include <linux/stddef.h>
 #include <linux/gfp.h>
 #include <linux/mmzone.h>
+#include <linux/oom.h>
 #include <linux/mm_types.h>
 #include <linux/sched.h>
+#include <linux/percpu_counter.h>
 
 DECLARE_STATIC_KEY_FALSE(mem_reliable);
 
@@ -19,6 +21,9 @@ extern bool pagecache_reliable;
 extern struct percpu_counter pagecache_reliable_pages;
 extern struct percpu_counter anon_reliable_pages;
 extern struct percpu_counter shmem_reliable_pages;
+extern unsigned long task_reliable_limit __read_mostly;
+extern unsigned long shmem_reliable_limit __read_mostly;
+extern unsigned long pagecache_reliable_limit __read_mostly;
 
 void mem_reliable_init(bool has_unmirrored_mem, unsigned long mirrored_sz);
 bool mem_reliable_status(void);
@@ -28,6 +33,8 @@ void reliable_lru_add(enum lru_list lru, struct folio *folio, int val);
 void reliable_lru_add_batch(int zid, enum lru_list lru, int val);
 bool mem_reliable_counter_initialized(void);
 void reliable_report_meminfo(struct seq_file *m);
+void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order,
+                                int preferred_nid, nodemask_t *nodemask);
 
 static inline bool mem_reliable_is_enabled(void)
 {
@@ -84,26 +91,53 @@ static inline bool skip_non_mirrored_zone(gfp_t gfp, struct zoneref *z)
         return false;
 }
 
-static inline void shmem_prepare_alloc(gfp_t *gfp_mask)
+static inline bool mem_reliable_shmem_limit_check(void)
+{
+        return percpu_counter_read_positive(&shmem_reliable_pages) <
+               (shmem_reliable_limit >> PAGE_SHIFT);
+}
+
+/*
+ * Check if this memory allocation for shmem is allowed.
+ * Return false if limit is triggered.
+ */
+static inline bool shmem_prepare_alloc(gfp_t *gfp_mask)
 {
         if (!mem_reliable_is_enabled())
-                return;
+                return true;
 
-        if (shmem_reliable_is_enabled())
-                *gfp_mask |= GFP_RELIABLE;
-        else
+        if (!shmem_reliable_is_enabled()) {
                 *gfp_mask &= ~GFP_RELIABLE;
+                return true;
+        }
+
+        if (mem_reliable_shmem_limit_check()) {
+                *gfp_mask |= GFP_RELIABLE;
+                return true;
+        }
+
+        return false;
 }
 
 static inline void filemap_prepare_alloc(gfp_t *gfp_mask)
 {
+        s64 nr_reliable = 0;
+
         if (!mem_reliable_is_enabled())
                 return;
 
-        if (filemap_reliable_is_enabled())
-                *gfp_mask |= GFP_RELIABLE;
-        else
+        if (!filemap_reliable_is_enabled()) {
                 *gfp_mask &= ~GFP_RELIABLE;
+                return;
+        }
+
+        nr_reliable = percpu_counter_read_positive(&pagecache_reliable_pages);
+        if (nr_reliable > pagecache_reliable_limit >> PAGE_SHIFT) {
+                *gfp_mask &= ~GFP_RELIABLE;
+                return;
+        }
+
+        *gfp_mask |= GFP_RELIABLE;
 }
 
 static inline unsigned long task_reliable_used_pages(void)
@@ -122,6 +156,21 @@ static inline void shmem_reliable_folio_add(struct folio *folio, int nr_page)
                 percpu_counter_add(&shmem_reliable_pages, nr_page);
 }
 
+
+static inline bool reliable_mem_limit_check(unsigned long nr_page)
+{
+        return (task_reliable_used_pages() + nr_page) <=
+               (task_reliable_limit >> PAGE_SHIFT);
+}
+
+static inline bool mem_reliable_should_reclaim(void)
+{
+        if (percpu_counter_sum_positive(&pagecache_reliable_pages) >=
+            MAX_ORDER_NR_PAGES)
+                return true;
+
+        return false;
+}
 #else
 #define reliable_enabled 0
 
@@ -137,7 +186,7 @@ static inline bool skip_non_mirrored_zone(gfp_t gfp, struct zoneref *z)
 }
 static inline bool mem_reliable_status(void) { return false; }
 static inline bool mem_reliable_hide_file(const char *name) { return false; }
-static inline void shmem_prepare_alloc(gfp_t *gfp_mask) {}
+static inline bool shmem_prepare_alloc(gfp_t *gfp_mask) { return true; }
 static inline void filemap_prepare_alloc(gfp_t *gfp_mask) {}
 static inline void shmem_reliable_init(void) {}
 static inline void reliable_lru_add(enum lru_list lru, struct folio *folio,
@@ -148,6 +197,16 @@ static inline bool mem_reliable_counter_initialized(void) { return false; }
 static inline void shmem_reliable_folio_add(struct folio *folio,
                                             int nr_page) {}
 static inline void reliable_report_meminfo(struct seq_file *m) {}
+static inline bool mem_reliable_shmem_limit_check(void) { return true; }
+static inline bool reliable_mem_limit_check(unsigned long nr_page)
+{
+        return false;
+}
+static inline bool mem_reliable_should_reclaim(void) { return false; }
+static inline void mem_reliable_out_of_memory(gfp_t gfp_mask,
+                                              unsigned int order,
+                                              int preferred_nid,
+                                              nodemask_t *nodemask) {}
 #endif
 
 #endif
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c
index c046d8016228..ae654d6bb047 100644
--- a/mm/mem_reliable.c
+++ b/mm/mem_reliable.c
@@ -20,6 +20,10 @@ bool pagecache_reliable __read_mostly = true;
 struct percpu_counter pagecache_reliable_pages;
 struct percpu_counter anon_reliable_pages;
 struct percpu_counter shmem_reliable_pages;
+unsigned long pagecache_reliable_limit = ULONG_MAX;
+/* reliable user limit for user tasks with reliable flag */
+unsigned long task_reliable_limit = ULONG_MAX;
+unsigned long shmem_reliable_limit = ULONG_MAX;
 
 bool mem_reliable_counter_initialized(void)
 {
@@ -117,11 +121,103 @@ void reliable_lru_add(enum lru_list lru, struct folio *folio, int val)
         }
 }
 
+static int reliable_pagecache_max_bytes_write(struct ctl_table *table,
+                                              int write, void __user *buffer,
+                                              size_t *length, loff_t *ppos)
+{
+        unsigned long old_value = pagecache_reliable_limit;
+        int ret;
+
+        ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+        if (!ret && write) {
+                if (pagecache_reliable_limit >
+                    PAGES_TO_B(total_reliable_pages())) {
+                        pagecache_reliable_limit = old_value;
+                        return -EINVAL;
+                }
+        }
+
+        return ret;
+}
+
+static int reliable_limit_handler(struct ctl_table *table, int write,
+                                  void __user *buffer, size_t *length,
+                                  loff_t *ppos)
+{
+        unsigned long old = task_reliable_limit;
+        int ret;
+
+        ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+        if (!ret && write) {
+                if (task_reliable_limit > PAGES_TO_B(total_reliable_pages()) ||
+                    task_reliable_limit <
+                    (task_reliable_used_pages() << PAGE_SHIFT)) {
+                        task_reliable_limit = old;
+                        return -EINVAL;
+                }
+        }
+
+        return ret;
+}
+
+#ifdef CONFIG_SHMEM
+static int reliable_shmem_bytes_limit_handler(struct ctl_table *table,
+                                              int write, void __user *buffer,
+                                              size_t *length, loff_t *ppos)
+{
+        unsigned long *data_ptr = (unsigned long *)(table->data);
+        unsigned long old = *data_ptr;
+        int ret;
+
+        ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+        if (!ret && write) {
+                if (*data_ptr > PAGES_TO_B(total_reliable_pages())) {
+                        *data_ptr = old;
+                        return -EINVAL;
+                }
+        }
+
+        return ret;
+}
+#endif
+
+static struct ctl_table reliable_ctl_table[] = {
+        {
+                .procname = "reliable_pagecache_max_bytes",
+                .data = &pagecache_reliable_limit,
+                .maxlen = sizeof(pagecache_reliable_limit),
+                .mode = 0644,
+                .proc_handler = reliable_pagecache_max_bytes_write,
+        },
+        {
+                .procname = "task_reliable_limit",
+                .data = &task_reliable_limit,
+                .maxlen = sizeof(task_reliable_limit),
+                .mode = 0644,
+                .proc_handler = reliable_limit_handler,
+        },
+#ifdef CONFIG_SHMEM
+        {
+                .procname = "shmem_reliable_bytes_limit",
+                .data = &shmem_reliable_limit,
+                .maxlen = sizeof(shmem_reliable_limit),
+                .mode = 0644,
+                .proc_handler = reliable_shmem_bytes_limit_handler,
+        },
+#endif
+        {}
+};
+
 static int __init reliable_sysctl_init(void)
 {
         if (!mem_reliable_is_enabled())
                 return 0;
 
+        if (!register_sysctl("vm", reliable_ctl_table)) {
+                pr_err("register sysctl failed.");
+                return -ENOMEM;
+        }
+
         percpu_counter_init(&pagecache_reliable_pages, 0, GFP_KERNEL);
         percpu_counter_init(&anon_reliable_pages, 0, GFP_KERNEL);
 
@@ -167,6 +263,23 @@ void reliable_report_meminfo(struct seq_file *m)
         }
 }
 
+void mem_reliable_out_of_memory(gfp_t gfp, unsigned int order,
+                                int preferred_nid, nodemask_t *nodemask)
+{
+        struct oom_control oc = {
+                .zonelist = node_zonelist(preferred_nid, gfp),
+                .nodemask = nodemask,
+                .memcg = NULL,
+                .gfp_mask = gfp,
+                .order = order,
+        };
+
+        if (!mutex_trylock(&oom_lock))
+                return;
+        out_of_memory(&oc);
+        mutex_unlock(&oom_lock);
+}
+
 static int __init setup_reliable_debug(char *str)
 {
         if (*str++ != '=' || !*str)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d158a1113f5..503ce164a1e3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4557,6 +4557,89 @@ static inline void prepare_before_alloc(gfp_t *gfp_mask)
         *gfp_mask &= ~GFP_RELIABLE;
 }
 
+static inline long mem_reliable_direct_reclaim(int nr_pages, struct alloc_context *ac)
+{
+        long nr_reclaimed = 0;
+
+        while (nr_reclaimed < nr_pages) {
+                /* try to free cache from reliable region */
+                long progress = __perform_reclaim(GFP_KERNEL, 0, ac);
+
+                nr_reclaimed += progress;
+                if (progress < SWAP_CLUSTER_MAX)
+                        break;
+        }
+
+        return nr_reclaimed;
+}
+
+/*
+ * return true means memory allocation need retry and flag ___GFP_RELIABILITY
+ * must be cleared.
+ */
+static inline bool check_after_alloc(gfp_t *gfp, unsigned int order,
+                                     int preferred_nid,
+                                     struct alloc_context *ac,
+                                     struct page **_page)
+{
+        int retry_times = MAX_RECLAIM_RETRIES;
+        int nr_pages;
+
+        if (!mem_reliable_is_enabled())
+                return false;
+
+        if (!(*gfp & GFP_RELIABLE))
+                return false;
+
+        if (!*_page)
+                goto out_retry;
+
+        if (*gfp & __GFP_NOFAIL || current->flags & PF_MEMALLOC)
+                goto out;
+
+        /* percpu counter is not initialized, ignore limit check */
+        if (!mem_reliable_counter_initialized())
+                goto out;
+
+limit_check:
+        /* user task is limited by task_reliable_limit */
+        if (!reliable_mem_limit_check(1 << order))
+                goto out_free_page;
+
+        goto out;
+
+out_free_page:
+        if (mem_reliable_should_reclaim() && retry_times--) {
+                nr_pages = mem_reliable_direct_reclaim(1 << order, ac);
+                if (nr_pages)
+                        goto limit_check;
+        }
+
+        __free_pages(*_page, order);
+        *_page = NULL;
+
+out_retry:
+        if (is_global_init(current)) {
+                *gfp &= ~GFP_RELIABLE;
+                return true;
+        }
+
+        if (*gfp & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE))
+                goto out;
+
+        /* Coredumps can quickly deplete all memory reserves */
+        if (current->flags & PF_DUMPCORE)
+                goto out;
+        /* The OOM killer will not help higher order allocs */
+        if (order > PAGE_ALLOC_COSTLY_ORDER)
+                goto out;
+
+        /* oom here */
+        mem_reliable_out_of_memory(*gfp, order, preferred_nid, ac->nodemask);
+out:
+        return false;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -4579,6 +4662,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 
         prepare_before_alloc(&gfp);
 
+retry:
         /*
          * Apply scoped allocation constraints. This is mainly about GFP_NOFS
          * resp. GFP_NOIO which has to be inherited for all allocation requests
@@ -4621,6 +4705,9 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
                 page = NULL;
         }
 
+        if (check_after_alloc(&gfp, order, preferred_nid, &ac, &page))
+                goto retry;
+
         trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
         kmsan_alloc_page(page, order, alloc_gfp);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 0d6807b608ed..b44bfad90f8d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1679,7 +1679,8 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
         if (err)
                 goto failed;
 
-        shmem_prepare_alloc(&gfp);
+        if (!shmem_prepare_alloc(&gfp))
+                goto no_mem;
 
         if (huge)
                 folio = shmem_alloc_hugefolio(gfp, info, index);
@@ -1691,6 +1692,7 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
                 return folio;
         }
 
+no_mem:
         err = -ENOMEM;
         shmem_inode_unacct_blocks(inode, nr);
 failed: