From: Ma Wupeng <mawupeng1@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
CVE: NA
--------------------------------
There is an upper limit on every memory allocation that meets both of
the following conditions:

 - gfp_zone(gfp & ~GFP_RELIABLE) == ZONE_MOVABLE
 - gfp & GFP_RELIABLE is true
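As an illustration, the following sketch (not part of this patch) shows
an allocation that satisfies both conditions and is therefore checked
against the limit:

	/*
	 * Sketch only: gfp_zone(GFP_HIGHUSER_MOVABLE) == ZONE_MOVABLE
	 * once GFP_RELIABLE is masked off, and GFP_RELIABLE is set, so
	 * this allocation is subject to task_reliable_limit.
	 */
	struct page *page = alloc_pages(GFP_HIGHUSER_MOVABLE | GFP_RELIABLE, 0);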
Init tasks will fall back to allocating memory from the non-mirrored
region if their allocations hit this limit.
The limit can be set or read via /proc/sys/vm/task_reliable_limit.
The default value of this limit is ULONG_MAX. Users can set it to any
value between the amount of reliable memory currently in use and the
total reliable memory size.
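For example, the limit could be updated from userspace along these
lines (a minimal sketch; the 4 GiB value is arbitrary):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *val = "4294967296";	/* 4 GiB, in bytes */
		int fd = open("/proc/sys/vm/task_reliable_limit", O_WRONLY);

		if (fd < 0)
			return 1;
		/* writes outside the valid range fail with EINVAL */
		if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
			return 1;
		close(fd);
		return 0;
	}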
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 include/linux/mem_reliable.h | 40 +++++++++++++++++
 mm/mem_reliable.c            | 45 +++++++++++++++++++
 mm/page_alloc.c              | 87 ++++++++++++++++++++++++++++++++++++
 3 files changed, 172 insertions(+)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
index 59108e955f48..9b94154b383e 100644
--- a/include/linux/mem_reliable.h
+++ b/include/linux/mem_reliable.h
@@ -5,8 +5,10 @@
 #include <linux/stddef.h>
 #include <linux/gfp.h>
 #include <linux/mmzone.h>
+#include <linux/oom.h>
 #include <linux/mm_types.h>
 #include <linux/sched.h>
+#include <linux/percpu_counter.h>
 
 #ifdef CONFIG_MEMORY_RELIABLE
 
@@ -17,6 +19,7 @@ extern bool shmem_reliable;
 extern bool pagecache_use_reliable_mem;
 extern struct percpu_counter pagecache_reliable_pages;
 extern struct percpu_counter anon_reliable_pages;
+extern unsigned long task_reliable_limit __read_mostly;
 
 extern void mem_reliable_init(bool has_unmirrored_mem,
 			      unsigned long *zone_movable_pfn,
@@ -30,6 +33,8 @@ extern void reliable_lru_add(enum lru_list lru, struct page *page,
 extern void reliable_lru_add_batch(int zid, enum lru_list lru, int val);
 extern bool mem_reliable_counter_initialized(void);
+extern void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order,
+				       int preferred_nid, nodemask_t *nodemask);
 
 static inline bool mem_reliable_is_enabled(void)
 {
@@ -74,6 +79,31 @@ static inline bool page_reliable(struct page *page)
 	return page_zonenum(page) < ZONE_MOVABLE;
 }
+
+static inline u64 task_reliable_used_pages(void)
+{
+	s64 nr_pages;
+
+	nr_pages = percpu_counter_read_positive(&pagecache_reliable_pages);
+	nr_pages += percpu_counter_read_positive(&anon_reliable_pages);
+
+	return nr_pages;
+}
+
+static inline bool reliable_mem_limit_check(unsigned long nr_page)
+{
+	return (task_reliable_used_pages() + nr_page) <=
+	       (task_reliable_limit >> PAGE_SHIFT);
+}
+
+static inline bool mem_reliable_should_reclaim(void)
+{
+	if (percpu_counter_sum_positive(&pagecache_reliable_pages) >=
+	    MAX_ORDER_NR_PAGES)
+		return true;
+
+	return false;
+}
 #else
 #define reliable_enabled 0
 #define pagecache_use_reliable_mem 0
@@ -98,6 +128,16 @@ static inline void reliable_lru_add(enum lru_list lru, struct page *page,
 static inline void reliable_lru_add_batch(int zid, enum lru_list lru, int val) {}
 static inline bool mem_reliable_counter_initialized(void) { return false; }
+static inline u64 task_reliable_used_pages(void) { return 0; }
+static inline bool reliable_mem_limit_check(unsigned long nr_page)
+{
+	return false;
+}
+static inline bool mem_reliable_should_reclaim(void) { return false; }
+static inline void mem_reliable_out_of_memory(gfp_t gfp_mask,
+					      unsigned int order,
+					      int preferred_nid,
+					      nodemask_t *nodemask) {}
 #endif
 
 #endif
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c
index b81e1147088d..36133976910e 100644
--- a/mm/mem_reliable.c
+++ b/mm/mem_reliable.c
@@ -20,6 +20,8 @@ bool pagecache_use_reliable_mem __read_mostly = true;
 struct percpu_counter pagecache_reliable_pages;
 struct percpu_counter anon_reliable_pages;
 static unsigned long reliable_pagecache_max_bytes = ULONG_MAX;
+/* reliable user limit for user tasks with reliable flag */
+unsigned long task_reliable_limit = ULONG_MAX;
 
 bool mem_reliable_counter_initialized(void)
 {
@@ -178,6 +180,25 @@ void reliable_report_meminfo(struct seq_file *m)
 	}
 }
 
+int reliable_limit_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos)
+{
+	unsigned long old = task_reliable_limit;
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	if (ret == 0 && write) {
+		if (task_reliable_limit > PAGES_TO_B(total_reliable_pages()) ||
+		    task_reliable_limit <
+		    (task_reliable_used_pages() << PAGE_SHIFT)) {
+			task_reliable_limit = old;
+			return -EINVAL;
+		}
+	}
+
+	return ret;
+}
+
 int reliable_pagecache_max_bytes_write(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length, loff_t *ppos)
 {
@@ -204,6 +225,13 @@ static struct ctl_table reliable_ctl_table[] = {
 		.mode = 0644,
 		.proc_handler = reliable_pagecache_max_bytes_write,
 	},
+	{
+		.procname = "task_reliable_limit",
+		.data = &task_reliable_limit,
+		.maxlen = sizeof(task_reliable_limit),
+		.mode = 0644,
+		.proc_handler = reliable_limit_handler,
+	},
 	{}
 };
 
@@ -234,6 +262,23 @@ static int __init reliable_sysctl_init(void)
 }
 arch_initcall(reliable_sysctl_init);
 
+void mem_reliable_out_of_memory(gfp_t gfp, unsigned int order,
+				int preferred_nid, nodemask_t *nodemask)
+{
+	struct oom_control oc = {
+		.zonelist = node_zonelist(preferred_nid, gfp),
+		.nodemask = nodemask,
+		.memcg = NULL,
+		.gfp_mask = gfp,
+		.order = order,
+	};
+
+	if (!mutex_trylock(&oom_lock))
+		return;
+	out_of_memory(&oc);
+	mutex_unlock(&oom_lock);
+}
+
 static int __init setup_reliable_debug(char *str)
 {
 	if (*str++ != '=' || !*str)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 74affafb8d48..dbc0104bfbf7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5166,6 +5166,89 @@ static inline void prepare_before_alloc(gfp_t *gfp_mask)
 	*gfp_mask &= ~GFP_RELIABLE;
 }
 
+static inline long mem_reliable_direct_reclaim(int nr_pages, struct alloc_context *ac)
+{
+	long nr_reclaimed = 0;
+
+	while (nr_reclaimed < nr_pages) {
+		/* try to free cache from reliable region */
+		long progress = __perform_reclaim(GFP_KERNEL, 0, ac);
+
+		nr_reclaimed += progress;
+		if (progress < SWAP_CLUSTER_MAX)
+			break;
+	}
+
+	return nr_reclaimed;
+}
+
+/*
+ * Return true means the allocation needs to be retried and the
+ * GFP_RELIABLE flag must be cleared from the gfp mask.
+ */
+static inline bool check_after_alloc(gfp_t *gfp, unsigned int order,
+				     int preferred_nid,
+				     struct alloc_context *ac,
+				     struct page **_page)
+{
+	int retry_times = MAX_RECLAIM_RETRIES;
+	int nr_pages;
+
+	if (!mem_reliable_is_enabled())
+		return false;
+
+	if (!(*gfp & GFP_RELIABLE))
+		return false;
+
+	if (!*_page)
+		goto out_retry;
+
+	if (*gfp & __GFP_NOFAIL || current->flags & PF_MEMALLOC)
+		goto out;
+
+	/* percpu counter is not initialized, ignore limit check */
+	if (!mem_reliable_counter_initialized())
+		goto out;
+
+limit_check:
+	/* user task is limited by task_reliable_limit */
+	if (!reliable_mem_limit_check(1 << order))
+		goto out_free_page;
+
+	goto out;
+
+out_free_page:
+	if (mem_reliable_should_reclaim() && retry_times--) {
+		nr_pages = mem_reliable_direct_reclaim(1 << order, ac);
+		if (nr_pages)
+			goto limit_check;
+	}
+
+	__free_pages(*_page, order);
+	*_page = NULL;
+
+out_retry:
+	if (is_global_init(current)) {
+		*gfp &= ~GFP_RELIABLE;
+		return true;
+	}
+
+	if (*gfp & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE))
+		goto out;
+
+	/* Coredumps can quickly deplete all memory reserves */
+	if (current->flags & PF_DUMPCORE)
+		goto out;
+	/* The OOM killer will not help higher order allocs */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		goto out;
+
+	/* oom here */
+	mem_reliable_out_of_memory(*gfp, order, preferred_nid, ac->nodemask);
+out:
+	return false;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -5190,6 +5273,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 	prepare_before_alloc(&gfp);
 
+retry:
 	alloc_gfp = gfp;
 	if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
 			&alloc_gfp, &alloc_flags))
@@ -5235,6 +5319,9 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 		page = NULL;
 	}
 
+	if (check_after_alloc(&gfp, order, preferred_nid, &ac, &page))
+		goto retry;
+
 	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
 
 	return page;