From: Ma Wupeng <mawupeng1@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
CVE: NA
----------------------------------------------
Introduce a min watermark to ensure the kernel can always use a reserve of mirrored memory. This watermark prevents special user tasks, page cache, and tmpfs users from consuming too much mirrored memory, which would otherwise lead to a kernel OOM.
Memory allocations with ___GFP_RELIABILITY, including those from special user tasks, are blocked once this watermark is reached. Such an allocation then falls back to the non-mirrored region or triggers OOM, depending on whether memory reliable fallback is enabled.
The limit can be read or set via /proc/sys/vm/reliable_reserve_size. Its default value is 256M, and it can be set to any value from 256M up to the total size of mirrored memory.
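For illustration only, a minimal userspace sketch of raising the reserve (the 512M value is just an example; the sysctl takes a size in bytes, and writes below 256M or above the total mirrored memory size are rejected with EINVAL):

  /* Illustrative sketch: raise the mirrored-memory reserve to 512M. */
  #include <stdio.h>

  int main(void)
  {
          FILE *fp = fopen("/proc/sys/vm/reliable_reserve_size", "w");

          if (!fp) {
                  perror("fopen");
                  return 1;
          }

          /* Value is in bytes; must be >= 256M and <= total mirrored memory. */
          fprintf(fp, "%lu\n", 512UL << 20);

          /* An out-of-range value surfaces as an error when the write is flushed. */
          return fclose(fp) ? 1 : 0;
  }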
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/mem_reliable.h | 14 ++++++++++++++
 mm/mem_reliable.c            | 33 +++++++++++++++++++++++++++++++++
 mm/page_alloc.c              | 17 +++++++++++++----
 3 files changed, 60 insertions(+), 4 deletions(-)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
index 3e0d2a002aa1f..f8af64474f227 100644
--- a/include/linux/mem_reliable.h
+++ b/include/linux/mem_reliable.h
@@ -23,6 +23,7 @@ extern struct percpu_counter reliable_shmem_used_nr_page;
 extern bool pagecache_use_reliable_mem;
 extern atomic_long_t page_cache_fallback;
 DECLARE_PER_CPU(long, nr_reliable_buddy_pages);
+extern unsigned long nr_reliable_reserve_pages __read_mostly;
 
 extern void page_cache_fallback_inc(gfp_t gfp, struct page *page);
 extern void add_reliable_mem_size(long sz);
@@ -107,6 +108,18 @@ static inline void mem_reliable_buddy_counter(struct page *page, int nr_page)
 	this_cpu_add(nr_reliable_buddy_pages, nr_page);
 }
 
+/* reserve mirrored memory for kernel usage */
+static inline bool mem_reliable_watermark_ok(int nr_page)
+{
+	long sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		sum += per_cpu(nr_reliable_buddy_pages, cpu);
+
+	return sum > nr_reliable_reserve_pages;
+}
+
 #else
 #define reliable_enabled 0
 #define reliable_allow_fb_enabled() false
@@ -147,6 +160,7 @@ static inline void page_cache_fallback_inc(gfp_t gfp, struct page *page) {}
 static inline bool pagecache_reliable_is_enabled(void) { return false; }
 static inline bool mem_reliable_status(void) { return false; }
 static inline void mem_reliable_buddy_counter(struct page *page, int nr_page) {}
+static inline bool mem_reliable_watermark_ok(int nr_page) { return true; }
 #endif
 
 #endif
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c
index 5491a1bafe02c..eb55b7f40a885 100644
--- a/mm/mem_reliable.c
+++ b/mm/mem_reliable.c
@@ -10,6 +10,8 @@
 #include <linux/mmzone.h>
 #include <linux/oom.h>
 
+#define MEM_RELIABLE_RESERVE_MIN	(256UL << 20)
+
 enum mem_reliable_types {
 	MEM_RELIABLE_ALL,
 	MEM_RELIABLE_FALLBACK,
@@ -29,6 +31,7 @@ bool reliable_allow_fallback __read_mostly = true;
 bool shmem_reliable __read_mostly = true;
 struct percpu_counter reliable_shmem_used_nr_page __read_mostly;
 DEFINE_PER_CPU(long, nr_reliable_buddy_pages);
+unsigned long nr_reliable_reserve_pages = MEM_RELIABLE_RESERVE_MIN / PAGE_SIZE;
 
 bool pagecache_use_reliable_mem __read_mostly = true;
 atomic_long_t page_cache_fallback = ATOMIC_LONG_INIT(0);
@@ -316,6 +319,29 @@ int reliable_debug_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
+static unsigned long sysctl_reliable_reserve_size = MEM_RELIABLE_RESERVE_MIN;
+
+int reliable_reserve_size_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos)
+{
+	unsigned long *data_ptr = (unsigned long *)(table->data);
+	unsigned long old = *data_ptr;
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	if (ret == 0 && write) {
+		if (*data_ptr > total_reliable_mem_sz() ||
+		    *data_ptr < MEM_RELIABLE_RESERVE_MIN) {
+			*data_ptr = old;
+			return -EINVAL;
+		}
+
+		nr_reliable_reserve_pages = *data_ptr / PAGE_SIZE;
+	}
+
+	return ret;
+}
+
 static struct ctl_table reliable_ctl_table[] = {
 	{
 		.procname = "task_reliable_limit",
@@ -331,6 +357,13 @@ static struct ctl_table reliable_ctl_table[] = {
 		.mode = 0600,
 		.proc_handler = reliable_debug_handler,
 	},
+	{
+		.procname = "reliable_reserve_size",
+		.data = &sysctl_reliable_reserve_size,
+		.maxlen = sizeof(sysctl_reliable_reserve_size),
+		.mode = 0644,
+		.proc_handler = reliable_reserve_size_handler,
+	},
 	{}
 };
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e139605f1dbb4..571bfdecd5a0c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4652,8 +4652,15 @@ static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order)
 		return true;
 
 	if (gfp_ori & ___GFP_RELIABILITY) {
-		*gfp_mask |= ___GFP_RELIABILITY;
-		return true;
+		if (mem_reliable_watermark_ok(1 << order)) {
+			*gfp_mask |= ___GFP_RELIABILITY;
+			return true;
+		}
+
+		if (reliable_allow_fb_enabled())
+			return true;
+
+		return false;
 	}
 
 	/*
@@ -4661,7 +4668,8 @@ static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order)
 	 * allocation trigger task_reliable_limit
 	 */
 	if (is_global_init(current)) {
-		if (reliable_mem_limit_check(1 << order))
+		if (reliable_mem_limit_check(1 << order) &&
+		    mem_reliable_watermark_ok(1 << order))
 			*gfp_mask |= ___GFP_RELIABILITY;
 		return true;
 	}
@@ -4675,7 +4683,8 @@ static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order)
 	 */
 	if ((current->flags & PF_RELIABLE) && (gfp_ori & __GFP_HIGHMEM) &&
 	    (gfp_ori & __GFP_MOVABLE)) {
-		if (reliable_mem_limit_check(1 << order)) {
+		if (reliable_mem_limit_check(1 << order) &&
+		    mem_reliable_watermark_ok(1 << order)) {
 			*gfp_mask |= ___GFP_RELIABILITY;
 			return true;
 		}