From: Peng Wu <wupeng58@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
CVE: NA
----------------------------------------------
There is an upper limit on memory allocation for special user tasks. A special user task means a user task with the reliable flag set.
Init tasks will allocate memory from the non-mirrored region if their allocation exceeds this limit.
The limit can be set or read via /proc/sys/vm/task_reliable_limit.
The default value of this limit is ULONG_MAX.
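For illustration only (not part of this patch): a minimal userspace sketch of how the limit could be inspected and tightened through the new sysctl file. It assumes the value is interpreted in bytes, matching the task_reliable_limit / PAGE_SIZE and ">> 10" conversions in mm/mem_reliable.c below, and that a write larger than the total reliable memory is rejected with EINVAL by reliable_limit_handler().

	/* sketch: cap reliable memory used by reliable user tasks at 1 GiB */
	#include <stdio.h>

	int main(void)
	{
		const char *path = "/proc/sys/vm/task_reliable_limit";
		unsigned long cur, new_limit = 1UL << 30; /* bytes (assumed unit) */
		FILE *fp;

		fp = fopen(path, "r");
		if (!fp || fscanf(fp, "%lu", &cur) != 1)
			return 1;
		fclose(fp);
		printf("current task_reliable_limit: %lu bytes\n", cur);

		/* needs root; the handler reverts and fails if the value exceeds total reliable memory */
		fp = fopen(path, "w");
		if (!fp || fprintf(fp, "%lu\n", new_limit) < 0)
			return 1;
		return fclose(fp) ? 1 : 0;
	}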
Signed-off-by: Peng Wu <wupeng58@huawei.com>
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/mem_reliable.h | 20 +++++++++
 lib/show_mem.c               |  1 +
 mm/khugepaged.c              | 14 ++++++
 mm/mem_reliable.c            | 86 ++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c              | 50 ++++++++++++++++++---
 5 files changed, 164 insertions(+), 7 deletions(-)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
index a18a843c7b52f..e4097f0cff679 100644
--- a/include/linux/mem_reliable.h
+++ b/include/linux/mem_reliable.h
@@ -15,6 +15,7 @@ extern struct static_key_false mem_reliable;
 
 extern bool reliable_enabled;
 extern atomic_long_t reliable_user_used_nr_page;
+extern unsigned long task_reliable_limit __read_mostly;
 
 extern void add_reliable_mem_size(long sz);
 extern void mem_reliable_init(bool has_unmirrored_mem,
@@ -22,6 +23,9 @@ extern void mem_reliable_init(bool has_unmirrored_mem,
 extern void reliable_report_meminfo(struct seq_file *m);
 extern bool page_reliable(struct page *page);
 extern void reliable_report_usage(struct seq_file *m, struct mm_struct *mm);
+extern void reliable_show_mem_info(void);
+extern void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order,
+				       int preferred_nid, nodemask_t *nodemask);
 
 static inline bool mem_reliable_is_enabled(void)
 {
@@ -59,6 +63,12 @@ static inline void reliable_page_counter(struct page *page,
 		atomic_long_add(val, &reliable_user_used_nr_page);
 	}
 }
+
+static inline bool reliable_mem_limit_check(unsigned long nr_page)
+{
+	return atomic_long_read(&reliable_user_used_nr_page) + nr_page <=
+	       task_reliable_limit / PAGE_SIZE;
+}
 #else
 #define reliable_enabled 0
 
@@ -78,6 +88,16 @@ static inline void reliable_page_counter(struct page *page,
 static inline void reliable_report_usage(struct seq_file *m,
 					 struct mm_struct *mm) {}
 
+static inline bool reliable_mem_limit_check(unsigned long nr_page)
+{
+	return false;
+}
+static inline void reliable_show_mem_info(void) {}
+static inline void mem_reliable_out_of_memory(gfp_t gfp_mask,
+					      unsigned int order,
+					      int preferred_nid,
+					      nodemask_t *nodemask) {}
+
 #endif
 
 #endif
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 0beaa1d899aae..0f85331ba91b9 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -49,4 +49,5 @@ void show_mem(unsigned int filter, nodemask_t *nodemask)
 #ifdef CONFIG_MEMORY_FAILURE
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
+	reliable_show_mem_info();
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7f37633a886e0..2b154ff6ee734 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1235,6 +1235,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret) {
+		if (reliable &&
+		    !reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) {
+			ret = SCAN_ALLOC_HUGE_PAGE_FAIL;
+			goto out;
+		}
+
 		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
 		collapse_huge_page(mm, address, hpage, node,
@@ -1692,6 +1698,12 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
 		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
 			result = SCAN_EXCEED_NONE_PTE;
 		} else {
+			if (reliable &&
+			    !reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) {
+				result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+				goto out;
+			}
+
 			node = khugepaged_find_target_node();
 			collapse_shmem(mm, mapping, start, hpage,
 					node, reliable);
@@ -1699,6 +1711,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
 	}
 
 	/* TODO: tracepoints */
+out:
+	return;
 }
 #else
 static void khugepaged_scan_shmem(struct mm_struct *mm,
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c
index d6aec08638923..c24c5b7cbca33 100644
--- a/mm/mem_reliable.c
+++ b/mm/mem_reliable.c
@@ -8,12 +8,15 @@
 #include <linux/memory_hotplug.h>
 #include <linux/seq_file.h>
 #include <linux/mmzone.h>
+#include <linux/oom.h>
 
 DEFINE_STATIC_KEY_FALSE(mem_reliable);
 
 bool reliable_enabled;
 static atomic_long_t total_reliable_mem;
 atomic_long_t reliable_user_used_nr_page;
+/* reliable user limit for user tasks with reliable flag */
+unsigned long task_reliable_limit = ULONG_MAX;
 
 void add_reliable_mem_size(long sz)
 {
@@ -118,3 +121,86 @@ void reliable_report_usage(struct seq_file *m, struct mm_struct *mm)
 			   atomic_long_read(&mm->reliable_nr_page));
 	}
 }
+
+#ifdef CONFIG_SYSCTL
+int reliable_limit_handler(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *length, loff_t *ppos)
+{
+	unsigned long old = task_reliable_limit;
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	if (ret == 0 && write) {
+		if (task_reliable_limit > total_reliable_mem_sz()) {
+			task_reliable_limit = old;
+			return -EINVAL;
+		}
+	}
+
+	return ret;
+}
+
+static struct ctl_table reliable_ctl_table[] = {
+	{
+		.procname = "task_reliable_limit",
+		.data = &task_reliable_limit,
+		.maxlen = sizeof(task_reliable_limit),
+		.mode = 0644,
+		.proc_handler = reliable_limit_handler,
+	},
+	{}
+};
+
+static struct ctl_table reliable_dir_table[] = {
+	{
+		.procname = "vm",
+		.maxlen = 0,
+		.mode = 0555,
+		.child = reliable_ctl_table,
+	},
+	{}
+};
+
+static int __init reliable_sysctl_init(void)
+{
+	if (!mem_reliable_is_enabled())
+		return 0;
+
+	if (!register_sysctl_table(reliable_dir_table)) {
+		pr_err("register sysctl failed.");
+		return -1;
+	}
+
+	return 0;
+}
+late_initcall(reliable_sysctl_init);
+#endif
+
+void reliable_show_mem_info(void)
+{
+	if (mem_reliable_is_enabled()) {
+		pr_info("ReliableTotal: %lu kB", total_reliable_mem_sz() >> 10);
+		pr_info("ReliableUsed: %lu kB", used_reliable_mem_sz() >> 10);
+		pr_info("task_reliable_limit: %lu kB",
+			task_reliable_limit >> 10);
+		pr_info("reliable_user_used: %ld kB",
+			atomic_long_read(&reliable_user_used_nr_page) * 4);
+	}
+}
+
+void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order,
+				int preferred_nid, nodemask_t *nodemask)
+{
+	struct oom_control oc = {
+		.zonelist = node_zonelist(preferred_nid, gfp_mask),
+		.nodemask = nodemask,
+		.memcg = NULL,
+		.gfp_mask = gfp_mask,
+		.order = order,
+	};
+
+	if (!mutex_trylock(&oom_lock))
+		return;
+	out_of_memory(&oc);
+	mutex_unlock(&oom_lock);
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 95d2450cf1771..bfc0c2d1825cd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4561,19 +4561,51 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
 					ac->high_zoneidx, ac->nodemask);
 }
 
-static inline void prepare_before_alloc(gfp_t *gfp_mask)
+/*
+ * Returning false means this allocation is limited by the reliable user
+ * limit and will lead to pagefault_out_of_memory().
+ */
+static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order)
 {
 	gfp_t gfp_ori = *gfp_mask;
 	*gfp_mask &= gfp_allowed_mask;
 
 	if (!mem_reliable_is_enabled())
-		return;
+		return true;
 
-	if (gfp_ori & ___GFP_RELIABILITY)
+	if (gfp_ori & ___GFP_RELIABILITY) {
 		*gfp_mask |= ___GFP_RELIABILITY;
+		return true;
+	}
 
-	if (current->flags & PF_RELIABLE || is_global_init(current))
-		*gfp_mask |= ___GFP_RELIABILITY;
+	/*
+	 * Init tasks will allocate memory from the non-mirrored region if
+	 * their allocation exceeds task_reliable_limit.
+	 */
+	if (is_global_init(current)) {
+		if (reliable_mem_limit_check(1 << order))
+			*gfp_mask |= ___GFP_RELIABILITY;
+		return true;
+	}
+
+	/*
+	 * Only task_reliable_limit is checked here: ___GFP_RELIABILITY is
+	 * not set and this task is not global init.
+	 * Kernel-internal mechanisms (khugepaged collapse and others) that
+	 * allocate memory for user tasks and obey task_reliable_limit must
+	 * check this limit before allocating pages.
+	 */
+	if ((current->flags & PF_RELIABLE) && (gfp_ori & __GFP_HIGHMEM) &&
+	    (gfp_ori & __GFP_MOVABLE)) {
+		if (reliable_mem_limit_check(1 << order)) {
+			*gfp_mask |= ___GFP_RELIABILITY;
+			return true;
+		}
+
+		return false;
+	}
+
+	return true;
 }
 
 /*
@@ -4583,7 +4615,7 @@ struct page *
 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 							nodemask_t *nodemask)
 {
-	struct page *page;
+	struct page *page = NULL;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
 	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = { };
@@ -4597,7 +4629,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 		return NULL;
 	}
 
-	prepare_before_alloc(&gfp_mask);
+	if (!prepare_before_alloc(&gfp_mask, order)) {
+		mem_reliable_out_of_memory(gfp_mask, order, preferred_nid,
+					   nodemask);
+		goto out;
+	}
 
 	alloc_mask = gfp_mask;
 	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))