Support OOM priority for memcg.
Changelog: * rename CONFIG_MEMCG_QOS to CONFIG_MEMCG_OOM_PRIORITY * move CONFIG_MEMCG_OOM_PRIORITY to init/Kconfig * change the CONFIG_MEMCG_OOM_PRIORITY default from y to n, and enable it in openeuler_defconfig * add rcu_read_lock() protection for mem_cgroup_from_task() * use a variable instead of a static key to check whether the feature is enabled * use READ_ONCE/WRITE_ONCE protection when reading/writing oom_prio * cleanup
Jing Xiangfeng (3): memcg: support priority for oom memcg: Add sysctl memcg_qos_enable memcg: enable CONFIG_MEMCG_OOM_PRIORITY by default
arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + include/linux/memcontrol.h | 26 +++ init/Kconfig | 14 ++ mm/memcontrol.c | 229 +++++++++++++++++++++++++ mm/oom_kill.c | 61 ++++++- 6 files changed, 327 insertions(+), 5 deletions(-)
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/3584 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/K...
Feedback: The patch(es) that you sent to the kernel@openeuler.org mailing list have been successfully converted to a pull request! Pull request link: https://gitee.com/openeuler/kernel/pulls/3584 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/K...
From: Jing Xiangfeng jingxiangfeng@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8PXX8 CVE: NA
--------------------------------------
When OOM occurs, first try to kill a process from a low-priority memcg. If no such process is found, fall back to the normal OOM handling.
Signed-off-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Jinjiang Tu tujinjiang@huawei.com --- include/linux/memcontrol.h | 23 ++++++ init/Kconfig | 14 ++++ mm/memcontrol.c | 150 +++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 58 ++++++++++++-- 4 files changed, 240 insertions(+), 5 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5bfaa77e2b82..a3a20aef438d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -27,6 +27,7 @@ struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; +struct oom_control;
/* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -297,6 +298,12 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure;
+#ifdef CONFIG_MEMCG_OOM_PRIORITY + /* Currently support 0 and -1. + * in the future it can expand to other value. + */ + int oom_prio; +#endif #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; struct obj_cgroup __rcu *objcg; @@ -346,6 +353,18 @@ struct mem_cgroup { struct mem_cgroup_per_node *nodeinfo[]; };
+#ifdef CONFIG_MEMCG_OOM_PRIORITY +#define MEMCG_LOW_OOM_PRIORITY -1 +#define MEMCG_HIGH_OOM_PRIORITY 0 +bool memcg_oom_prio_scan_tasks(int (*)(struct task_struct *, void *), + void *); +void memcg_print_bad_task(struct oom_control *oc); +#else +static inline void memcg_print_bad_task(struct oom_control *oc) +{ +} +#endif + /* * size of first charge trial. * TODO: maybe necessary to use big numbers in big irons or dynamic based of the @@ -1602,6 +1621,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, { return 0; } + +static inline void memcg_print_bad_task(struct oom_control *oc) +{ +} #endif /* CONFIG_MEMCG */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) diff --git a/init/Kconfig b/init/Kconfig index 9209fc5b39b9..ab4b78875be4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -959,6 +959,20 @@ config MEMCG_MEMFS_INFO through interface "memory.memfs_files_info" or printed when OOM is triggered.
+config MEMCG_OOM_PRIORITY + bool "Enable Memory Cgroup OOM Priority" + depends on MEMCG + depends on X86 || ARM64 + default n + help + Prefer to kill the process from the low priority memcg when OOM occurs. + + When OOM occurs, this feature first selects the low priority memcg that + uses most memory, and then kill the process that uses most memory in the + memcg. If the process is not found, then fallback to normal processing. + + If unsure, say "n". + config MEMCG_KMEM bool depends on MEMCG diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5144477b1a7f..940022380b53 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4055,6 +4055,141 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_MEMCG_OOM_PRIORITY +static void memcg_oom_prio_init(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + int oom_prio; + + if (!parent) + return; + + oom_prio = READ_ONCE(parent->oom_prio); + WRITE_ONCE(memcg->oom_prio, oom_prio); +} + +static void memcg_hierarchy_oom_prio_set(struct mem_cgroup *memcg, int val) +{ + struct mem_cgroup *iter; + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css_for_each_descendant_pre(css, &memcg->css) { + iter = mem_cgroup_from_css(css); + + WRITE_ONCE(iter->oom_prio, val); + } + rcu_read_unlock(); +} + +static s64 memcg_oom_prio_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return READ_ONCE(memcg->oom_prio); +} + +static int memcg_oom_prio_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (mem_cgroup_is_root(memcg)) + return -EINVAL; + + if (val != MEMCG_LOW_OOM_PRIORITY && val != MEMCG_HIGH_OOM_PRIORITY) + return -EINVAL; + + WRITE_ONCE(memcg->oom_prio, val); + memcg_hierarchy_oom_prio_set(memcg, val); + + return 0; +} + +static struct mem_cgroup *memcg_find_max_usage(struct mem_cgroup *last) +{ + struct mem_cgroup *iter, *max_memcg = NULL; + struct cgroup_subsys_state *css; + unsigned long usage, max_usage = 0; + int oom_prio; + + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_mem_cgroup->css) { + iter = mem_cgroup_from_css(css); + oom_prio = READ_ONCE(iter->oom_prio); + + if (oom_prio == MEMCG_HIGH_OOM_PRIORITY || + iter == root_mem_cgroup || + iter == last) + continue; + + usage = mem_cgroup_usage(iter, false); + if (usage > max_usage) { + max_usage = usage; + max_memcg = iter; + } + } + rcu_read_unlock(); + + return max_memcg; +} + +bool memcg_oom_prio_scan_tasks(int (*fn)(struct task_struct *, void *), + void *arg) +{ + struct mem_cgroup *max, *last = NULL; + struct oom_control *oc = arg; + struct 
css_task_iter it; + struct task_struct *task; + int ret = 0; + bool retry = true; + +retry: + max = memcg_find_max_usage(last); + if (!max) + return false; + + css_task_iter_start(&max->css, 0, &it); + while (!ret && (task = css_task_iter_next(&it))) { + if (test_tsk_thread_flag(task, TIF_MEMDIE)) + continue; + + ret = fn(task, arg); + } + css_task_iter_end(&it); + + if (ret) + return false; + + if (!oc->chosen && retry) { + last = max; + retry = false; + goto retry; + } + + if (oc->chosen) + pr_info("The bad task [%d:%s] is from low-priority memcg.\n", + oc->chosen->pid, oc->chosen->comm); + + return oc->chosen ? true : false; +} + +void memcg_print_bad_task(struct oom_control *oc) +{ + if (oc->chosen) { + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(oc->chosen); + if (READ_ONCE(memcg->oom_prio) == MEMCG_LOW_OOM_PRIORITY) + pr_info("The bad task [%d:%s] is from low-priority memcg.\n", + oc->chosen->pid, oc->chosen->comm); + rcu_read_unlock(); + } +} +#endif + #ifdef CONFIG_NUMA
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) @@ -5417,6 +5552,17 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = mem_cgroup_memfs_files_show, }, #endif +#ifdef CONFIG_MEMCG_OOM_PRIORITY + { + /* + * This interface is used to control the oom priority + * of the memcg. The interface name is for compatibility. + */ + .name = "qos_level", + .read_s64 = memcg_oom_prio_read, + .write_s64 = memcg_oom_prio_write, + }, +#endif #ifdef CONFIG_NUMA { .name = "numa_stat", @@ -5816,6 +5962,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) FLUSH_TIME); lru_gen_online_memcg(memcg);
+#ifdef CONFIG_MEMCG_OOM_PRIORITY + memcg_oom_prio_init(memcg); +#endif + /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 81b79f82f601..38ed03c344f3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -306,6 +306,48 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; }
+#ifdef CONFIG_MEMCG_OOM_PRIORITY +/** + * We choose the task in low-priority memcg firstly. For the same state, we + * choose the task with the highest number of 'points'. + */ +static bool oom_next_task(struct task_struct *task, struct oom_control *oc, + long points) +{ + struct mem_cgroup *cur_memcg; + struct mem_cgroup *oc_memcg; + int cur_memcg_prio, oc_memcg_prio; + + if (points == LONG_MIN) + return true; + + if (!oc->chosen) + return false; + + rcu_read_lock(); + oc_memcg = mem_cgroup_from_task(oc->chosen); + cur_memcg = mem_cgroup_from_task(task); + oc_memcg_prio = READ_ONCE(oc_memcg->oom_prio); + cur_memcg_prio = READ_ONCE(cur_memcg->oom_prio); + rcu_read_unlock(); + + if (cur_memcg_prio == oc_memcg_prio) + return points < oc->chosen_points; + + /* if oc is low-priority, so skip the task */ + if (oc_memcg_prio == MEMCG_LOW_OOM_PRIORITY) + return true; + + return false; +} +#else +static inline bool oom_next_task(struct task_struct *task, + struct oom_control *oc, long points) +{ + return points == LONG_MIN || points < oc->chosen_points; +} +#endif + static int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; @@ -340,7 +382,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) }
points = oom_badness(task, oc->totalpages); - if (points == LONG_MIN || points < oc->chosen_points) + if (oom_next_task(task, oc, points)) goto next;
select: @@ -366,11 +408,16 @@ static void select_bad_process(struct oom_control *oc) { oc->chosen_points = LONG_MIN;
- if (is_memcg_oom(oc)) + if (is_memcg_oom(oc)) { mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); - else { + memcg_print_bad_task(oc); + } else { struct task_struct *p;
+#ifdef CONFIG_MEMCG_OOM_PRIORITY + if (memcg_oom_prio_scan_tasks(oom_evaluate_task, oc)) + return; +#endif rcu_read_lock(); for_each_process(p) if (oom_evaluate_task(p, oc)) @@ -426,9 +473,10 @@ static void dump_tasks(struct oom_control *oc) pr_info("Tasks state (memory values in pages):\n"); pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
- if (is_memcg_oom(oc)) + if (is_memcg_oom(oc)) { mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); - else { + memcg_print_bad_task(oc); + } else { struct task_struct *p;
rcu_read_lock();
From: Jing Xiangfeng jingxiangfeng@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8PXX8 CVE: NA
--------------------------------------
This patch adds the sysctl memcg_qos_enable to enable/disable the memcg OOM priority feature. To enable it, write 1 to the sysctl:
echo 1 > /proc/sys/vm/memcg_qos_enable
Signed-off-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Jinjiang Tu tujinjiang@huawei.com --- include/linux/memcontrol.h | 3 ++ mm/memcontrol.c | 79 ++++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 3 ++ 3 files changed, 85 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3a20aef438d..c42b78163e65 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,9 +356,12 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_OOM_PRIORITY #define MEMCG_LOW_OOM_PRIORITY -1 #define MEMCG_HIGH_OOM_PRIORITY 0 +extern int sysctl_memcg_oom_prio; + bool memcg_oom_prio_scan_tasks(int (*)(struct task_struct *, void *), void *); void memcg_print_bad_task(struct oom_control *oc); +bool memcg_oom_prio_disabled(void); #else static inline void memcg_print_bad_task(struct oom_control *oc) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 940022380b53..95df92ed8e39 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4056,6 +4056,15 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif
#ifdef CONFIG_MEMCG_OOM_PRIORITY +#define ENABLE_MEMCG_OOM_PROIRITY 1 +#define DISABLE_MEMCG_OOM_PROIRITY 0 +int sysctl_memcg_oom_prio = DISABLE_MEMCG_OOM_PROIRITY; + +bool memcg_oom_prio_disabled(void) +{ + return READ_ONCE(sysctl_memcg_oom_prio) == DISABLE_MEMCG_OOM_PROIRITY; +} + static void memcg_oom_prio_init(struct mem_cgroup *memcg) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); @@ -4087,6 +4096,9 @@ static s64 memcg_oom_prio_read(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ if (memcg_oom_prio_disabled()) + return 0; + return READ_ONCE(memcg->oom_prio); }
@@ -4095,6 +4107,9 @@ static int memcg_oom_prio_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ if (memcg_oom_prio_disabled()) + return -EACCES; + if (mem_cgroup_is_root(memcg)) return -EINVAL;
@@ -4145,6 +4160,8 @@ bool memcg_oom_prio_scan_tasks(int (*fn)(struct task_struct *, void *), int ret = 0; bool retry = true;
+ if (memcg_oom_prio_disabled()) + return false; retry: max = memcg_find_max_usage(last); if (!max) @@ -4177,6 +4194,9 @@ bool memcg_oom_prio_scan_tasks(int (*fn)(struct task_struct *, void *),
void memcg_print_bad_task(struct oom_control *oc) { + if (memcg_oom_prio_disabled()) + return; + if (oc->chosen) { struct mem_cgroup *memcg;
@@ -4188,6 +4208,64 @@ void memcg_print_bad_task(struct oom_control *oc) rcu_read_unlock(); } } + +static void memcg_oom_prio_reset(void) +{ + struct mem_cgroup *iter; + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_mem_cgroup->css) { + iter = mem_cgroup_from_css(css); + WRITE_ONCE(iter->oom_prio, 0); + } + rcu_read_unlock(); +} + +static int sysctl_memcg_oom_prio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; + + if (write) { + if (READ_ONCE(sysctl_memcg_oom_prio) == DISABLE_MEMCG_OOM_PROIRITY) + memcg_oom_prio_reset(); + } + + return ret; +} + +static struct ctl_table memcg_oom_prio_sysctls[] = { + { + /* + * This sysctl is used to control memcg oom priority + * feature, the sysctl name is for compatibility. + */ + .procname = "memcg_qos_enable", + .data = &sysctl_memcg_oom_prio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_memcg_oom_prio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static __init int memcg_oom_prio_sysctls_init(void) +{ + register_sysctl_init("vm", memcg_oom_prio_sysctls); + return 0; +} +#else +static inline int memcg_oom_prio_sysctls_init(void) +{ + return 0; +} + #endif
#ifdef CONFIG_NUMA @@ -7958,6 +8036,7 @@ static int __init mem_cgroup_init(void) }
mem_cgroup_memfs_info_init(); + memcg_oom_prio_sysctls_init();
return 0; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 38ed03c344f3..3da29ddfea1a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -318,6 +318,9 @@ static bool oom_next_task(struct task_struct *task, struct oom_control *oc, struct mem_cgroup *oc_memcg; int cur_memcg_prio, oc_memcg_prio;
+ if (memcg_oom_prio_disabled()) + return points == LONG_MIN || points < oc->chosen_points; + if (points == LONG_MIN) return true;
From: Jing Xiangfeng jingxiangfeng@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8PXX8 CVE: NA
--------------------------------------
Enable CONFIG_MEMCG_OOM_PRIORITY in the arm64 and x86 openeuler_defconfig files to support memcg OOM priority.
Signed-off-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Jinjiang Tu tujinjiang@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9297e3ca9f92..f388419a6a4a 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -159,6 +159,7 @@ CONFIG_PAGE_COUNTER=y CONFIG_MEMCG=y CONFIG_MEMCG_V1_RECLAIM=y CONFIG_MEMCG_MEMFS_INFO=y +CONFIG_MEMCG_OOM_PRIORITY=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index c7b4a4b80b66..6cad71d9f9c1 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -181,6 +181,7 @@ CONFIG_PAGE_COUNTER=y CONFIG_MEMCG=y CONFIG_MEMCG_V1_RECLAIM=y CONFIG_MEMCG_MEMFS_INFO=y +CONFIG_MEMCG_OOM_PRIORITY=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y