From: Jing Xiangfeng <jingxiangfeng@huawei.com>
hulk inclusion category: feature bugzilla: 51827 CVE: NA
--------------------------------------
When an OOM occurs, first try to kill a process from a low-priority memcg. If no such process is found, fall back to the normal OOM handling.
Signed-off-by: Jing Xiangfeng <jingxiangfeng@huawei.com> Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- include/linux/memcontrol.h | 12 ++++ mm/Kconfig | 12 ++++ mm/memcontrol.c | 127 +++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 52 ++++++++++++++- 4 files changed, 202 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f354e76221db2..c4d4658c7d6df 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -287,6 +287,12 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure;
+#ifdef CONFIG_MEMCG_QOS + /* Currently only 0 (normal) and -1 (low priority) are supported. + * In the future this can be extended to other values. + */ + int memcg_priority; +#endif #ifdef CONFIG_MEMCG_KMEM /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; @@ -321,6 +327,12 @@ struct mem_cgroup_extension { struct mem_cgroup memcg; };
+#ifdef CONFIG_MEMCG_QOS +bool memcg_low_priority_scan_tasks(int (*)(struct task_struct *, void *), + void *); +void memcg_print_bad_task(void *arg, int ret); +#endif + /* * size of first charge trial. "32" comes from vmscan.c's magic value. * TODO: maybe necessary to use big numbers in big irons. diff --git a/mm/Kconfig b/mm/Kconfig index 92be0a1ad61f7..7edf3c4c1252b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -512,6 +512,18 @@ config USERSWAP Support for User Swap. This is based on userfaultfd. We can implement our own swapout and swapin functions in usersapce.
+config MEMCG_QOS + bool "Enable Memory Cgroup Priority" + depends on MEMCG + depends on X86 || ARM64 + default y + help + With MEMCG_QOS enabled, when an OOM occurs the kernel first tries + to kill a process from a low-priority memcg. If no such process is + found, it falls back to the normal OOM handling. + + If unsure, say "n". + config CMA bool "Contiguous Memory Allocator" depends on HAVE_MEMBLOCK && MMU diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f401be9d45a5c..31352be988114 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1105,6 +1105,9 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, break; } } +#ifdef CONFIG_MEMCG_QOS + memcg_print_bad_task(arg, ret); +#endif return ret; }
@@ -3400,6 +3403,119 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_MEMCG_QOS +static void memcg_qos_init(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + + if (!parent) + return; + + if (parent->memcg_priority && parent->use_hierarchy) + memcg->memcg_priority = parent->memcg_priority; +} + +static s64 memcg_qos_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_from_css(css)->memcg_priority; +} + +static int memcg_qos_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val >= 0) + memcg->memcg_priority = 0; + else + memcg->memcg_priority = -1; + + return 0; +} + +static struct mem_cgroup *memcg_find_max_usage(struct mem_cgroup *last) +{ + struct mem_cgroup *iter, *max_memcg = NULL; + struct cgroup_subsys_state *css; + unsigned long usage, max_usage = 0; + + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_mem_cgroup->css) { + iter = mem_cgroup_from_css(css); + + if (!iter->memcg_priority || iter == root_mem_cgroup || + iter == last) + continue; + + usage = mem_cgroup_usage(iter, false); + if (usage > max_usage) { + max_usage = usage; + max_memcg = iter; + } + } + rcu_read_unlock(); + + return max_memcg; +} + +bool memcg_low_priority_scan_tasks(int (*fn)(struct task_struct *, void *), + void *arg) +{ + struct mem_cgroup *max, *last = NULL; + struct oom_control *oc = arg; + struct css_task_iter it; + struct task_struct *task; + int ret = 0; + bool retry = true; + +retry: + max = memcg_find_max_usage(last); + if (!max) + return false; + + css_task_iter_start(&max->css, 0, &it); + while (!ret && (task = css_task_iter_next(&it))) { + if (test_tsk_thread_flag(task, TIF_MEMDIE)) { + pr_info("task %s is dying.\n", task->comm); + continue; + } + + ret = fn(task, arg); + } + css_task_iter_end(&it); + + if (ret) + return false; + + if (!oc->chosen && retry) { + last = max; + retry = false; + goto retry; + } + + if (oc->chosen) + pr_info("The bad task [%d:%s] is from 
low-priority memcg.\n", + oc->chosen->pid, oc->chosen->comm); + + return oc->chosen ? true : false; +} + +void memcg_print_bad_task(void *arg, int ret) +{ + struct oom_control *oc = arg; + + if (!ret && oc->chosen) { + struct mem_cgroup *memcg; + + memcg = mem_cgroup_from_task(oc->chosen); + if (memcg->memcg_priority) + pr_info("The bad task [%d:%s] is from low-priority memcg.\n", + oc->chosen->pid, oc->chosen->comm); + } +} +#endif + #ifdef CONFIG_NUMA static int memcg_numa_stat_show(struct seq_file *m, void *v) { @@ -4324,6 +4440,13 @@ static struct cftype mem_cgroup_legacy_files[] = { { .name = "pressure_level", }, +#ifdef CONFIG_MEMCG_QOS + { + .name = "qos_level", + .read_s64 = memcg_qos_read, + .write_s64 = memcg_qos_write, + }, +#endif #ifdef CONFIG_NUMA { .name = "numa_stat", @@ -4657,6 +4780,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; }
+#ifdef CONFIG_MEMCG_QOS + memcg_qos_init(memcg); +#endif + /* Online state pins memcg ID, memcg ID pins CSS */ atomic_set(&memcg->id.ref, 1); css_get(css); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1c8236cbb9022..22a6f88d1f4d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -317,6 +317,49 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; }
+#ifdef CONFIG_MEMCG_QOS +/** + * We choose the task in low-priority memcg firstly. For the same state, we + * choose the task with the highest number of 'points'. + */ +static bool oom_next_task(struct task_struct *task, struct oom_control *oc, + unsigned long points) +{ + struct mem_cgroup *cur_memcg; + struct mem_cgroup *oc_memcg; + + + if (!points) + return true; + + if (!oc->chosen) + return false; + + oc_memcg = mem_cgroup_from_task(oc->chosen); + cur_memcg = mem_cgroup_from_task(task); + + if (cur_memcg->memcg_priority == oc_memcg->memcg_priority) { + if (points < oc->chosen_points) + return true; + return false; + } + /* if oc is low-priority, so skip the task */ + if (oc_memcg->memcg_priority) + return true; + + return false; +} +#else +static inline bool oom_next_task(struct task_struct *task, + struct oom_control *oc, unsigned long points) +{ + if (!points || points < oc->chosen_points) + return true; + + return false; +} +#endif + static int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; @@ -347,7 +390,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) }
points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); - if (!points || points < oc->chosen_points) + if (oom_next_task(task, oc, points)) goto next;
select: @@ -376,6 +419,13 @@ static void select_bad_process(struct oom_control *oc) else { struct task_struct *p;
+#ifdef CONFIG_MEMCG_QOS + if (memcg_low_priority_scan_tasks(oom_evaluate_task, oc)) { + oc->chosen_points = + oc->chosen_points * 1000 / oc->totalpages; + return; + } +#endif rcu_read_lock(); for_each_process(p) if (oom_evaluate_task(p, oc))