From: Jing Xiangfeng jingxiangfeng@huawei.com
hulk inclusion category: feature bugzilla: 51827 CVE: NA
--------------------------------------
If OOM occurs, we first try to kill a process from a low-priority memcg. If no eligible process is found there, we fall back to the normal OOM handling.
Signed-off-by: Jing Xiangfeng jingxiangfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 12 ++++ mm/Kconfig | 12 ++++ mm/memcontrol.c | 127 +++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 52 ++++++++++++++- 4 files changed, 202 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f354e76221db2..c4d4658c7d6df 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -287,6 +287,12 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure;
+#ifdef CONFIG_MEMCG_QOS + /* Currently support 0 and -1. + * in the future it can expand to other value. + */ + int memcg_priority; +#endif #ifdef CONFIG_MEMCG_KMEM /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; @@ -321,6 +327,12 @@ struct mem_cgroup_extension { struct mem_cgroup memcg; };
+#ifdef CONFIG_MEMCG_QOS +bool memcg_low_priority_scan_tasks(int (*)(struct task_struct *, void *), + void *); +void memcg_print_bad_task(void *arg, int ret); +#endif + /* * size of first charge trial. "32" comes from vmscan.c's magic value. * TODO: maybe necessary to use big numbers in big irons. diff --git a/mm/Kconfig b/mm/Kconfig index 92be0a1ad61f7..7edf3c4c1252b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -512,6 +512,18 @@ config USERSWAP Support for User Swap. This is based on userfaultfd. We can implement our own swapout and swapin functions in usersapce.
+config MEMCG_QOS + bool "Enable Memory Cgroup Priority" + depends on MEMCG + depends on X86 || ARM64 + default y + help + MEMCG_QOS means that we first kill the process from the low priority + memcg if OOM occurs. If the process is not found, then fallback to + normal handle. + + If unsure, say "n". + config CMA bool "Contiguous Memory Allocator" depends on HAVE_MEMBLOCK && MMU diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f401be9d45a5c..31352be988114 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1105,6 +1105,9 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, break; } } +#ifdef CONFIG_MEMCG_QOS + memcg_print_bad_task(arg, ret); +#endif return ret; }
@@ -3400,6 +3403,119 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_MEMCG_QOS +static void memcg_qos_init(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + + if (!parent) + return; + + if (parent->memcg_priority && parent->use_hierarchy) + memcg->memcg_priority = parent->memcg_priority; +} + +static s64 memcg_qos_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_from_css(css)->memcg_priority; +} + +static int memcg_qos_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val >= 0) + memcg->memcg_priority = 0; + else + memcg->memcg_priority = -1; + + return 0; +} + +static struct mem_cgroup *memcg_find_max_usage(struct mem_cgroup *last) +{ + struct mem_cgroup *iter, *max_memcg = NULL; + struct cgroup_subsys_state *css; + unsigned long usage, max_usage = 0; + + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_mem_cgroup->css) { + iter = mem_cgroup_from_css(css); + + if (!iter->memcg_priority || iter == root_mem_cgroup || + iter == last) + continue; + + usage = mem_cgroup_usage(iter, false); + if (usage > max_usage) { + max_usage = usage; + max_memcg = iter; + } + } + rcu_read_unlock(); + + return max_memcg; +} + +bool memcg_low_priority_scan_tasks(int (*fn)(struct task_struct *, void *), + void *arg) +{ + struct mem_cgroup *max, *last = NULL; + struct oom_control *oc = arg; + struct css_task_iter it; + struct task_struct *task; + int ret = 0; + bool retry = true; + +retry: + max = memcg_find_max_usage(last); + if (!max) + return false; + + css_task_iter_start(&max->css, 0, &it); + while (!ret && (task = css_task_iter_next(&it))) { + if (test_tsk_thread_flag(task, TIF_MEMDIE)) { + pr_info("task %s is dying.\n", task->comm); + continue; + } + + ret = fn(task, arg); + } + css_task_iter_end(&it); + + if (ret) + return false; + + if (!oc->chosen && retry) { + last = max; + retry = false; + goto retry; + } + + if (oc->chosen) + pr_info("The bad task [%d:%s] is from low-priority memcg.\n", + oc->chosen->pid, oc->chosen->comm); + + return oc->chosen ? true : false; +} + +void memcg_print_bad_task(void *arg, int ret) +{ + struct oom_control *oc = arg; + + if (!ret && oc->chosen) { + struct mem_cgroup *memcg; + + memcg = mem_cgroup_from_task(oc->chosen); + if (memcg->memcg_priority) + pr_info("The bad task [%d:%s] is from low-priority memcg.\n", + oc->chosen->pid, oc->chosen->comm); + } +} +#endif + + #ifdef CONFIG_NUMA static int memcg_numa_stat_show(struct seq_file *m, void *v) { @@ -4324,6 +4440,13 @@ static struct cftype mem_cgroup_legacy_files[] = { { .name = "pressure_level", }, +#ifdef CONFIG_MEMCG_QOS + { + .name = "qos_level", + .read_s64 = memcg_qos_read, + .write_s64 = memcg_qos_write, + }, +#endif #ifdef CONFIG_NUMA { .name = "numa_stat", @@ -4657,6 +4780,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; }
+#ifdef CONFIG_MEMCG_QOS + memcg_qos_init(memcg); +#endif + /* Online state pins memcg ID, memcg ID pins CSS */ atomic_set(&memcg->id.ref, 1); css_get(css); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1c8236cbb9022..22a6f88d1f4d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -317,6 +317,49 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; }
+#ifdef CONFIG_MEMCG_QOS +/** + * We choose the task in low-priority memcg firstly. For the same state, we + * choose the task with the highest number of 'points'. + */ +static bool oom_next_task(struct task_struct *task, struct oom_control *oc, + unsigned long points) +{ + struct mem_cgroup *cur_memcg; + struct mem_cgroup *oc_memcg; + + + if (!points) + return true; + + if (!oc->chosen) + return false; + + oc_memcg = mem_cgroup_from_task(oc->chosen); + cur_memcg = mem_cgroup_from_task(task); + + if (cur_memcg->memcg_priority == oc_memcg->memcg_priority) { + if (points < oc->chosen_points) + return true; + return false; + } + /* if oc is low-priority, so skip the task */ + if (oc_memcg->memcg_priority) + return true; + + return false; +} +#else +static inline bool oom_next_task(struct task_struct *task, + struct oom_control *oc, unsigned long points) +{ + if (!points || points < oc->chosen_points) + return true; + + return false; +} +#endif + static int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; @@ -347,7 +390,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) }
points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); - if (!points || points < oc->chosen_points) + if (oom_next_task(task, oc, points)) goto next;
select: @@ -376,6 +419,13 @@ static void select_bad_process(struct oom_control *oc) else { struct task_struct *p;
+#ifdef CONFIG_MEMCG_QOS + if (memcg_low_priority_scan_tasks(oom_evaluate_task, oc)) { + oc->chosen_points = + oc->chosen_points * 1000 / oc->totalpages; + return; + } +#endif rcu_read_lock(); for_each_process(p) if (oom_evaluate_task(p, oc))
From: Jing Xiangfeng jingxiangfeng@huawei.com
hulk inclusion category: feature bugzilla: 51827 CVE: NA
------------------------------ Enable CONFIG_MEMCG_QOS in the hulk defconfigs (arm64 and x86) so the memcg OOM priority feature is built by default.
Signed-off-by: Jing Xiangfeng jingxiangfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/configs/hulk_defconfig | 1 + arch/x86/configs/hulk_defconfig | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index 969a41b20f660..741b33fab4e9c 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -970,6 +970,7 @@ CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_SHRINK_PAGECACHE=y CONFIG_USERSWAP=y +CONFIG_MEMCG_QOS=y CONFIG_CMA=y # CONFIG_CMA_DEBUG is not set # CONFIG_CMA_DEBUGFS is not set diff --git a/arch/x86/configs/hulk_defconfig b/arch/x86/configs/hulk_defconfig index 60f198771a207..6ea79ca8a8a1e 100644 --- a/arch/x86/configs/hulk_defconfig +++ b/arch/x86/configs/hulk_defconfig @@ -996,6 +996,7 @@ CONFIG_TRANSPARENT_HUGE_PAGECACHE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_SHRINK_PAGECACHE=y +CONFIG_MEMCG_QOS=y # CONFIG_CMA is not set CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZSWAP=y
From: Jing Xiangfeng jingxiangfeng@huawei.com
hulk inclusion category: feature bugzilla: 51827 CVE: NA
--------------------------------------
Adding the new memcg_priority field directly to struct mem_cgroup changed that structure's layout. Fix this by moving memcg_priority from struct mem_cgroup into struct mem_cgroup_extension, leaving struct mem_cgroup itself unmodified.
Signed-off-by: Jing Xiangfeng jingxiangfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 15 +++++++++------ mm/memcontrol.c | 34 +++++++++++++++++++++++++--------- mm/oom_kill.c | 8 ++++++-- 3 files changed, 40 insertions(+), 17 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c4d4658c7d6df..dce7aa54f2698 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -287,12 +287,6 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure;
-#ifdef CONFIG_MEMCG_QOS - /* Currently support 0 and -1. - * in the future it can expand to other value. - */ - int memcg_priority; -#endif #ifdef CONFIG_MEMCG_KMEM /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; @@ -321,12 +315,21 @@ struct mem_cgroup { };
struct mem_cgroup_extension { +#ifdef CONFIG_MEMCG_QOS + /* Currently support 0 and -1. + * in the future it can expand to other value. + */ + int memcg_priority; +#endif spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; struct mem_cgroup memcg; };
+#define to_memcg_ext(cgroup) \ + container_of(cgroup, struct mem_cgroup_extension, memcg) + #ifdef CONFIG_MEMCG_QOS bool memcg_low_priority_scan_tasks(int (*)(struct task_struct *, void *), void *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 31352be988114..bb6e7a0af502d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3407,29 +3407,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, static void memcg_qos_init(struct mem_cgroup *memcg) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct mem_cgroup_extension *memcg_ext; + struct mem_cgroup_extension *parent_ext;
if (!parent) return;
- if (parent->memcg_priority && parent->use_hierarchy) - memcg->memcg_priority = parent->memcg_priority; + memcg_ext = to_memcg_ext(memcg); + parent_ext = to_memcg_ext(parent); + + if (parent_ext->memcg_priority && parent->use_hierarchy) + memcg_ext->memcg_priority = parent_ext->memcg_priority; }
static s64 memcg_qos_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_css(css)->memcg_priority; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_extension *memcg_ext; + + memcg_ext = to_memcg_ext(memcg); + + return memcg_ext->memcg_priority; }
static int memcg_qos_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_extension *memcg_ext; + + memcg_ext = to_memcg_ext(memcg);
if (val >= 0) - memcg->memcg_priority = 0; + memcg_ext->memcg_priority = 0; else - memcg->memcg_priority = -1; + memcg_ext->memcg_priority = -1;
return 0; } @@ -3438,13 +3451,15 @@ static struct mem_cgroup *memcg_find_max_usage(struct mem_cgroup *last) { struct mem_cgroup *iter, *max_memcg = NULL; struct cgroup_subsys_state *css; + struct mem_cgroup_extension *memcg_ext; unsigned long usage, max_usage = 0;
rcu_read_lock(); css_for_each_descendant_pre(css, &root_mem_cgroup->css) { iter = mem_cgroup_from_css(css); + memcg_ext = to_memcg_ext(iter);
- if (!iter->memcg_priority || iter == root_mem_cgroup || + if (!memcg_ext->memcg_priority || iter == root_mem_cgroup || iter == last) continue;
@@ -3504,12 +3519,13 @@ bool memcg_low_priority_scan_tasks(int (*fn)(struct task_struct *, void *), void memcg_print_bad_task(void *arg, int ret) { struct oom_control *oc = arg; + struct mem_cgroup *memcg; + struct mem_cgroup_extension *memcg_ext;
if (!ret && oc->chosen) { - struct mem_cgroup *memcg; - memcg = mem_cgroup_from_task(oc->chosen); - if (memcg->memcg_priority) + memcg_ext = to_memcg_ext(memcg); + if (memcg_ext->memcg_priority) pr_info("The bad task [%d:%s] is from low-priority memcg.\n", oc->chosen->pid, oc->chosen->comm); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 22a6f88d1f4d9..38710c51bb40a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -327,6 +327,8 @@ static bool oom_next_task(struct task_struct *task, struct oom_control *oc, { struct mem_cgroup *cur_memcg; struct mem_cgroup *oc_memcg; + struct mem_cgroup_extension *cur_ext; + struct mem_cgroup_extension *oc_ext;
if (!points) @@ -337,14 +339,16 @@ static bool oom_next_task(struct task_struct *task, struct oom_control *oc,
oc_memcg = mem_cgroup_from_task(oc->chosen); cur_memcg = mem_cgroup_from_task(task); + oc_ext = to_memcg_ext(oc_memcg); + cur_ext = to_memcg_ext(cur_memcg);
- if (cur_memcg->memcg_priority == oc_memcg->memcg_priority) { + if (cur_ext->memcg_priority == oc_ext->memcg_priority) { if (points < oc->chosen_points) return true; return false; } /* if oc is low-priority, so skip the task */ - if (oc_memcg->memcg_priority) + if (oc_ext->memcg_priority) return true;
return false;
From: Jing Xiangfeng jingxiangfeng@huawei.com
hulk inclusion category: feature bugzilla: 51827 CVE: NA
--------------------------------------
This patch adds a default-false static key that gates the memcg priority feature, so the feature is disabled by default. It can be enabled at runtime by writing 1 to the sysctl:
echo 1 > /proc/sys/vm/memcg_qos_enable
Signed-off-by: Jing Xiangfeng jingxiangfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 7 +++++ kernel/sysctl.c | 11 ++++++++ mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 7 ++--- 4 files changed, 77 insertions(+), 4 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dce7aa54f2698..52604319712b9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -331,9 +331,16 @@ struct mem_cgroup_extension { container_of(cgroup, struct mem_cgroup_extension, memcg)
#ifdef CONFIG_MEMCG_QOS +#define ENABLE_MEMCG_QOS 1 +#define DISABLE_MEMCG_QOS 0 +extern int sysctl_memcg_qos_stat; +DECLARE_STATIC_KEY_FALSE(memcg_qos_stat_key); + bool memcg_low_priority_scan_tasks(int (*)(struct task_struct *, void *), void *); void memcg_print_bad_task(void *arg, int ret); +extern int sysctl_memcg_qos_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); #endif
/* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d111d02042eb7..af4d97b9dfce6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1449,6 +1449,17 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, +#endif +#ifdef CONFIG_MEMCG_QOS + { + .procname = "memcg_qos_enable", + .data = &sysctl_memcg_qos_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_memcg_qos_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "lowmem_reserve_ratio", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bb6e7a0af502d..c1871d7b134cf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3404,12 +3404,18 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif
#ifdef CONFIG_MEMCG_QOS +int sysctl_memcg_qos_stat = DISABLE_MEMCG_QOS; +DEFINE_STATIC_KEY_FALSE(memcg_qos_stat_key); + static void memcg_qos_init(struct mem_cgroup *memcg) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct mem_cgroup_extension *memcg_ext; struct mem_cgroup_extension *parent_ext;
+ if (!static_branch_likely(&memcg_qos_stat_key)) + return; + if (!parent) return;
@@ -3426,6 +3432,9 @@ static s64 memcg_qos_read(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_extension *memcg_ext;
+ if (!static_branch_likely(&memcg_qos_stat_key)) + return 0; + memcg_ext = to_memcg_ext(memcg);
return memcg_ext->memcg_priority; @@ -3437,6 +3446,9 @@ static int memcg_qos_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_extension *memcg_ext;
+ if (!static_branch_likely(&memcg_qos_stat_key)) + return -EACCES; + memcg_ext = to_memcg_ext(memcg);
if (val >= 0) @@ -3484,6 +3496,8 @@ bool memcg_low_priority_scan_tasks(int (*fn)(struct task_struct *, void *), int ret = 0; bool retry = true;
+ if (!static_branch_likely(&memcg_qos_stat_key)) + return false; retry: max = memcg_find_max_usage(last); if (!max) @@ -3522,6 +3536,9 @@ void memcg_print_bad_task(void *arg, int ret) struct mem_cgroup *memcg; struct mem_cgroup_extension *memcg_ext;
+ if (!static_branch_likely(&memcg_qos_stat_key)) + return; + if (!ret && oc->chosen) { memcg = mem_cgroup_from_task(oc->chosen); memcg_ext = to_memcg_ext(memcg); @@ -3530,6 +3547,45 @@ void memcg_print_bad_task(void *arg, int ret) oc->chosen->pid, oc->chosen->comm); } } + +static void memcg_qos_reset(void) +{ + struct mem_cgroup *iter; + struct cgroup_subsys_state *css; + struct mem_cgroup_extension *memcg_ext; + + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_mem_cgroup->css) { + iter = mem_cgroup_from_css(css); + memcg_ext = to_memcg_ext(iter); + + if (memcg_ext->memcg_priority) + memcg_ext->memcg_priority = 0; + } + rcu_read_unlock(); +} + +int sysctl_memcg_qos_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; + if (write) { + if (sysctl_memcg_qos_stat == ENABLE_MEMCG_QOS) { + static_branch_enable(&memcg_qos_stat_key); + pr_info("enable memcg priority.\n"); + } else { + static_branch_disable(&memcg_qos_stat_key); + memcg_qos_reset(); + pr_info("disable memcg priority.\n"); + } + } + + return ret; +} #endif
#ifdef CONFIG_NUMA diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 38710c51bb40a..2e09b03432c08 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -330,6 +330,8 @@ static bool oom_next_task(struct task_struct *task, struct oom_control *oc, struct mem_cgroup_extension *cur_ext; struct mem_cgroup_extension *oc_ext;
+ if (!static_branch_likely(&memcg_qos_stat_key)) + return !points || points < oc->chosen_points;
if (!points) return true; @@ -357,10 +359,7 @@ static bool oom_next_task(struct task_struct *task, struct oom_control *oc, static inline bool oom_next_task(struct task_struct *task, struct oom_control *oc, unsigned long points) { - if (!points || points < oc->chosen_points) - return true; - - return false; + return !points || points < oc->chosen_points; } #endif
From: Jing Xiangfeng jingxiangfeng@huawei.com
hulk inclusion category: feature bugzilla: 51827 CVE: NA
--------------------------------------
If a parent cgroup's qos_level is set, iterate over all cgroups in its subtree and update their memory.qos_level to the same value synchronously. Currently qos_level supports only the values 0 and -1.
Signed-off-by: Jing Xiangfeng jingxiangfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 51 ++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 22 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c1871d7b134cf..b42d615fa8479 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3407,6 +3407,25 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, int sysctl_memcg_qos_stat = DISABLE_MEMCG_QOS; DEFINE_STATIC_KEY_FALSE(memcg_qos_stat_key);
+static void memcg_hierarchy_qos_set(struct mem_cgroup *memcg, int val) +{ + struct mem_cgroup *iter; + struct cgroup_subsys_state *css; + struct mem_cgroup_extension *memcg_ext; + + if (!memcg) + memcg = root_mem_cgroup; + + rcu_read_lock(); + css_for_each_descendant_pre(css, &memcg->css) { + iter = mem_cgroup_from_css(css); + memcg_ext = to_memcg_ext(iter); + + memcg_ext->memcg_priority = val; + } + rcu_read_unlock(); +} + static void memcg_qos_init(struct mem_cgroup *memcg) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); @@ -3449,12 +3468,17 @@ static int memcg_qos_write(struct cgroup_subsys_state *css, if (!static_branch_likely(&memcg_qos_stat_key)) return -EACCES;
+ if (mem_cgroup_is_root(memcg)) + return -EINVAL; + + if (val != 0 && val != -1) + return -EINVAL; + memcg_ext = to_memcg_ext(memcg);
- if (val >= 0) - memcg_ext->memcg_priority = 0; - else - memcg_ext->memcg_priority = -1; + memcg_ext->memcg_priority = val; + if (memcg->use_hierarchy) + memcg_hierarchy_qos_set(memcg, val);
return 0; } @@ -3548,23 +3572,6 @@ void memcg_print_bad_task(void *arg, int ret) } }
-static void memcg_qos_reset(void) -{ - struct mem_cgroup *iter; - struct cgroup_subsys_state *css; - struct mem_cgroup_extension *memcg_ext; - - rcu_read_lock(); - css_for_each_descendant_pre(css, &root_mem_cgroup->css) { - iter = mem_cgroup_from_css(css); - memcg_ext = to_memcg_ext(iter); - - if (memcg_ext->memcg_priority) - memcg_ext->memcg_priority = 0; - } - rcu_read_unlock(); -} - int sysctl_memcg_qos_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -3579,7 +3586,7 @@ int sysctl_memcg_qos_handler(struct ctl_table *table, int write, pr_info("enable memcg priority.\n"); } else { static_branch_disable(&memcg_qos_stat_key); - memcg_qos_reset(); + memcg_hierarchy_qos_set(NULL, 0); pr_info("disable memcg priority.\n"); } }