v3->v2: correct the input parameter of try_to_free_mem_cgroup_pages v2->v1: introduce CONFIG_CGROUP_V1_KILL to isolate cgroup.kill in cgroupv1
Lu Jialin (9): memcg: Export memcg.{min/low} from cgroupv2 to cgroupv1 memcg: Export memcg.high from cgroupv2 to cgroupv1 memcg: Export memory.events and memory.events.local from cgroupv2 to cgroupv1 memcg: Fix inconsistent oom event behavior for OOM_MEMCG_KILL cgroup: Export cgroup.kill from cgroupv2 to cgroupv1 memcg: Introduce CONFIG_MEMCG_V1_THRESHOLD_QOS memcg: enable memcg async reclaim memcg: export high_async_ratio to userland cgroup: add config isolation for cgroup_kill in cgroupv1
arch/arm64/configs/openeuler_defconfig | 2 + arch/x86/configs/openeuler_defconfig | 2 + include/linux/memcontrol.h | 23 +- init/Kconfig | 10 + kernel/cgroup/cgroup-internal.h | 3 + kernel/cgroup/cgroup-v1.c | 7 + kernel/cgroup/cgroup.c | 4 +- mm/memcontrol.c | 382 ++++++++++++++++++------- 8 files changed, 321 insertions(+), 112 deletions(-)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
--------------------------------
Export memcg.min and memcg.low from cgroupv2 to cgroupv1, in order to reduce the negative impact between cgroups when the system memory is insufficient.
Only export the memory.{min/low} files in mem_cgroup_legacy_files and move the related functions in front of mem_cgroup_legacy_files. No other changes are needed.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- mm/memcontrol.c | 124 ++++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 56 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b27e245a055..f8bb8cfedda9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5059,6 +5059,62 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p) } #endif
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) +{ + if (value == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); + + return 0; +} + +static int memory_min_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &low); + if (err) + return err; + + page_counter_set_low(&memcg->memory, low); + + return nbytes; +} + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5185,6 +5241,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, + { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, { }, /* terminate */ };
@@ -6428,16 +6496,6 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset) } #endif /* CONFIG_LRU_GEN */
-static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) -{ - if (value == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); - - return 0; -} - static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -6454,52 +6512,6 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css, return (u64)memcg->memory.watermark * PAGE_SIZE; }
-static int memory_min_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); -} - -static ssize_t memory_min_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long min; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &min); - if (err) - return err; - - page_counter_set_min(&memcg->memory, min); - - return nbytes; -} - -static int memory_low_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); -} - -static ssize_t memory_low_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long low; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &low); - if (err) - return err; - - page_counter_set_low(&memcg->memory, low); - - return nbytes; -} - static int memory_high_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
--------------------------------
Export memory.high from cgroupv2 to cgroupv1. Therefore, when the usage of the memcg is larger than memory.high, some pages will be reclaimed before returning to userland, which will throttle the process.
Only export the memory.high file in mem_cgroup_legacy_files and move the related functions in front of mem_cgroup_legacy_files. No other changes are needed.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- mm/memcontrol.c | 104 +++++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 49 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f8bb8cfedda9..d9fdc566e431 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5115,6 +5115,55 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, return nbytes; }
+static int memory_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + bool drained = false; + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->memory, high); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long reclaimed; + + if (nr_pages <= high) + break; + + if (signal_pending(current)) + break; + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + + if (!reclaimed && !nr_retries--) + break; + } + + memcg_wb_domain_size_changed(memcg); + return nbytes; +} + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5253,6 +5302,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memory_low_show, .write = memory_low_write, }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, { }, /* terminate */ };
@@ -6512,55 +6567,6 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css, return (u64)memcg->memory.watermark * PAGE_SIZE; }
-static int memory_high_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); -} - -static ssize_t memory_high_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned int nr_retries = MAX_RECLAIM_RETRIES; - bool drained = false; - unsigned long high; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &high); - if (err) - return err; - - page_counter_set_high(&memcg->memory, high); - - for (;;) { - unsigned long nr_pages = page_counter_read(&memcg->memory); - unsigned long reclaimed; - - if (nr_pages <= high) - break; - - if (signal_pending(current)) - break; - - if (!drained) { - drain_all_stock(memcg); - drained = true; - continue; - } - - reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); - - if (!reclaimed && !nr_retries--) - break; - } - - memcg_wb_domain_size_changed(memcg); - return nbytes; -} - static int memory_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
--------------------------------
Export "memory.events" and "memory.events.local" from cgroupv2 to cgroupv1.
There are some differences between v2 and v1:
1) MEMCG_OOM_GROUP_KILL events are not included in cgroupv1, because there is no memory.oom.group in cgroupv1.
2) MEMCG_MAX events are represented as "limit_in_bytes" in cgroupv1 instead of memory.max.
3) The oom_kill event is included in memory.oom_control. Make oom_kill include its descendants' events, and add oom_kill_local, which includes only the memcg's own oom_kill events.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- include/linux/memcontrol.h | 2 -- mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 222d7370134c..ee855da6e370 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1129,8 +1129,6 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, else cgroup_file_notify(&memcg->events_file);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9fdc566e431..6e7049b44b94 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4575,6 +4575,9 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); seq_printf(sf, "oom_kill %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); + seq_printf(sf, "oom_kill_local %lu\n", + atomic_long_read(&memcg->memory_events_local[MEMCG_OOM_KILL])); + return 0; }
@@ -5164,6 +5167,31 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, return nbytes; }
+static void __memcg_events_show(struct seq_file *m, atomic_long_t *events) +{ + seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); + seq_printf(m, "limit_in_bytes %lu\n", + atomic_long_read(&events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); +} + +static int memcg_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memcg_events_show(m, memcg->memory_events); + return 0; +} + +static int memcg_events_local_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memcg_events_show(m, memcg->memory_events_local); + return 0; +} + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5308,6 +5336,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memory_high_show, .write = memory_high_write, }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_file), + .seq_show = memcg_events_show, + }, + { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_local_file), + .seq_show = memcg_events_local_show, + }, { }, /* terminate */ };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
--------------------------------
Since memory.events is fully supported in cgroupv1, the problem of inconsistent oom event behavior for OOM_MEMCG_KILL occurs again. We fix the problem by adding a new condition under which event propagation to the parent continues. There are therefore two conditions: 1) the memcg is not the root memcg; 2) the memcg is the root memcg and the event is OOM_MEMCG_KILL in cgroupv1.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- include/linux/memcontrol.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ee855da6e370..1dca6a4e4cac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1112,6 +1112,18 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_unlock(); }
+static bool memcg_event_add(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + if (!mem_cgroup_is_root(memcg)) + return true; + + if (event == MEMCG_OOM_KILL && !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; + + return false; +} + static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { @@ -1132,7 +1144,7 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); + memcg_event_add(memcg, event)); }
static inline void memcg_memory_event_mm(struct mm_struct *mm,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
--------------------------------
Export the cgroup.kill feature from cgroupv2 to cgroupv1. Users can therefore kill all processes in a cgroup and its sub-cgroups instead of killing them one by one.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- kernel/cgroup/cgroup-internal.h | 3 +++ kernel/cgroup/cgroup-v1.c | 5 +++++ kernel/cgroup/cgroup.c | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 367b0a42ada9..172a7a5bd742 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -266,6 +266,9 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp);
+ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + /* * rstat.c */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 5407241dbb45..aa829c08bc9b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -661,6 +661,11 @@ struct cftype cgroup1_base_files[] = { .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, + { + .name = "cgroup.kill", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_kill_write, + }, { } /* terminate */ };
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4d42f0cbc11e..ea66b93f2671 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3989,8 +3989,8 @@ static void cgroup_kill(struct cgroup *cgrp) __cgroup_kill(dsct); }
-static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) +ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) { ssize_t ret = 0; int kill;
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
-------------------------------
Introduce CONFIG_MEMCG_V1_THRESHOLD_QOS to isolate the memcg QoS management feature from the baseline.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + include/linux/memcontrol.h | 7 ++++++- init/Kconfig | 5 +++++ mm/memcontrol.c | 6 ++++++ 5 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 63abdb3f8c63..5d19beb1fdca 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -156,6 +156,7 @@ CONFIG_CGROUPS=y CONFIG_PAGE_COUNTER=y # CONFIG_CGROUP_FAVOR_DYNMODS is not set CONFIG_MEMCG=y +CONFIG_MEMCG_V1_THRESHOLD_QOS=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 0e05e7a15fdb..ff975913e9bf 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -179,6 +179,7 @@ CONFIG_CGROUPS=y CONFIG_PAGE_COUNTER=y # CONFIG_CGROUP_FAVOR_DYNMODS is not set CONFIG_MEMCG=y +CONFIG_MEMCG_V1_THRESHOLD_QOS=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1dca6a4e4cac..f654309fcb26 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1118,8 +1118,10 @@ static bool memcg_event_add(struct mem_cgroup *memcg, if (!mem_cgroup_is_root(memcg)) return true;
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS if (event == MEMCG_OOM_KILL && !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return true; +#endif
return false; } @@ -1140,7 +1142,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, cgroup_file_notify(&memcg->swap_events_file); else cgroup_file_notify(&memcg->events_file); - +#ifndef CONFIG_MEMCG_V1_THRESHOLD_QOS + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + break; +#endif if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && diff --git a/init/Kconfig b/init/Kconfig index 32c24950c4ce..dc6dfe94fb93 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -943,6 +943,11 @@ config MEMCG help Provides control over the memory footprint of tasks in a cgroup.
+config MEMCG_V1_THRESHOLD_QOS + bool "Qos memcg threshold in v1" + depends on MEMCG + default n + config MEMCG_KMEM bool depends on MEMCG diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e7049b44b94..5b2367cec32f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4575,8 +4575,10 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); seq_printf(sf, "oom_kill %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS seq_printf(sf, "oom_kill_local %lu\n", atomic_long_read(&memcg->memory_events_local[MEMCG_OOM_KILL])); +#endif
return 0; } @@ -5167,6 +5169,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, return nbytes; }
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS static void __memcg_events_show(struct seq_file *m, atomic_long_t *events) { seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); @@ -5191,6 +5194,7 @@ static int memcg_events_local_show(struct seq_file *m, void *v) __memcg_events_show(m, memcg->memory_events_local); return 0; } +#endif
static struct cftype mem_cgroup_legacy_files[] = { { @@ -5318,6 +5322,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS { .name = "min", .flags = CFTYPE_NOT_ON_ROOT, @@ -5348,6 +5353,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .file_offset = offsetof(struct mem_cgroup, events_local_file), .seq_show = memcg_events_local_show, }, +#endif { }, /* terminate */ };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
--------------------------------
Introduce two memcg watermarks: a warning watermark and a safe watermark. warning watermark = memory.high * memory.high_async_ratio / 100; safe watermark = memory.high * (memory.high_async_ratio - 10) / 100; Start memcg async reclaim when memcg usage is larger than the warning watermark but smaller than memory.high; the reclaim target is the difference between memcg usage and the safe watermark. The default memory.high_async_ratio is 100; when memory.high_async_ratio is 100, memcg async reclaim is disabled;
Signed-off-by: Lu Jialin lujialin4@huawei.com --- include/linux/memcontrol.h | 4 +++ mm/memcontrol.c | 69 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 3 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f654309fcb26..d786fb04faee 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -326,6 +326,10 @@ struct mem_cgroup { struct lru_gen_mm_list mm_list; #endif
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS + int high_async_ratio; + bool high_async_reclaim; +#endif struct mem_cgroup_per_node *nodeinfo[]; };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b2367cec32f..59f5db312d99 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -105,6 +105,18 @@ static bool do_memsw_account(void) #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024
+/* + * memcg warning watermark = memory.high * memcg->high_async_ratio / + * HIGH_ASYNC_RATIO_BASE. + * when memcg usage is larger than warning watermark, but smaller than + * memory.high, start memcg async reclaim; + * when memcg->high_async_ratio is HIGH_ASYNC_RATIO_BASE, memcg async + * reclaim is disabled; + */ + +#define HIGH_ASYNC_RATIO_BASE 100 +#define HIGH_ASYNC_RATIO_GAP 10 + /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -2439,12 +2451,52 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, return nr_reclaimed; }
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS +static bool is_high_async_reclaim(struct mem_cgroup *memcg) +{ + int ratio = READ_ONCE(memcg->high_async_ratio); + unsigned long memcg_high = READ_ONCE(memcg->memory.high); + + if (ratio == HIGH_ASYNC_RATIO_BASE || memcg_high == PAGE_COUNTER_MAX) + return false; + + return page_counter_read(&memcg->memory) > + memcg_high * ratio / HIGH_ASYNC_RATIO_BASE; +} + +static void async_reclaim_high(struct mem_cgroup *memcg) +{ + unsigned long nr_pages, pflags; + unsigned long memcg_high = READ_ONCE(memcg->memory.high); + unsigned long memcg_usage = page_counter_read(&memcg->memory); + int ratio = READ_ONCE(memcg->high_async_ratio) - HIGH_ASYNC_RATIO_GAP; + unsigned long safe_pages = memcg_high * ratio / HIGH_ASYNC_RATIO_BASE; + + if (!is_high_async_reclaim(memcg)) { + WRITE_ONCE(memcg->high_async_reclaim, false); + return; + } + + psi_memstall_enter(&pflags); + nr_pages = memcg_usage > safe_pages ? memcg_usage - safe_pages : + MEMCG_CHARGE_BATCH; + try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + psi_memstall_leave(&pflags); + WRITE_ONCE(memcg->high_async_reclaim, false); +} +#endif + static void high_work_func(struct work_struct *work) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = container_of(work, struct mem_cgroup, + high_work);
- memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS + if (READ_ONCE(memcg->high_async_reclaim)) + async_reclaim_high(memcg); + else +#endif + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); }
/* @@ -2833,6 +2885,14 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, continue; }
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS + if (is_high_async_reclaim(memcg) && !mem_high) { + WRITE_ONCE(memcg->high_async_reclaim, true); + schedule_work(&memcg->high_work); + break; + } +#endif + if (mem_high || swap_high) { /* * The allocating tasks in this cgroup will need to do @@ -5580,6 +5640,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) memcg->zswap_max = PAGE_COUNTER_MAX; +#endif +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS + memcg->high_async_ratio = HIGH_ASYNC_RATIO_BASE; #endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) {
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
--------------------------------
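Users can set high_async_ratio to any value greater than 10 and up to 100 (values of 10 or less, or above 100, are rejected with -EINVAL); memcg high async reclaim starts when memcg usage is larger than memory.high * high_async_ratio / 100.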
Signed-off-by: Lu Jialin lujialin4@huawei.com --- mm/memcontrol.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59f5db312d99..3734bc00de72 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5256,6 +5256,38 @@ static int memcg_events_local_show(struct seq_file *m, void *v) } #endif
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS +static int memcg_high_async_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", + READ_ONCE(mem_cgroup_from_seq(m)->high_async_ratio)); + return 0; +} + +static ssize_t memcg_high_async_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, high_async_ratio; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &high_async_ratio); + if (ret) + return ret; + + if (high_async_ratio > HIGH_ASYNC_RATIO_BASE || + high_async_ratio <= HIGH_ASYNC_RATIO_GAP) + return -EINVAL; + + WRITE_ONCE(memcg->high_async_ratio, high_async_ratio); + + return nbytes; +} +#endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5413,6 +5445,13 @@ static struct cftype mem_cgroup_legacy_files[] = { .file_offset = offsetof(struct mem_cgroup, events_local_file), .seq_show = memcg_events_local_show, }, + { + .name = "high_async_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memcg_high_async_ratio_show, + .write = memcg_high_async_ratio_write, + }, + #endif { }, /* terminate */ };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7VV7N
-------------------------------
Introduce CONFIG_CGROUP_V1_KILL to isolate cgroup_kill feature in cgroupv1.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + init/Kconfig | 5 +++++ kernel/cgroup/cgroup-v1.c | 2 ++ 4 files changed, 9 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 5d19beb1fdca..8603c76dda05 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -153,6 +153,7 @@ CONFIG_ARCH_SUPPORTS_INT128=y CONFIG_NUMA_BALANCING=y CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y CONFIG_CGROUPS=y +CONFIG_CGROUP_V1_KILL=y CONFIG_PAGE_COUNTER=y # CONFIG_CGROUP_FAVOR_DYNMODS is not set CONFIG_MEMCG=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index ff975913e9bf..2bf33bfaa72f 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -176,6 +176,7 @@ CONFIG_ARCH_SUPPORTS_INT128=y CONFIG_NUMA_BALANCING=y CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y CONFIG_CGROUPS=y +CONFIG_CGROUP_V1_KILL=y CONFIG_PAGE_COUNTER=y # CONFIG_CGROUP_FAVOR_DYNMODS is not set CONFIG_MEMCG=y diff --git a/init/Kconfig b/init/Kconfig index dc6dfe94fb93..dce79d24c1f0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1179,6 +1179,11 @@ config SOCK_CGROUP_DATA bool default n
+config CGROUP_V1_KILL + bool "Kill All Tasks In Cgroup" + default n + depends on CGROUPS + endif # CGROUPS
menuconfig NAMESPACES diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index aa829c08bc9b..8dd9ba1d7f33 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -661,11 +661,13 @@ struct cftype cgroup1_base_files[] = { .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, +#ifdef CONFIG_CGROUP_V1_KILL { .name = "cgroup.kill", .flags = CFTYPE_NOT_ON_ROOT, .write = cgroup_kill_write, }, +#endif { } /* terminate */ };
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/2007 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/S...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/2007 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/S...