Lu Jialin (8): cgroup: Export cgroup.kill from cgroupv2 to cgroupv1 memcg: Export memcg.{min/low} from cgroupv2 to cgroupv1 memcg: Export memcg.high from cgroupv2 to cgroupv1 memcg: Export memory.events and memory.events.local from cgroupv2 to cgroupv1 memcg: Fix inconsistent oom event behavior for OOM_MEMCG_KILL memcg: enable memcg async reclaim memcg: export high_async_ratio to userland Kconfig: Introduce CONFIG_MEMCG_V1_THRESHOLD_QOS and CONFIG_CGROUP_V1_KILL
arch/arm64/configs/openeuler_defconfig | 2 + arch/x86/configs/openeuler_defconfig | 2 + include/linux/memcontrol.h | 24 +- init/Kconfig | 11 + kernel/cgroup/cgroup-internal.h | 3 + kernel/cgroup/cgroup-v1.c | 7 + kernel/cgroup/cgroup.c | 4 +- mm/memcontrol.c | 378 ++++++++++++++++++------- 8 files changed, 319 insertions(+), 112 deletions(-)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
--------------------------------
Export the cgroup.kill feature from cgroupv2 to cgroupv1, so that a user can kill all processes in a cgroup and its sub-cgroups at once instead of killing them one by one.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- kernel/cgroup/cgroup-internal.h | 3 +++ kernel/cgroup/cgroup-v1.c | 5 +++++ kernel/cgroup/cgroup.c | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c56071f150f2..d5a197d4b0ec 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -264,6 +264,9 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp);
+ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + /* * rstat.c */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 76db6c67e39a..c50d5da68f18 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -660,6 +660,11 @@ struct cftype cgroup1_base_files[] = { .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, + { + .name = "cgroup.kill", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_kill_write, + }, { } /* terminate */ };
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d..6030f015cb29 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3973,8 +3973,8 @@ static void cgroup_kill(struct cgroup *cgrp) __cgroup_kill(dsct); }
-static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) +ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) { ssize_t ret = 0; int kill;
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
--------------------------------
Export memory.min and memory.low from cgroupv2 to cgroupv1, in order to reduce the negative impact cgroups have on each other when system memory is insufficient.
Only add the memory.{min/low} entries to mem_cgroup_legacy_files and move the related functions in front of mem_cgroup_legacy_files. No other changes are needed.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- mm/memcontrol.c | 124 ++++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 56 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b009b233ab8..164ccd4003f6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5009,6 +5009,62 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p) } #endif
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) +{ + if (value == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); + + return 0; +} + +static int memory_min_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &low); + if (err) + return err; + + page_counter_set_low(&memcg->memory, low); + + return nbytes; +} + static int memory_stat_show(struct seq_file *m, void *v);
static struct cftype mem_cgroup_legacy_files[] = { @@ -5137,6 +5193,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, + { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, { }, /* terminate */ };
@@ -6401,16 +6469,6 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset) } #endif /* CONFIG_LRU_GEN */
-static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) -{ - if (value == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); - - return 0; -} - static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -6427,52 +6485,6 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css, return (u64)memcg->memory.watermark * PAGE_SIZE; }
-static int memory_min_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); -} - -static ssize_t memory_min_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long min; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &min); - if (err) - return err; - - page_counter_set_min(&memcg->memory, min); - - return nbytes; -} - -static int memory_low_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); -} - -static ssize_t memory_low_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long low; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &low); - if (err) - return err; - - page_counter_set_low(&memcg->memory, low); - - return nbytes; -} - static int memory_high_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
--------------------------------
Export memory.high from cgroupv2 to cgroupv1. When the usage of a memcg exceeds memory.high, some pages are reclaimed before the task returns to userland, which throttles the process.
Only add the memory.high entry to mem_cgroup_legacy_files and move the related functions in front of mem_cgroup_legacy_files. No other changes are needed.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- mm/memcontrol.c | 104 +++++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 49 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 164ccd4003f6..8e780953a1c9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5065,6 +5065,55 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, return nbytes; }
+static int memory_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + bool drained = false; + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->memory, high); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long reclaimed; + + if (nr_pages <= high) + break; + + if (signal_pending(current)) + break; + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + + if (!reclaimed && !nr_retries--) + break; + } + + memcg_wb_domain_size_changed(memcg); + return nbytes; +} + static int memory_stat_show(struct seq_file *m, void *v);
static struct cftype mem_cgroup_legacy_files[] = { @@ -5205,6 +5254,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memory_low_show, .write = memory_low_write, }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, { }, /* terminate */ };
@@ -6485,55 +6540,6 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css, return (u64)memcg->memory.watermark * PAGE_SIZE; }
-static int memory_high_show(struct seq_file *m, void *v) -{ - return seq_puts_memcg_tunable(m, - READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); -} - -static ssize_t memory_high_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned int nr_retries = MAX_RECLAIM_RETRIES; - bool drained = false; - unsigned long high; - int err; - - buf = strstrip(buf); - err = page_counter_memparse(buf, "max", &high); - if (err) - return err; - - page_counter_set_high(&memcg->memory, high); - - for (;;) { - unsigned long nr_pages = page_counter_read(&memcg->memory); - unsigned long reclaimed; - - if (nr_pages <= high) - break; - - if (signal_pending(current)) - break; - - if (!drained) { - drain_all_stock(memcg); - drained = true; - continue; - } - - reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); - - if (!reclaimed && !nr_retries--) - break; - } - - memcg_wb_domain_size_changed(memcg); - return nbytes; -} - static int memory_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
--------------------------------
Export "memory.events" and "memory.events.local" from cgroupv2 to cgroupv1.
There are some differences between v2 and v1:
1) MEMCG_OOM_GROUP_KILL events are not included in cgroupv1, because cgroupv1 has no memory.oom.group.
2) MEMCG_MAX events are reported as "limit_in_bytes" in cgroupv1 instead of memory.max.
3) The oom_kill event is included in memory.oom_control. Make oom_kill include the events of its descendants, and add oom_kill_local, which includes only the memcg's own oom_kill events.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- include/linux/memcontrol.h | 2 -- mm/memcontrol.c | 41 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e4e24da16d2c..d8f00bf0ce3c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1130,8 +1130,6 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, else cgroup_file_notify(&memcg->events_file);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e780953a1c9..1b9dff25a03f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4529,6 +4529,9 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); seq_printf(sf, "oom_kill %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); + seq_printf(sf, "oom_kill_local %lu\n", + atomic_long_read(&memcg->memory_events_local[MEMCG_OOM_KILL])); + return 0; }
@@ -5114,6 +5117,32 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, return nbytes; }
+static void __memcg_events_show(struct seq_file *m, atomic_long_t *events) +{ + seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); + seq_printf(m, "limit_in_bytes %lu\n", + atomic_long_read(&events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); +} + +static int memcg_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memcg_events_show(m, memcg->memory_events); + return 0; +} + +static int memcg_events_local_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memcg_events_show(m, memcg->memory_events_local); + return 0; +} + + static int memory_stat_show(struct seq_file *m, void *v);
static struct cftype mem_cgroup_legacy_files[] = { @@ -5260,6 +5289,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memory_high_show, .write = memory_high_write, }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_file), + .seq_show = memcg_events_show, + }, + { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_local_file), + .seq_show = memcg_events_local_show, + }, { }, /* terminate */ };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
--------------------------------
Since memory.events is now fully supported in cgroupv1, the problem of inconsistent OOM event behavior for MEMCG_OOM_KILL occurs again. Fix it by adding a new condition under which event propagation to the parent continues. Propagation therefore continues in two cases: 1) the memcg is not the root memcg; 2) the memcg is the root memcg and the event is MEMCG_OOM_KILL on cgroupv1.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- include/linux/memcontrol.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d8f00bf0ce3c..0d4e7f1945dd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1113,6 +1113,18 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_unlock(); }
+static bool memcg_event_add(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + if (!mem_cgroup_is_root(memcg)) + return true; + + if (event == MEMCG_OOM_KILL && !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; + + return false; +} + static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { @@ -1133,7 +1145,7 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); + memcg_event_add(memcg, event)); }
static inline void memcg_memory_event_mm(struct mm_struct *mm,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
-------------------------------
Introduce two memcg watermarks: a warning watermark and a safe watermark. warning watermark = memory.high * memory.high_async_ratio / 100; safe watermark = memory.high * (memory.high_async_ratio - 10) / 100; Start memcg async reclaim when memcg usage is larger than the warning watermark but smaller than memory.high; the reclaim target is the difference between the memcg usage and the safe watermark. The default memory.high_async_ratio is 100; when memory.high_async_ratio is 100, memcg async reclaim is disabled.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- include/linux/memcontrol.h | 2 ++ mm/memcontrol.c | 61 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 3 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0d4e7f1945dd..8e1ae2d252c6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -332,6 +332,8 @@ struct mem_cgroup { /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif + int high_async_ratio; + bool high_async_reclaim;
struct mem_cgroup_per_node *nodeinfo[]; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b9dff25a03f..bf225648fcb4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -105,6 +105,18 @@ static bool do_memsw_account(void) #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024
+/* + * memcg warning watermark = memory.high * memcg->high_async_ratio / + * HIGH_ASYNC_RATIO_BASE. + * when memcg usage is larger than warning watermark, but smaller than + * memory.high, start memcg async reclaim; + * when memcg->high_async_ratio is HIGH_ASYNC_RATIO_BASE, memcg async + * relcaim is disabled; + */ + +#define HIGH_ASYNC_RATIO_BASE 100 +#define HIGH_ASYNC_RATIO_GAP 10 + /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -2406,12 +2418,48 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, return nr_reclaimed; }
+static bool is_high_async_reclaim(struct mem_cgroup *memcg) +{ + int ratio = READ_ONCE(memcg->high_async_ratio); + unsigned long memcg_high = READ_ONCE(memcg->memory.high); + + if (ratio == HIGH_ASYNC_RATIO_BASE || memcg_high == PAGE_COUNTER_MAX) + return false; + + return page_counter_read(&memcg->memory) > + memcg_high * ratio / HIGH_ASYNC_RATIO_BASE; +} + +static void async_reclaim_high(struct mem_cgroup *memcg) +{ + unsigned long nr_pages, pflags; + unsigned long memcg_high = READ_ONCE(memcg->memory.high); + unsigned long memcg_usage = page_counter_read(&memcg->memory); + int ratio = READ_ONCE(memcg->high_async_ratio) - HIGH_ASYNC_RATIO_GAP; + unsigned long safe_pages = memcg_high * ratio / HIGH_ASYNC_RATIO_BASE; + + if (!is_high_async_reclaim(memcg)) { + WRITE_ONCE(memcg->high_async_reclaim, false); + return; + } + + psi_memstall_enter(&pflags); + nr_pages = memcg_usage > safe_pages ? memcg_usage - safe_pages : + MEMCG_CHARGE_BATCH; + try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + psi_memstall_leave(&pflags); + WRITE_ONCE(memcg->high_async_reclaim, false); +} + static void high_work_func(struct work_struct *work) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = container_of(work, struct mem_cgroup, + high_work);
- memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); + if (READ_ONCE(memcg->high_async_reclaim)) + async_reclaim_high(memcg); + else + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); }
/* @@ -2800,6 +2848,12 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, continue; }
+ if (is_high_async_reclaim(memcg) && !mem_high) { + WRITE_ONCE(memcg->high_async_reclaim, true); + schedule_work(&memcg->high_work); + break; + } + if (mem_high || swap_high) { /* * The allocating tasks in this cgroup will need to do @@ -5528,6 +5582,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) memcg->zswap_max = PAGE_COUNTER_MAX; #endif + memcg->high_async_ratio = HIGH_ASYNC_RATIO_BASE; page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
-------------------------------
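The user can set high_async_ratio to a value greater than 10 and at most 100 (values of 10 or less, or above 100, are rejected with -EINVAL); memcg high async reclaim starts when memcg_usage is larger than memory.high * high_async_ratio / 100.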
Signed-off-by: Lu Jialin lujialin4@huawei.com --- mm/memcontrol.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bf225648fcb4..5a808d88e27a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5196,6 +5196,35 @@ static int memcg_events_local_show(struct seq_file *m, void *v) return 0; }
+static int memcg_high_async_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", + READ_ONCE(mem_cgroup_from_seq(m)->high_async_ratio)); + return 0; +} + +static ssize_t memcg_high_async_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, high_async_ratio; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &high_async_ratio); + if (ret) + return ret; + + if (high_async_ratio > HIGH_ASYNC_RATIO_BASE || + high_async_ratio <= HIGH_ASYNC_RATIO_GAP) + return -EINVAL; + + WRITE_ONCE(memcg->high_async_ratio, high_async_ratio); + + return nbytes; +}
static int memory_stat_show(struct seq_file *m, void *v);
@@ -5355,6 +5384,13 @@ static struct cftype mem_cgroup_legacy_files[] = { .file_offset = offsetof(struct mem_cgroup, events_local_file), .seq_show = memcg_events_local_show, }, + { + .name = "high_async_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memcg_high_async_ratio_show, + .write = memcg_high_async_ratio_write, + }, + { }, /* terminate */ };
这个也跟上个补丁合成一个
按照功能合成一个
On 2023/12/2 12:12, Lu Jialin wrote:
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
User can set high_async_ratio from 10 to 100; start memcg high async when memcg_usage is larger than memory.high * high_async_ratio / 100;
Signed-off-by: Lu Jialin lujialin4@huawei.com
mm/memcontrol.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bf225648fcb4..5a808d88e27a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5196,6 +5196,35 @@ static int memcg_events_local_show(struct seq_file *m, void *v) return 0; }
+static int memcg_high_async_ratio_show(struct seq_file *m, void *v) +{
- seq_printf(m, "%d\n",
READ_ONCE(mem_cgroup_from_seq(m)->high_async_ratio));
- return 0;
+}
+static ssize_t memcg_high_async_ratio_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
+{
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- int ret, high_async_ratio;
- buf = strstrip(buf);
- if (!buf)
return -EINVAL;
- ret = kstrtoint(buf, 0, &high_async_ratio);
- if (ret)
return ret;
- if (high_async_ratio > HIGH_ASYNC_RATIO_BASE ||
high_async_ratio <= HIGH_ASYNC_RATIO_GAP)
return -EINVAL;
- WRITE_ONCE(memcg->high_async_ratio, high_async_ratio);
- return nbytes;
+}
static int memory_stat_show(struct seq_file *m, void *v);
@@ -5355,6 +5384,13 @@ static struct cftype mem_cgroup_legacy_files[] = { .file_offset = offsetof(struct mem_cgroup, events_local_file), .seq_show = memcg_events_local_show, },
- {
.name = "high_async_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memcg_high_async_ratio_show,
.write = memcg_high_async_ratio_write,
- },
- { }, /* terminate */ };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8JVN0
-------------------------------
Introduce CONFIG_MEMCG_V1_THRESHOLD_QOS and CONFIG_CGROUP_V1_KILL to isolate the memcg QoS management features and the cgroup kill feature from the baseline.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- arch/arm64/configs/openeuler_defconfig | 2 ++ arch/x86/configs/openeuler_defconfig | 2 ++ include/linux/memcontrol.h | 10 +++++++++- init/Kconfig | 11 +++++++++++ kernel/cgroup/cgroup-v1.c | 2 ++ mm/memcontrol.c | 16 ++++++++++++++-- 6 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 8f1a4db8d49b..ec9173c779e7 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -155,8 +155,10 @@ CONFIG_NUMA_BALANCING=y CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y CONFIG_CGROUPS=y CONFIG_PAGE_COUNTER=y +CONFIG_CGROUP_V1_KILL=y # CONFIG_CGROUP_FAVOR_DYNMODS is not set CONFIG_MEMCG=y +CONFIG_MEMCG_V1_THRESHOLD_QOS=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 507d199ff598..7c0e71021c42 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -176,9 +176,11 @@ CONFIG_ARCH_SUPPORTS_INT128=y CONFIG_NUMA_BALANCING=y CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y CONFIG_CGROUPS=y +CONFIG_CGROUP_V1_KILL=y CONFIG_PAGE_COUNTER=y # CONFIG_CGROUP_FAVOR_DYNMODS is not set CONFIG_MEMCG=y +CONFIG_MEMCG_V1_THRESHOLD_QOS=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8e1ae2d252c6..294ca0f5b55a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -332,8 +332,11 @@ struct mem_cgroup { /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif + +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS int high_async_ratio; bool high_async_reclaim; +#endif
struct mem_cgroup_per_node *nodeinfo[]; }; @@ -1121,8 +1124,10 @@ static bool memcg_event_add(struct mem_cgroup *memcg, if (!mem_cgroup_is_root(memcg)) return true;
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS if (event == MEMCG_OOM_KILL && !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return true; +#endif
return false; } @@ -1143,7 +1148,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, cgroup_file_notify(&memcg->swap_events_file); else cgroup_file_notify(&memcg->events_file); - +#ifndef CONFIG_MEMCG_V1_THRESHOLD_QOS + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + break; +#endif if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && diff --git a/init/Kconfig b/init/Kconfig index 6d35728b94b2..208c1002319d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -944,6 +944,12 @@ config MEMCG help Provides control over the memory footprint of tasks in a cgroup.
+config MEMCG_V1_THRESHOLD_QOS + bool "Qos memcg threshold in v1" + depends on MEMCG + default n + + config MEMCG_KMEM bool depends on MEMCG @@ -1175,6 +1181,11 @@ config SOCK_CGROUP_DATA bool default n
+config CGROUP_V1_KILL + bool "Kill All Tasks In Cgroup" + default n + depends on CGROUPS + endif # CGROUPS
menuconfig NAMESPACES diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c50d5da68f18..134a15e1d83a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -660,11 +660,13 @@ struct cftype cgroup1_base_files[] = { .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, +#ifdef CONFIG_CGROUP_V1_KILL { .name = "cgroup.kill", .flags = CFTYPE_NOT_ON_ROOT, .write = cgroup_kill_write, }, +#endif { } /* terminate */ };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5a808d88e27a..278a14c09f94 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2418,6 +2418,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, return nr_reclaimed; }
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS static bool is_high_async_reclaim(struct mem_cgroup *memcg) { int ratio = READ_ONCE(memcg->high_async_ratio); @@ -2450,15 +2451,18 @@ static void async_reclaim_high(struct mem_cgroup *memcg) psi_memstall_leave(&pflags); WRITE_ONCE(memcg->high_async_reclaim, false); } +#endif
static void high_work_func(struct work_struct *work) { struct mem_cgroup *memcg = container_of(work, struct mem_cgroup, high_work);
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS if (READ_ONCE(memcg->high_async_reclaim)) async_reclaim_high(memcg); else +#endif reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); }
@@ -2847,12 +2851,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, } continue; } - +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS if (is_high_async_reclaim(memcg) && !mem_high) { WRITE_ONCE(memcg->high_async_reclaim, true); schedule_work(&memcg->high_work); break; } +#endif
if (mem_high || swap_high) { /* @@ -4583,8 +4588,10 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); seq_printf(sf, "oom_kill %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS seq_printf(sf, "oom_kill_local %lu\n", atomic_long_read(&memcg->memory_events_local[MEMCG_OOM_KILL])); +#endif
return 0; } @@ -5171,6 +5178,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, return nbytes; }
+#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS static void __memcg_events_show(struct seq_file *m, atomic_long_t *events) { seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); @@ -5195,7 +5203,6 @@ static int memcg_events_local_show(struct seq_file *m, void *v) __memcg_events_show(m, memcg->memory_events_local); return 0; } - static int memcg_high_async_ratio_show(struct seq_file *m, void *v) { seq_printf(m, "%d\n", @@ -5225,6 +5232,7 @@ static ssize_t memcg_high_async_ratio_write(struct kernfs_open_file *of,
return nbytes; } +#endif
static int memory_stat_show(struct seq_file *m, void *v);
@@ -5354,6 +5362,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS { .name = "min", .flags = CFTYPE_NOT_ON_ROOT, @@ -5390,6 +5399,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_high_async_ratio_show, .write = memcg_high_async_ratio_write, }, +#endif
{ }, /* terminate */ }; @@ -5618,7 +5628,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) memcg->zswap_max = PAGE_COUNTER_MAX; #endif +#ifdef CONFIG_MEMCG_V1_THRESHOLD_QOS memcg->high_async_ratio = HIGH_ASYNC_RATIO_BASE; +#endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/3138 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/O...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3138 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/O...