From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q CVE: NA
--------------------------------
The memsw counter alone cannot limit the usage of swap space. Add a memory.swap.max interface that limits the difference between memsw.usage and memory.usage. Since a page may occupy both a swap entry and a swap cache page at the same time, this value is not exactly equal to swap.usage.
Signed-off-by: Liu Shixin liushixin2@huawei.com Signed-off-by: Jinjiang Tu tujinjiang@huawei.com --- .../admin-guide/cgroup-v1/memory.rst | 1 + include/linux/memcontrol.h | 8 ++ mm/memcontrol.c | 134 +++++++++++++++++- 3 files changed, 142 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 9bbd489136b2..f3ce13312604 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -84,6 +84,7 @@ Brief summary of control files. used. memory.force_empty trigger forced page reclaim memory.force_swapin trigger forced swapin anon page + memory.swap.max set/show limit for swap memory.pressure_level set memory pressure notifications memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 578f43b68392..7183a3767bf1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -201,6 +201,10 @@ struct obj_cgroup { }; };
+struct swap_device { + unsigned long max; /* swap limit in pages; initialised and reset to PAGE_COUNTER_MAX */ +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -350,6 +354,10 @@ struct mem_cgroup { bool high_async_reclaim; #endif
+#ifdef CONFIG_MEMCG_SWAP_QOS + struct swap_device *swap_dev; +#endif + struct mem_cgroup_per_node *nodeinfo[]; };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index deebbfcd5bf6..c4eb95b331d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4249,6 +4249,10 @@ static int sysctl_memcg_swap_qos_stat;
static void memcg_swap_qos_reset(void) { + struct mem_cgroup *memcg; + + for_each_mem_cgroup(memcg) + WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); }
static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, @@ -4309,6 +4313,122 @@ static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
return nbytes; } + +static int memcg_alloc_swap_device(struct mem_cgroup *memcg) +{ + memcg->swap_dev = kmalloc(sizeof(struct swap_device), GFP_KERNEL); + if (!memcg->swap_dev) + return -ENOMEM; + return 0; +} + +static void memcg_free_swap_device(struct mem_cgroup *memcg) +{ + if (!memcg->swap_dev) + return; + + kfree(memcg->swap_dev); + memcg->swap_dev = NULL; +} + +static void memcg_swap_device_init(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ + if (!static_branch_likely(&memcg_swap_qos_key) || !parent) + WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); + else + WRITE_ONCE(memcg->swap_dev->max, + READ_ONCE(parent->swap_dev->max)); +} + +u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (!static_branch_likely(&memcg_swap_qos_key)) + return PAGE_COUNTER_MAX * PAGE_SIZE; + + return READ_ONCE(memcg->swap_dev->max) * PAGE_SIZE; +} + +static ssize_t memcg_swapmax_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return -EACCES; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + WRITE_ONCE(memcg->swap_dev->max, max); + + return nbytes; +} + +static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry) +{ + struct mem_cgroup *memcg, *target_memcg; + unsigned long swap_usage; + unsigned long swap_limit; + long nr_swap_pages = PAGE_COUNTER_MAX; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return 0; + + if (!entry.val) + return 0; + + rcu_read_lock(); + target_memcg = folio_memcg(folio); + if (!target_memcg || mem_cgroup_is_root(target_memcg) || + !css_tryget_online(&target_memcg->css)) { + rcu_read_unlock(); + return 0; + } + + rcu_read_unlock(); + + for (memcg = target_memcg; memcg != root_mem_cgroup; + memcg = 
parent_mem_cgroup(memcg)) { + swap_limit = READ_ONCE(memcg->swap_dev->max); + swap_usage = page_counter_read(&memcg->memsw) - + page_counter_read(&memcg->memory); + nr_swap_pages = min_t(long, nr_swap_pages, + swap_limit - swap_usage); + } + css_put(&target_memcg->css); + + if (folio_nr_pages(folio) > nr_swap_pages) + return -ENOMEM; + + return 0; +} + +#else +static int memcg_alloc_swap_device(struct mem_cgroup *memcg) +{ + return 0; +} + +static void memcg_free_swap_device(struct mem_cgroup *memcg) +{ +} + +static void memcg_swap_device_init(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ +} + +static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry) +{ + return 0; +} #endif
#ifdef CONFIG_NUMA @@ -5807,6 +5927,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .write = memory_swapin, }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memcg_swapmax_write, + .read_u64 = memcg_swapmax_read, + }, #endif { }, /* terminate */ }; @@ -5943,6 +6069,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_node_info(memcg, node); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); + memcg_free_swap_device(memcg); kfree(memcg); }
@@ -5964,6 +6091,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return ERR_PTR(error);
+ if (memcg_alloc_swap_device(memcg)) + goto fail; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); if (memcg->id.id < 0) { @@ -6047,12 +6177,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); + memcg_swap_device_init(memcg, parent); } else { init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); + memcg_swap_device_init(memcg, NULL);
root_mem_cgroup = memcg; return &memcg->css; @@ -8301,7 +8433,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned short oldid;
if (do_memsw_account()) - return 0; + return mem_cgroup_check_swap_for_v1(folio, entry);
memcg = folio_memcg(folio);