hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7CGGT
CVE: NA
--------------------------------
The memsw counter limits the sum of memory and swap usage, but cannot limit the usage of swap space alone. Add a memory.swap.max interface to limit the difference between memsw.usage and memory.usage, i.e. the pages the cgroup has charged to swap. Since a page may occupy both a swap entry and a swap cache page at the same time, this value is not exactly equal to the real swap usage.
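For example, if memsw.usage is 300M and memory.usage is 200M, the accounted swap usage is 100M, so with memory.swap.max set to 100M any further swap-out of this cgroup's pages is refused (mem_cgroup_check_swap_for_v1() below returns -ENOMEM). A hypothetical session for illustration (the cgroup v1 mount point and the group name "test" are assumptions, and the swap QoS feature must first have been enabled through its sysctl, see sysctl_memcg_swap_qos_handler() below, since writes otherwise fail with -EACCES):

  # mkdir /sys/fs/cgroup/memory/test
  # echo 100M > /sys/fs/cgroup/memory/test/memory.swap.max
  # cat /sys/fs/cgroup/memory/test/memory.swap.max
  104857600

The limit is parsed by page_counter_memparse(), so "max" and byte values with K/M/G suffixes are accepted; reads report the limit in bytes.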
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 .../admin-guide/cgroup-v1/memory.rst |   1 +
 include/linux/memcontrol.h           |   9 ++
 mm/memcontrol.c                      | 137 +++++++++++++++++-
 3 files changed, 146 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 5eee7e3be4b2..962a80d44744 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -79,6 +79,7 @@ Brief summary of control files.
  memory.use_hierarchy		     set/show hierarchical account enabled
  memory.force_empty		     trigger forced page reclaim
  memory.force_swapin		     trigger forced swapin anon page
+ memory.swap.max		     set/show limit for swap
  memory.pressure_level	     set memory pressure notifications
  memory.swappiness		     set/show swappiness parameter of vmscan
				     (See sysctl's vm.swappiness)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 17ef01c63102..58f976014384 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -240,6 +240,10 @@ struct obj_cgroup {
 	};
 };
+struct swap_device {
+	unsigned long max;
+};
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -402,7 +406,12 @@ struct mem_cgroup {
 #else
 	KABI_RESERVE(6)
 #endif
+#ifdef CONFIG_MEMCG_SWAP_QOS
+	/* per-memcg swap device control; protected by swap_lock */
+	KABI_USE(7, struct swap_device *swap_dev)
+#else
 	KABI_RESERVE(7)
+#endif
 	KABI_RESERVE(8)
 	struct mem_cgroup_per_node *nodeinfo[0];
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4aad76ada6a7..abcfb313226d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4061,6 +4061,10 @@ static int sysctl_memcg_swap_qos_stat;
 static void memcg_swap_qos_reset(void)
 {
+	struct mem_cgroup *memcg;
+
+	for_each_mem_cgroup(memcg)
+		WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX);
 }
 static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write,
@@ -4130,6 +4134,124 @@ static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
 	return nbytes;
 }
+
+static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
+{
+	memcg->swap_dev = kmalloc(sizeof(struct swap_device), GFP_KERNEL);
+	if (!memcg->swap_dev)
+		return -ENOMEM;
+	return 0;
+}
+
+static void memcg_free_swap_device(struct mem_cgroup *memcg)
+{
+	if (!memcg->swap_dev)
+		return;
+
+	kfree(memcg->swap_dev);
+	memcg->swap_dev = NULL;
+}
+
+static void memcg_swap_device_init(struct mem_cgroup *memcg,
+				   struct mem_cgroup *parent)
+{
+	if (!static_branch_likely(&memcg_swap_qos_key) || !parent)
+		WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX);
+	else
+		WRITE_ONCE(memcg->swap_dev->max,
+			   READ_ONCE(parent->swap_dev->max));
+}
+
+u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return PAGE_COUNTER_MAX * PAGE_SIZE;
+
+	return READ_ONCE(memcg->swap_dev->max) * PAGE_SIZE;
+}
+
+static ssize_t memcg_swapmax_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long max;
+	int err;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return -EACCES;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &max);
+	if (err)
+		return err;
+
+	WRITE_ONCE(memcg->swap_dev->max, max);
+
+	return nbytes;
+}
+
+static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry)
+{
+	struct mem_cgroup *memcg, *target_memcg;
+	unsigned long swap_usage;
+	unsigned long swap_limit;
+	long nr_swap_pages = PAGE_COUNTER_MAX;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return 0;
+
+	if (!entry.val)
+		return 0;
+
+	rcu_read_lock();
+	target_memcg = page_memcg(page);
+	if (!target_memcg || mem_cgroup_is_root(target_memcg)) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	if (!css_tryget_online(&target_memcg->css)) {
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+
+	for (memcg = target_memcg; memcg != root_mem_cgroup;
+	     memcg = parent_mem_cgroup(memcg)) {
+		swap_limit = READ_ONCE(memcg->swap_dev->max);
+		swap_usage = page_counter_read(&memcg->memsw) -
+			     page_counter_read(&memcg->memory);
+		nr_swap_pages = min_t(long, nr_swap_pages,
+				      swap_limit - swap_usage);
+	}
+	css_put(&target_memcg->css);
+
+	if (thp_nr_pages(page) > nr_swap_pages)
+		return -ENOMEM;
+	return 0;
+}
+
+#else
+static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
+static void memcg_free_swap_device(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_swap_device_init(struct mem_cgroup *memcg,
+				   struct mem_cgroup *parent)
+{
+}
+
+static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry)
+{
+	return 0;
+}
 #endif
 #ifdef CONFIG_NUMA
@@ -5830,6 +5952,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.write = memory_swapin,
 	},
+	{
+		.name = "swap.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memcg_swapmax_write,
+		.read_u64 = memcg_swapmax_read,
+	},
 #endif
 	{
 		.name = "high_async_ratio",
@@ -5975,6 +6103,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
+	memcg_free_swap_device(memcg);
 	kfree(memcg);
 }
@@ -5999,6 +6128,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (!memcg)
 		return ERR_PTR(error);
+	if (memcg_alloc_swap_device(memcg))
+		goto fail;
+
 	memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1,
 				 MEM_CGROUP_ID_MAX, GFP_KERNEL);
@@ -6076,17 +6208,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		page_counter_init(&memcg->swap, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
+		memcg_swap_device_init(memcg, NULL);
 	} else if (parent->use_hierarchy) {
 		memcg->use_hierarchy = true;
 		page_counter_init(&memcg->memory, &parent->memory);
 		page_counter_init(&memcg->swap, &parent->swap);
 		page_counter_init(&memcg->kmem, &parent->kmem);
 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+		memcg_swap_device_init(memcg, parent);
 	} else {
 		page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
 		page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
 		page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
 		page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
+		memcg_swap_device_init(memcg, root_mem_cgroup);
 		/*
 		 * Deeper hierachy with use_hierarchy == false doesn't make
 		 * much sense so let cgroup subsystem know about this
@@ -8020,7 +8155,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 	unsigned short oldid;
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return 0;
+		return mem_cgroup_check_swap_for_v1(page, entry);
 
 	memcg = page_memcg(page);