hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7CGGT CVE: NA
--------------------------------
Add a memory.swapfile interface that limits which swap device a memcg may use. The accepted values are 'all', 'none', or the path of an active swap device. Usage: echo /dev/zram0 > memory.swapfile
If the configured swap device is swapped off, memory.swapfile falls back to 'none'.
Signed-off-by: Liu Shixin liushixin2@huawei.com --- .../admin-guide/cgroup-v1/memory.rst | 1 + include/linux/memcontrol.h | 26 ++++ include/linux/swap.h | 6 +- mm/Kconfig | 2 +- mm/memcontrol.c | 146 ++++++++++++++++++ mm/swap_slots.c | 14 +- mm/swapfile.c | 86 ++++++++++- 7 files changed, 273 insertions(+), 8 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 5eee7e3be4b2..97bad1406a23 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -79,6 +79,7 @@ Brief summary of control files. memory.use_hierarchy set/show hierarchical account enabled memory.force_empty trigger forced page reclaim memory.force_swapin trigger forced swapin anon page + memory.swapfile set/show swap file memory.pressure_level set memory pressure notifications memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4395f2e03cb7..6b55bfb1cd5b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -50,6 +50,11 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, };
+enum { + SWAP_TYPE_ALL = -1, /* allowed to use any swap device */ + SWAP_TYPE_NONE = -2, /* not allowed to use any swap device */ +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; unsigned int generation; @@ -240,6 +245,10 @@ struct obj_cgroup { }; };
+struct swap_device { + int type; +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -402,7 +411,12 @@ struct mem_cgroup { #else KABI_RESERVE(6) #endif +#ifdef CONFIG_MEMCG_SWAP_QOS + /* per-memcg swap device control; protected by swap_lock */ + KABI_USE(7, struct swap_device *swap_dev) +#else KABI_RESERVE(7) +#endif KABI_RESERVE(8)
struct mem_cgroup_per_node *nodeinfo[0]; @@ -1292,6 +1306,9 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
int mem_cgroup_force_empty(struct mem_cgroup *memcg);
+int memcg_get_swap_type(struct page *page); +void memcg_remove_swapfile(int type); + #else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0 @@ -1695,6 +1712,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, { return 0; } + +static inline int memcg_get_swap_type(struct page *page) +{ + return SWAP_TYPE_ALL; +} + +static inline void memcg_remove_swapfile(int type) +{ +} #endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/include/linux/swap.h b/include/linux/swap.h index b98b4c9df622..b40cc0500b42 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -515,7 +515,8 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size, + int type); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); @@ -547,6 +548,9 @@ static inline void put_swap_device(struct swap_info_struct *si) percpu_ref_put(&si->sei->users); }
+extern int write_swapfile_for_memcg(struct address_space *mapping, + int *swap_type); +extern void read_swapfile_for_memcg(struct seq_file *m, int type); #else /* CONFIG_SWAP */
static inline int swap_readpage(struct page *page, bool do_poll) diff --git a/mm/Kconfig b/mm/Kconfig index 4c7569970c69..c43c2e6b744f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -518,7 +518,7 @@ config MEMCG_SWAP_QOS depends on X86 || ARM64 default n help - Support swapin memory for memcg. + Support swapin memory for memcg. Support swapfile limit for memcg.
config ETMEM_SCAN tristate "module: etmem page scan for etmem support" diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 10e38edf6b87..33ecb52248e2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5365,6 +5365,139 @@ static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
return nbytes; } + +static int memcg_alloc_swap_device(struct mem_cgroup *memcg) +{ + memcg->swap_dev = kmalloc(sizeof(*memcg->swap_dev), GFP_KERNEL); + if (!memcg->swap_dev) + return -ENOMEM; + return 0; +} + +static void memcg_free_swap_device(struct mem_cgroup *memcg) +{ + if (!memcg->swap_dev) + return; + + kfree(memcg->swap_dev); + memcg->swap_dev = NULL; +} + +static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct filename *pathname; + struct file *swapfile; + int ret; + + buf = strstrip(buf); + + if (!strcmp(buf, "none")) { + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE); /* lockless reader */ + return nbytes; + } else if (!strcmp(buf, "all")) { + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + return nbytes; + } + + pathname = getname_kernel(buf); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + + swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swapfile)) { + putname(pathname); + return PTR_ERR(swapfile); + } + ret = write_swapfile_for_memcg(swapfile->f_mapping, + &memcg->swap_dev->type); + filp_close(swapfile, NULL); + putname(pathname); + + return ret < 0 ? 
ret : nbytes; +} + +static int memcg_swapfile_read(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + if (memcg->swap_dev->type == SWAP_TYPE_NONE) + seq_puts(m, "none\n"); + else if (memcg->swap_dev->type == SWAP_TYPE_ALL) + seq_puts(m, "all\n"); + else + read_swapfile_for_memcg(m, memcg->swap_dev->type); + return 0; +} + +static void memcg_copy_swap_device(struct mem_cgroup *dst, + struct mem_cgroup *src) +{ + if (!src) + dst->swap_dev->type = SWAP_TYPE_ALL; + else + dst->swap_dev->type = src->swap_dev->type; +} + +int memcg_get_swap_type(struct page *page) +{ + struct mem_cgroup *memcg; + int type; + + if (mem_cgroup_disabled() || !page) + return SWAP_TYPE_ALL; + + memcg = page_memcg(page); + if (!memcg || mem_cgroup_is_root(memcg)) + return SWAP_TYPE_ALL; + + rcu_read_lock(); + if (!css_tryget_online(&memcg->css)) { + rcu_read_unlock(); + return SWAP_TYPE_ALL; + } + rcu_read_unlock(); + + type = READ_ONCE(memcg->swap_dev->type); + css_put(&memcg->css); + return type; +} + +void memcg_remove_swapfile(int type) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + for_each_mem_cgroup(memcg) + if (memcg->swap_dev->type == type) + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE); +} +#else +static int memcg_alloc_swap_device(struct mem_cgroup *memcg) +{ + return 0; +} + +static void memcg_free_swap_device(struct mem_cgroup *memcg) +{ +} + +static void memcg_copy_swap_device(struct mem_cgroup *dst, + struct mem_cgroup *src) +{ +} + +int memcg_get_swap_type(struct page *page) +{ + return SWAP_TYPE_ALL; +} + +void memcg_remove_swapfile(int type) +{ +} #endif
static int memcg_high_async_ratio_show(struct seq_file *m, void *v) @@ -5775,6 +5908,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .write = memory_swapin, }, + { + .name = "swapfile", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memcg_swapfile_write, + .seq_show = memcg_swapfile_read, + }, #endif { .name = "high_async_ratio", @@ -5920,6 +6059,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); + memcg_free_swap_device(memcg); kfree(memcg); }
@@ -5944,6 +6084,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return ERR_PTR(error);
+ if (memcg_alloc_swap_device(memcg)) + goto fail; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX, GFP_KERNEL); @@ -6021,17 +6164,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); + memcg_copy_swap_device(memcg, NULL); } else if (parent->use_hierarchy) { memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); + memcg_copy_swap_device(memcg, parent); } else { page_counter_init(&memcg->memory, &root_mem_cgroup->memory); page_counter_init(&memcg->swap, &root_mem_cgroup->swap); page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); + memcg_copy_swap_device(memcg, root_mem_cgroup); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0357fbe70645..3d4e4c230305 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -266,7 +266,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1); + cache->slots, 1, SWAP_TYPE_ALL);
return cache->nr; } @@ -307,12 +307,17 @@ swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry; struct swap_slots_cache *cache; + int type;
entry.val = 0;
+ type = memcg_get_swap_type(page); + if (type == SWAP_TYPE_NONE) + goto out; + if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, &entry, HPAGE_PMD_NR); + get_swap_pages(1, &entry, HPAGE_PMD_NR, type); goto out; }
@@ -327,7 +332,8 @@ swp_entry_t get_swap_page(struct page *page) */ cache = raw_cpu_ptr(&swp_slots);
- if (likely(check_cache_active() && cache->slots)) { + if (likely(check_cache_active() && cache->slots) && + type == SWAP_TYPE_ALL) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: @@ -344,7 +350,7 @@ swp_entry_t get_swap_page(struct page *page) goto out; }
- get_swap_pages(1, &entry, 1); + get_swap_pages(1, &entry, 1, type); out: if (mem_cgroup_try_charge_swap(page, entry)) { put_swap_page(page, entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 14e2396fa8a3..3e0533139dfa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1056,7 +1056,83 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) +#ifdef CONFIG_MEMCG_SWAP_QOS +int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) +{ + struct swap_info_struct *si; + unsigned int type; + int ret = -EINVAL; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + si = swap_info[type]; + if ((si->flags & SWP_WRITEOK) && + (si->swap_file->f_mapping == mapping)) { + WRITE_ONCE(*swap_type, type); /* lockless reader */ + ret = 0; + break; + } + } + spin_unlock(&swap_lock); + return ret; +} + +void read_swapfile_for_memcg(struct seq_file *m, int type) +{ + struct swap_info_struct *si; + + spin_lock(&swap_lock); + if (type < nr_swapfiles) { + si = swap_info[type]; + if (si->flags & SWP_WRITEOK) { + seq_file_path(m, si->swap_file, "\t\n\\"); + seq_putc(m, '\n'); + } + } + spin_unlock(&swap_lock); +} + +static long get_avail_pages(unsigned long size, int type) +{ + struct swap_info_struct *si; + long avail_pgs = 0; + + if (type == SWAP_TYPE_ALL) + return atomic_long_read(&nr_swap_pages) / size; + + spin_unlock(&swap_avail_lock); + spin_lock(&swap_lock); + if (type < nr_swapfiles) { + si = swap_info[type]; + if (si->flags & SWP_WRITEOK) + avail_pgs = (si->pages - si->inuse_pages) / size; /* entries, not pages */ + } + spin_unlock(&swap_lock); + spin_lock(&swap_avail_lock); + return avail_pgs; +} + +static inline bool should_skip_swap_type(int swap_type, int type) +{ + if (type == SWAP_TYPE_ALL) + return false; + + return (type != swap_type); +} +#else +static inline long get_avail_pages(unsigned long size, int type) +{ + return atomic_long_read(&nr_swap_pages) / size; +} + +static inline bool should_skip_swap_type(int swap_type, int type) +{ + return false; +} +#endif + +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size, + int type) { unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; @@ -1069,7 +1145,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
spin_lock(&swap_avail_lock);
- avail_pgs = atomic_long_read(&nr_swap_pages) / size; + avail_pgs = get_avail_pages(size, type); if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; @@ -1086,6 +1162,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); + if (should_skip_swap_type(si->type, type)) { + spin_unlock(&si->lock); + spin_lock(&swap_avail_lock); + goto nextsi; + } if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { @@ -2703,6 +2784,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) cluster_info = p->cluster_info; p->cluster_info = NULL; frontswap_map = frontswap_map_get(p); + memcg_remove_swapfile(p->type); spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type);