This patch series adds swap control for the memory cgroup. Patch [1] adds a page type to the memory.reclaim interface to support reclaiming only anon pages. Patch [2] adds a memory.force_swapin interface to proactively swap pages back in. Patches [3-5] add a memory.swapfile interface to limit the swap devices available to a memory cgroup. Patch [6] adds a memory.swap.max interface to limit a memory cgroup's swap usage. A combined usage sketch follows the diffstat below.
Liu Shixin (6):
  memcg: add page type to memory.reclaim interface
  memcg: introduce per-memcg swapin interface
  mm/swapfile: introduce per-memcg swapfile control
  mm: swap_slots: add per-type slot cache
  mm: swap_slots: allocate per-type slot cache when online swap device
  memcg: add restrict to swap to cgroup1

 .../admin-guide/cgroup-v1/memory.rst    |   2 +
 Documentation/admin-guide/cgroup-v2.rst |   9 +-
 include/linux/memcontrol.h              |  27 ++
 include/linux/mm.h                      |   1 +
 include/linux/swap.h                    |  10 +-
 include/linux/swap_slots.h              |   2 +-
 mm/Kconfig                              |   8 +
 mm/madvise.c                            |  19 ++
 mm/memcontrol.c                         | 297 +++++++++++++++++-
 mm/swap_slots.c                         | 136 +++++++-
 mm/swapfile.c                           |  88 +++++-
 mm/vmscan.c                             |  17 +
 12 files changed, 590 insertions(+), 26 deletions(-)
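A rough end-to-end usage sketch (illustrative only: it assumes a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory, a memcg named 'test', and /dev/zram0 already enabled as a swap device; none of these names come from the patches themselves):

  # cd /sys/fs/cgroup/memory/test
  # echo /dev/zram0 > memory.swapfile      # patches [3-5]: restrict swap to zram0
  # echo 512M > memory.swap.max            # patch [6]: cap this memcg's swap usage
  # echo 1G type=anon > memory.reclaim     # patch [1]: swap out anon pages only
  # echo 0 > memory.force_swapin           # patch [2]: swap everything back in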
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/1079 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/thread/VO...
Add an anon/file type to the memory.reclaim interface so that only one type of page is reclaimed. The LRU algorithm reclaims cold pages and balances between file and anon pages, but it doesn't take the speed of the backing device into account. For example, with a zram device, reclaiming anon pages may have less impact on performance. So extend the memory.reclaim interface to reclaim only one type of page.

Usage:
  "echo <size> type=anon > memory.reclaim"
  "echo <size> type=file > memory.reclaim"

The interface remains compatible with the previous format.
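For example (paths illustrative; sizes are parsed by memparse(), so suffixes like M and G work):

  # echo 1G > memory.reclaim               # old format: reclaim both types
  # echo 512M type=anon > memory.reclaim   # swap out anon pages only
  # echo 512M type=file > memory.reclaim   # drop file-backed pages only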
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 Documentation/admin-guide/cgroup-v2.rst |  9 ++++---
 include/linux/swap.h                    |  4 +++
 mm/memcontrol.c                         | 36 ++++++++++++++++++++++---
 mm/vmscan.c                             | 17 ++++++++++++
 4 files changed, 59 insertions(+), 7 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 5d9b7e552fb0..a04ca490f58c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1196,15 +1196,16 @@ PAGE_SIZE multiple when read back.
	  target cgroup.

	  This file accepts a single key, the number of bytes to reclaim.
-	  No nested keys are currently supported.

	  Example::

	    echo "1G" > memory.reclaim

-	  The interface can be later extended with nested keys to
-	  configure the reclaim behavior. For example, specify the
-	  type of memory to reclaim from (anon, file, ..).
+	  This file also accepts nested keys, the number of bytes to reclaim
+	  with the type of memory to reclaim.
+
+	  Example::
+
+	    echo "1G type=file" > memory.reclaim

	  Please note that the kernel can over or under reclaim from
	  the target cgroup. If less bytes are reclaimed than the
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7f49964f27d2..b98b4c9df622 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -380,6 +380,10 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
						  unsigned long nr_pages,
						  gfp_t gfp_mask,
						  bool may_swap);
+extern unsigned long __try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+						    unsigned long nr_pages,
+						    gfp_t gfp_mask,
+						    bool may_swap, bool only_swap);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
						gfp_t gfp_mask, bool noswap,
						pg_data_t *pgdat,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 662c7859b7f1..8f796b651baa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5263,16 +5263,46 @@ static int memcg_events_local_show(struct seq_file *m, void *v)
	return 0;
 }

+static int reclaim_param_parse(char *buf, unsigned long *nr_pages,
+			       bool *anon, bool *file)
+{
+	char *endp;
+	u64 bytes;
+
+	if (!strcmp(buf, "")) {
+		*nr_pages = PAGE_COUNTER_MAX;
+		return 0;
+	}
+
+	bytes = memparse(buf, &endp);
+	if (*endp == ' ') {
+		buf = endp + 1;
+		buf = strim(buf);
+		if (!strcmp(buf, "type=anon"))
+			*file = false;
+		else if (!strcmp(buf, "type=file"))
+			*anon = false;
+		else
+			return -EINVAL;
+	} else if (*endp != '\0')
+		return -EINVAL;
+
+	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
+
+	return 0;
+}
+
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
 {
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	bool anon = true, file = true;
	int err;

	buf = strstrip(buf);
-	err = page_counter_memparse(buf, "", &nr_to_reclaim);
+	err = reclaim_param_parse(buf, &nr_to_reclaim, &anon, &file);
	if (err)
		return err;

@@ -5293,9 +5323,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
		if (!nr_retries)
			lru_add_drain_all();

-		reclaimed = try_to_free_mem_cgroup_pages(memcg,
+		reclaimed = __try_to_free_mem_cgroup_pages(memcg,
						nr_to_reclaim - nr_reclaimed,
-						GFP_KERNEL, true);
+						GFP_KERNEL, anon, !file);

		if (!reclaimed && !nr_retries--)
			return -EAGAIN;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a8412c5d4eda..2fce47af0e83 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -102,6 +102,8 @@ struct scan_control {

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;
+	/* Only swap anon pages */
+	unsigned int only_swap:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
@@ -2461,6 +2463,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
	unsigned long ap, fp;
	enum lru_list lru;

+	if (sc->only_swap) {
+		scan_balance = SCAN_ANON;
+		goto out;
+	}
+
	/* If we have no swap space, do not bother scanning anon pages. */
	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
		scan_balance = SCAN_FILE;
@@ -3563,6 +3570,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
					   unsigned long nr_pages,
					   gfp_t gfp_mask,
					   bool may_swap)
+{
+	return __try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask,
+					      may_swap, false);
+}
+
+unsigned long __try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+					     unsigned long nr_pages,
+					     gfp_t gfp_mask,
+					     bool may_swap, bool only_swap)
 {
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;
@@ -3576,6 +3592,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = may_swap,
+		.only_swap = only_swap,
	};
	/*
	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
Add a new per-memcg swapin interface to load data back into memory in advance, improving access efficiency.

Usage:
  # echo 0 > memory.force_swapin
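A possible workflow sketch (illustrative; it assumes the memcg's anon pages were swapped out earlier, e.g. via the memory.reclaim interface from patch [1]):

  # echo 1G type=anon > memory.reclaim   # swap the idle workload out
  # ... later, shortly before the workload becomes active again ...
  # echo 0 > memory.force_swapin         # swap its anon pages back in

The written value is ignored; the write itself walks every task in the cgroup and swaps their anon and shmem mappings back in.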
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 .../admin-guide/cgroup-v1/memory.rst |  1 +
 include/linux/mm.h                   |  1 +
 mm/Kconfig                           |  8 ++++
 mm/madvise.c                         | 19 ++++++++++
 mm/memcontrol.c                      | 38 +++++++++++++++++++
 5 files changed, 67 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 6859a50fbd09..5eee7e3be4b2 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -78,6 +78,7 @@ Brief summary of control files.
 memory.stat			     show various statistics
 memory.use_hierarchy		     set/show hierarchical account enabled
 memory.force_empty		     trigger forced page reclaim
+ memory.force_swapin		     trigger forced swapin anon page
 memory.pressure_level		     set memory pressure notifications
 memory.swappiness		     set/show swappiness parameter of vmscan
				     (See sysctl's vm.swappiness)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b5ce84212d7..58d7a59b5b65 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2650,6 +2650,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
		       struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
		     struct list_head *uf);
+extern void force_swapin_vma(struct vm_area_struct *vma);
 extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);

 extern unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file,
diff --git a/mm/Kconfig b/mm/Kconfig
index f66457168de9..4c7569970c69 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -512,6 +512,14 @@ config MEMCG_QOS

	  If unsure, say "n".

+config MEMCG_SWAP_QOS
+	bool "Enable Memory Cgroup Swap priority"
+	depends on MEMCG_SWAP
+	depends on X86 || ARM64
+	default n
+	help
+	  Support swapin memory for memcg.
+
 config ETMEM_SCAN
	tristate "module: etmem page scan for etmem support"
	depends on ETMEM
diff --git a/mm/madvise.c b/mm/madvise.c
index 0a1d6f9d75ea..6028383a8147 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -259,6 +259,25 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,

	lru_add_drain();	/* Push any new pages onto the LRU now */
 }
+
+void force_swapin_vma(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+
+	if (!can_madv_lru_vma(vma))
+		return;
+
+	if (!file) {
+		walk_page_vma(vma, &swapin_walk_ops, vma);
+		lru_add_drain();
+	} else if (shmem_mapping(file->f_mapping))
+		force_shm_swapin_readahead(vma, vma->vm_start,
+					   vma->vm_end, file->f_mapping);
+}
+#else
+void force_swapin_vma(struct vm_area_struct *vma)
+{
+}
 #endif	/* CONFIG_SWAP */

 /*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8f796b651baa..10e38edf6b87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5336,6 +5336,37 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
	return nbytes;
 }

+#ifdef CONFIG_MEMCG_SWAP_QOS
+static int mem_cgroup_task_swapin(struct task_struct *task, void *arg)
+{
+	struct mm_struct *mm = task->mm;
+	struct vm_area_struct *vma;
+	struct blk_plug plug;
+
+	mmap_read_lock(mm);
+	blk_start_plug(&plug);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		force_swapin_vma(vma);
+	blk_finish_plug(&plug);
+	mmap_read_unlock(mm);
+
+	return 0;
+}
+
+static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	if (unlikely(mem_cgroup_is_root(memcg)))
+		return -EINVAL;
+
+	mem_cgroup_scan_tasks(memcg, mem_cgroup_task_swapin, NULL);
+
+	return nbytes;
+}
+#endif
+
 static int memcg_high_async_ratio_show(struct seq_file *m, void *v)
 {
	seq_printf(m, "%d\n",
@@ -5738,6 +5769,13 @@ static struct cftype mem_cgroup_legacy_files[] = {
		.name = "reclaim",
		.write = memory_reclaim,
	},
+#ifdef CONFIG_MEMCG_SWAP_QOS
+	{
+		.name = "force_swapin",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memory_swapin,
+	},
+#endif
	{
		.name = "high_async_ratio",
		.flags = CFTYPE_NOT_ON_ROOT,
With the memory.swapfile interface, the swap devices available to a memcg can be limited. The accepted values are 'all', 'none', and a valid swap device.

Usage:
  echo /dev/zram0 > memory.swapfile

If the swap device goes offline, the swapfile setting falls back to 'none'.
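For example (device names illustrative; the target must already be an active swap device, otherwise write_swapfile_for_memcg() fails with -EINVAL):

  # echo /dev/zram0 > memory.swapfile   # allow swapping to zram0 only
  # cat memory.swapfile
  /dev/zram0
  # echo none > memory.swapfile         # forbid swap for this memcg
  # echo all > memory.swapfile          # back to the default

Child memcgs created afterwards inherit the parent's setting via memcg_copy_swap_device().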
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 .../admin-guide/cgroup-v1/memory.rst |   1 +
 include/linux/memcontrol.h           |  26 ++++
 include/linux/swap.h                 |   6 +-
 mm/Kconfig                           |   2 +-
 mm/memcontrol.c                      | 146 ++++++++++++++++++
 mm/swap_slots.c                      |  14 +-
 mm/swapfile.c                        |  86 ++++++++++-
 7 files changed, 273 insertions(+), 8 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 5eee7e3be4b2..97bad1406a23 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -79,6 +79,7 @@ Brief summary of control files.
 memory.use_hierarchy		     set/show hierarchical account enabled
 memory.force_empty		     trigger forced page reclaim
 memory.force_swapin		     trigger forced swapin anon page
+ memory.swapfile		     set/show swap file
 memory.pressure_level		     set memory pressure notifications
 memory.swappiness		     set/show swappiness parameter of vmscan
				     (See sysctl's vm.swappiness)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4395f2e03cb7..6b55bfb1cd5b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -50,6 +50,11 @@ enum memcg_memory_event {
	MEMCG_NR_MEMORY_EVENTS,
 };

+enum {
+	SWAP_TYPE_ALL = -1,	/* allowed to use all swap files */
+	SWAP_TYPE_NONE = -2,	/* prohibited from using any swapfile */
+};
+
 struct mem_cgroup_reclaim_cookie {
	pg_data_t *pgdat;
	unsigned int generation;
@@ -240,6 +245,10 @@ struct obj_cgroup {
	};
 };

+struct swap_device {
+	int type;
+};
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -402,7 +411,12 @@ struct mem_cgroup {
 #else
	KABI_RESERVE(6)
 #endif
+#ifdef CONFIG_MEMCG_SWAP_QOS
+	/* per-memcg swap device control; protected by swap_lock */
+	KABI_USE(7, struct swap_device *swap_dev)
+#else
	KABI_RESERVE(7)
+#endif
	KABI_RESERVE(8)

	struct mem_cgroup_per_node *nodeinfo[0];
@@ -1292,6 +1306,9 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)

 int mem_cgroup_force_empty(struct mem_cgroup *memcg);

+int memcg_get_swap_type(struct page *page);
+void memcg_remove_swapfile(int type);
+
 #else /* CONFIG_MEMCG */

 #define MEM_CGROUP_ID_SHIFT	0
@@ -1695,6 +1712,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 {
	return 0;
 }
+
+static inline int memcg_get_swap_type(struct page *page)
+{
+	return SWAP_TYPE_ALL;
+}
+
+static inline void memcg_remove_swapfile(int type)
+{
+}
 #endif /* CONFIG_MEMCG */

 /* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b98b4c9df622..b40cc0500b42 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -515,7 +515,8 @@ extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(struct page *page);
 extern void put_swap_page(struct page *page, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size,
+			  int type);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
@@ -547,6 +548,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
	percpu_ref_put(&si->sei->users);
 }

+extern int write_swapfile_for_memcg(struct address_space *mapping,
+				    int *swap_type);
+extern void read_swapfile_for_memcg(struct seq_file *m, int type);
 #else /* CONFIG_SWAP */

 static inline int swap_readpage(struct page *page, bool do_poll)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4c7569970c69..c43c2e6b744f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -518,7 +518,7 @@ config MEMCG_SWAP_QOS
	depends on X86 || ARM64
	default n
	help
-	  Support swapin memory for memcg.
+	  Support swapin memory for memcg. Support swapfile limit for memcg.

 config ETMEM_SCAN
	tristate "module: etmem page scan for etmem support"
	depends on ETMEM
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 10e38edf6b87..5eb973d6672f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5365,6 +5365,139 @@ static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
	return nbytes;
 }
+
+static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
+{
+	memcg->swap_dev = kmalloc(sizeof(struct swap_device), GFP_KERNEL);
+	if (!memcg->swap_dev)
+		return -ENOMEM;
+	return 0;
+}
+
+static void memcg_free_swap_device(struct mem_cgroup *memcg)
+{
+	if (!memcg->swap_dev)
+		return;
+
+	kfree(memcg->swap_dev);
+	memcg->swap_dev = NULL;
+}
+
+static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf,
+				    size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct filename *pathname;
+	struct file *swapfile;
+	int ret;
+
+	buf = strstrip(buf);
+
+	if (!strcmp(buf, "none")) {
+		memcg->swap_dev->type = SWAP_TYPE_NONE;
+		return nbytes;
+	} else if (!strcmp(buf, "all")) {
+		memcg->swap_dev->type = SWAP_TYPE_ALL;
+		return nbytes;
+	}
+
+	pathname = getname_kernel(buf);
+	if (IS_ERR(pathname))
+		return PTR_ERR(pathname);
+
+	swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(swapfile)) {
+		putname(pathname);
+		return PTR_ERR(swapfile);
+	}
+	ret = write_swapfile_for_memcg(swapfile->f_mapping,
+				       &memcg->swap_dev->type);
+	filp_close(swapfile, NULL);
+	putname(pathname);
+
+	return ret < 0 ? ret : nbytes;
+}
+
+static int memcg_swapfile_read(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	if (memcg->swap_dev->type == SWAP_TYPE_NONE)
+		seq_printf(m, "none\n");
+	else if (memcg->swap_dev->type == SWAP_TYPE_ALL)
+		seq_printf(m, "all\n");
+	else
+		read_swapfile_for_memcg(m, memcg->swap_dev->type);
+	return 0;
+}
+
+static void memcg_copy_swap_device(struct mem_cgroup *dst,
+				   struct mem_cgroup *src)
+{
+	if (!src)
+		dst->swap_dev->type = SWAP_TYPE_ALL;
+	else
+		dst->swap_dev->type = src->swap_dev->type;
+}
+
+int memcg_get_swap_type(struct page *page)
+{
+	struct mem_cgroup *memcg;
+	int type;
+
+	if (mem_cgroup_disabled() || !page)
+		return SWAP_TYPE_ALL;
+
+	memcg = page_memcg(page);
+	if (!memcg || mem_cgroup_is_root(memcg))
+		return SWAP_TYPE_ALL;
+
+	rcu_read_lock();
+	if (!css_tryget_online(&memcg->css)) {
+		rcu_read_unlock();
+		return SWAP_TYPE_ALL;
+	}
+	rcu_read_unlock();
+
+	type = READ_ONCE(memcg->swap_dev->type);
+	css_put(&memcg->css);
+	return type;
+}
+
+void memcg_remove_swapfile(int type)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	for_each_mem_cgroup(memcg)
+		if (memcg->swap_dev->type == type)
+			memcg->swap_dev->type = SWAP_TYPE_NONE;
+}
+#else
+static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
+static void memcg_free_swap_device(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_copy_swap_device(struct mem_cgroup *dst,
+				   struct mem_cgroup *src)
+{
+}
+
+int memcg_get_swap_type(struct page *page)
+{
+	return SWAP_TYPE_ALL;
+}
+
+void memcg_remove_swapfile(int type)
+{
+}
 #endif

 static int memcg_high_async_ratio_show(struct seq_file *m, void *v)
@@ -5775,6 +5908,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
		.flags = CFTYPE_NOT_ON_ROOT,
		.write = memory_swapin,
	},
+	{
+		.name = "swapfile",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memcg_swapfile_write,
+		.seq_show = memcg_swapfile_read,
+	},
 #endif
	{
		.name = "high_async_ratio",
@@ -5920,6 +6059,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
	for_each_node(node)
		free_mem_cgroup_per_node_info(memcg, node);
	free_percpu(memcg->vmstats_percpu);
+	memcg_free_swap_device(memcg);
	kfree(memcg);
 }

@@ -5944,6 +6084,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
	if (!memcg)
		return ERR_PTR(error);

+	if (memcg_alloc_swap_device(memcg))
+		goto fail;
+
	memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
				 1, MEM_CGROUP_ID_MAX,
				 GFP_KERNEL);
@@ -6021,17 +6164,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
		page_counter_init(&memcg->swap, NULL);
		page_counter_init(&memcg->kmem, NULL);
		page_counter_init(&memcg->tcpmem, NULL);
+		memcg_copy_swap_device(memcg, NULL);
	} else if (parent->use_hierarchy) {
		memcg->use_hierarchy = true;
		page_counter_init(&memcg->memory, &parent->memory);
		page_counter_init(&memcg->swap, &parent->swap);
		page_counter_init(&memcg->kmem, &parent->kmem);
		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+		memcg_copy_swap_device(memcg, parent);
	} else {
		page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
		page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
		page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
		page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
+		memcg_copy_swap_device(memcg, root_mem_cgroup);
		/*
		 * Deeper hierachy with use_hierarchy == false doesn't make
		 * much sense so let cgroup subsystem know about this
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0357fbe70645..3d4e4c230305 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -266,7 +266,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
	cache->cur = 0;
	if (swap_slot_cache_active)
		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
-					   cache->slots, 1);
+					   cache->slots, 1, SWAP_TYPE_ALL);
	return cache->nr;
 }
@@ -307,12 +307,17 @@ swp_entry_t get_swap_page(struct page *page)
 {
	swp_entry_t entry;
	struct swap_slots_cache *cache;
+	int type;

	entry.val = 0;

+	type = memcg_get_swap_type(page);
+	if (type == SWAP_TYPE_NONE)
+		goto out;
+
	if (PageTransHuge(page)) {
		if (IS_ENABLED(CONFIG_THP_SWAP))
-			get_swap_pages(1, &entry, HPAGE_PMD_NR);
+			get_swap_pages(1, &entry, HPAGE_PMD_NR, type);
		goto out;
	}
@@ -327,7 +332,8 @@ swp_entry_t get_swap_page(struct page *page)
	 */
	cache = raw_cpu_ptr(&swp_slots);

-	if (likely(check_cache_active() && cache->slots)) {
+	if (likely(check_cache_active() && cache->slots) &&
+	    type == SWAP_TYPE_ALL) {
		mutex_lock(&cache->alloc_lock);
		if (cache->slots) {
 repeat:
@@ -344,7 +350,7 @@ swp_entry_t get_swap_page(struct page *page)
		goto out;
	}

-	get_swap_pages(1, &entry, 1);
+	get_swap_pages(1, &entry, 1, type);
 out:
	if (mem_cgroup_try_charge_swap(page, entry)) {
		put_swap_page(page, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14e2396fa8a3..3e0533139dfa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1056,7 +1056,83 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type)
+{
+	struct swap_info_struct *si;
+	unsigned int type;
+	int ret = -EINVAL;
+
+	spin_lock(&swap_lock);
+	for (type = 0; type < nr_swapfiles; type++) {
+		si = swap_info[type];
+		if ((si->flags & SWP_WRITEOK) &&
+		    (si->swap_file->f_mapping == mapping)) {
+			*swap_type = type;
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock(&swap_lock);
+	return ret;
+}
+
+void read_swapfile_for_memcg(struct seq_file *m, int type)
+{
+	struct swap_info_struct *si;
+
+	spin_lock(&swap_lock);
+	if (type < nr_swapfiles) {
+		si = swap_info[type];
+		if (si->flags & SWP_WRITEOK) {
+			seq_file_path(m, si->swap_file, " \t\n\\");
+			seq_printf(m, "\n");
+		}
+	}
+	spin_unlock(&swap_lock);
+}
+
+static long get_avail_pages(unsigned long size, int type)
+{
+	struct swap_info_struct *si;
+	long avail_pgs = 0;
+
+	if (type == SWAP_TYPE_ALL)
+		return atomic_long_read(&nr_swap_pages) / size;
+
+	spin_unlock(&swap_avail_lock);
+	spin_lock(&swap_lock);
+	if (type < nr_swapfiles) {
+		si = swap_info[type];
+		if (si->flags & SWP_WRITEOK)
+			avail_pgs = si->pages - si->inuse_pages;
+	}
+	spin_unlock(&swap_lock);
+	spin_lock(&swap_avail_lock);
+	return avail_pgs;
+}
+
+static inline bool should_skip_swap_type(int swap_type, int type)
+{
+	if (type == SWAP_TYPE_ALL)
+		return false;
+
+	return (type != swap_type);
+}
+#else
+static inline long get_avail_pages(unsigned long size, int type)
+{
+	return atomic_long_read(&nr_swap_pages) / size;
+}
+
+static inline bool should_skip_swap_type(int swap_type, int type)
+{
+	return false;
+}
+#endif
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size,
+		   int type)
 {
	unsigned long size = swap_entry_size(entry_size);
	struct swap_info_struct *si, *next;
@@ -1069,7 +1145,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
spin_lock(&swap_avail_lock);
-	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
+	avail_pgs = get_avail_pages(size, type);
	if (avail_pgs <= 0) {
		spin_unlock(&swap_avail_lock);
		goto noswap;
@@ -1086,6 +1162,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
	plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
	spin_unlock(&swap_avail_lock);
	spin_lock(&si->lock);
+	if (should_skip_swap_type(si->type, type)) {
+		spin_unlock(&si->lock);
+		spin_lock(&swap_avail_lock);
+		goto nextsi;
+	}
	if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
		spin_lock(&swap_avail_lock);
		if (plist_node_empty(&si->avail_lists[node])) {
@@ -2703,6 +2784,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
	cluster_info = p->cluster_info;
	p->cluster_info = NULL;
	frontswap_map = frontswap_map_get(p);
+	memcg_remove_swapfile(p->type);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	arch_swap_invalidate_area(p->type);
Since we support per-memcg swapfile control, we need a per-type slot cache to optimize performance.
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 mm/swap_slots.c | 107 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 11 deletions(-)
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 3d4e4c230305..71c220975d75 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -35,6 +35,9 @@
 #include <linux/mm.h>

 static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static DEFINE_PER_CPU(struct swap_slots_cache [MAX_SWAPFILES], swp_type_slots);
+#endif
 static bool swap_slot_cache_active;
 bool swap_slot_cache_enabled;
 static bool swap_slot_cache_initialized;
@@ -111,7 +114,37 @@ static bool check_cache_active(void)
	return swap_slot_cache_active;
 }
-static int alloc_swap_slot_cache(unsigned int cpu)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static inline struct swap_slots_cache *get_slots_cache(int swap_type)
+{
+	if (swap_type == SWAP_TYPE_ALL)
+		return raw_cpu_ptr(&swp_slots);
+	else
+		return raw_cpu_ptr(&swp_type_slots[swap_type]);
+}
+
+static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu,
+							   int swap_type)
+{
+	if (swap_type == SWAP_TYPE_ALL)
+		return &per_cpu(swp_slots, cpu);
+	else
+		return &per_cpu(swp_type_slots, cpu)[swap_type];
+}
+#else
+static inline struct swap_slots_cache *get_slots_cache(int swap_type)
+{
+	return raw_cpu_ptr(&swp_slots);
+}
+
+static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu,
+							   int swap_type)
+{
+	return &per_cpu(swp_slots, cpu);
+}
+#endif
+
+static int alloc_swap_slot_cache_cpu_type(unsigned int cpu, int swap_type)
 {
	struct swap_slots_cache *cache;
	swp_entry_t *slots, *slots_ret;
@@ -134,7 +167,7 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 }
	mutex_lock(&swap_slots_cache_mutex);
-	cache = &per_cpu(swp_slots, cpu);
+	cache = get_slots_cache_cpu(cpu, swap_type);
	if (cache->slots || cache->slots_ret) {
		/* cache already allocated */
		mutex_unlock(&swap_slots_cache_mutex);
@@ -166,13 +199,42 @@ static int alloc_swap_slot_cache(unsigned int cpu)
	return 0;
 }

-static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
-				  bool free_slots)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static int __alloc_swap_slot_cache_cpu(unsigned int cpu)
+{
+	int i, ret;
+
+	ret = alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		ret = alloc_swap_slot_cache_cpu_type(cpu, i);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+#else
+static inline int __alloc_swap_slot_cache_cpu(unsigned int cpu)
+{
+	return alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
+}
+#endif
+
+static int alloc_swap_slot_cache(unsigned int cpu)
+{
+	return __alloc_swap_slot_cache_cpu(cpu);
+}
+
+static void drain_slots_cache_cpu_type(unsigned int cpu, unsigned int type,
+				       bool free_slots, int swap_type)
 {
	struct swap_slots_cache *cache;
	swp_entry_t *slots = NULL;

-	cache = &per_cpu(swp_slots, cpu);
+	cache = get_slots_cache_cpu(cpu, swap_type);
	if ((type & SLOTS_CACHE) && cache->slots) {
		mutex_lock(&cache->alloc_lock);
		swapcache_free_entries(cache->slots + cache->cur, cache->nr);
@@ -198,6 +260,30 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
	}
 }

+#ifdef CONFIG_MEMCG_SWAP_QOS
+static void __drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+				    bool free_slots)
+{
+	int i;
+
+	drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		drain_slots_cache_cpu_type(cpu, type, free_slots, i);
+}
+#else
+static inline void __drain_slots_cache_cpu(unsigned int cpu,
+					   unsigned int type, bool free_slots)
+{
+	drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
+}
+#endif
+
+static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+				  bool free_slots)
+{
+	__drain_slots_cache_cpu(cpu, type, free_slots);
+}
+
 static void __drain_swap_slots_cache(unsigned int type)
 {
	unsigned int cpu;
@@ -258,7 +344,7 @@ void enable_swap_slots_cache(void)
 }

 /* called with swap slot cache's alloc lock held */
-static int refill_swap_slots_cache(struct swap_slots_cache *cache)
+static int refill_swap_slots_cache(struct swap_slots_cache *cache, int type)
 {
	if (!use_swap_slot_cache || cache->nr)
		return 0;
@@ -266,7 +352,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
	cache->cur = 0;
	if (swap_slot_cache_active)
		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
-					   cache->slots, 1, SWAP_TYPE_ALL);
+					   cache->slots, 1, type);

	return cache->nr;
 }
@@ -330,10 +416,9 @@ swp_entry_t get_swap_page(struct page *page)
	 * The alloc path here does not touch cache->slots_ret
	 * so cache->free_lock is not taken.
	 */
-	cache = raw_cpu_ptr(&swp_slots);
+	cache = get_slots_cache(type);

-	if (likely(check_cache_active() && cache->slots) &&
-	    type == SWAP_TYPE_ALL) {
+	if (likely(check_cache_active() && cache->slots)) {
		mutex_lock(&cache->alloc_lock);
		if (cache->slots) {
 repeat:
@@ -341,7 +426,7 @@ swp_entry_t get_swap_page(struct page *page)
			entry = cache->slots[cache->cur];
			cache->slots[cache->cur++].val = 0;
			cache->nr--;
-		} else if (refill_swap_slots_cache(cache)) {
+		} else if (refill_swap_slots_cache(cache, type)) {
			goto repeat;
		}
	}
To reduce memory waste, allocate the per-type slot cache only when the corresponding swap device comes online.
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 include/linux/swap_slots.h |  2 +-
 mm/swap_slots.c            | 27 +++++++++++++++++++++++----
 mm/swapfile.c              |  2 +-
 3 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
index 347f1a304190..1ea720390822 100644
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -23,7 +23,7 @@ struct swap_slots_cache {

 void disable_swap_slots_cache_lock(void);
 void reenable_swap_slots_cache_unlock(void);
-void enable_swap_slots_cache(void);
+void enable_swap_slots_cache(int type);
 int free_swap_slot(swp_entry_t entry);

 extern bool swap_slot_cache_enabled;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 71c220975d75..5c37bd784192 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -36,6 +36,7 @@

 static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
 #ifdef CONFIG_MEMCG_SWAP_QOS
+static unsigned int nr_swap_slots;
 static DEFINE_PER_CPU(struct swap_slots_cache [MAX_SWAPFILES], swp_type_slots);
 #endif
 static bool swap_slot_cache_active;
@@ -208,7 +209,7 @@ static int __alloc_swap_slot_cache_cpu(unsigned int cpu)
	if (ret)
		return ret;

-	for (i = 0; i < MAX_SWAPFILES; i++) {
+	for (i = 0; i < nr_swap_slots; i++) {
		ret = alloc_swap_slot_cache_cpu_type(cpu, i);
		if (ret)
			return ret;
	}
@@ -216,11 +217,29 @@ static int __alloc_swap_slot_cache_cpu(unsigned int cpu)

	return ret;
 }
+
+static void alloc_swap_slot_cache_type(int type)
+{
+	unsigned int cpu;
+
+	/* serialize with cpu hotplug operations */
+	get_online_cpus();
+	while (type >= nr_swap_slots) {
+		for_each_online_cpu(cpu)
+			alloc_swap_slot_cache_cpu_type(cpu, nr_swap_slots);
+		nr_swap_slots++;
+	}
+	put_online_cpus();
+}
 #else
 static inline int __alloc_swap_slot_cache_cpu(unsigned int cpu)
 {
	return alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
 }
+
+static void alloc_swap_slot_cache_type(int type)
+{
+}
 #endif

 static int alloc_swap_slot_cache(unsigned int cpu)
@@ -267,7 +286,7 @@ static void __drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
	int i;

	drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
-	for (i = 0; i < MAX_SWAPFILES; i++)
+	for (i = 0; i < nr_swap_slots; i++)
		drain_slots_cache_cpu_type(cpu, type, free_slots, i);
 }
 #else
@@ -323,7 +342,7 @@ static int free_slot_cache(unsigned int cpu)
	return 0;
 }

-void enable_swap_slots_cache(void)
+void enable_swap_slots_cache(int type)
 {
	mutex_lock(&swap_slots_cache_enable_mutex);
	if (!swap_slot_cache_initialized) {
@@ -337,7 +356,7 @@ void enable_swap_slots_cache(void)
		swap_slot_cache_initialized = true;
	}
-
+	alloc_swap_slot_cache_type(type);
	__reenable_swap_slots_cache();
 out_unlock:
	mutex_unlock(&swap_slots_cache_enable_mutex);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3e0533139dfa..e103ae583acc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3539,7 +3539,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
	if (inode)
		inode_unlock(inode);
	if (!error)
-		enable_swap_slots_cache();
+		enable_swap_slots_cache(p->type);
	return error;
 }
memsw alone can't limit the usage of swap space. Add a memory.swap.max interface to limit the difference between memsw.usage and memory.usage. Since a page may occupy both a swap entry and a swap cache page, this value is not exactly equal to swap.usage.
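A worked sketch of the accounting (numbers illustrative): if memory.memsw.usage_in_bytes reads 900M and memory.usage_in_bytes reads 800M, the charged swap is taken to be 900M - 800M = 100M. With

  # echo 100M > memory.swap.max

a further swap-out attempt in this memcg fails the new check (mem_cgroup_check_swap_for_v1() returns -ENOMEM) until usage drops, and the limit can be lifted again with:

  # echo max > memory.swap.max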
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 include/linux/memcontrol.h |  1 +
 mm/memcontrol.c            | 82 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 80 insertions(+), 3 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6b55bfb1cd5b..c1619a044803 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -247,6 +247,7 @@ struct obj_cgroup {

 struct swap_device {
	int type;
+	unsigned long limit;
 };

 /*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5eb973d6672f..f2ec00201aeb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5434,10 +5434,37 @@ static int memcg_swapfile_read(struct seq_file *m, void *v)
 static void memcg_copy_swap_device(struct mem_cgroup *dst,
				   struct mem_cgroup *src)
 {
-	if (!src)
+	if (!src) {
		dst->swap_dev->type = SWAP_TYPE_ALL;
-	else
+		dst->swap_dev->limit = PAGE_COUNTER_MAX;
+	} else {
		dst->swap_dev->type = src->swap_dev->type;
+		dst->swap_dev->limit = src->swap_dev->limit;
+	}
+}
+
+static ssize_t memcg_swapmax_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long limit;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &limit);
+	if (err)
+		return err;
+
+	memcg->swap_dev->limit = limit;
+
+	return nbytes;
+}
+
+u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return memcg->swap_dev->limit * PAGE_SIZE;
 }

 int memcg_get_swap_type(struct page *page)
@@ -5914,6 +5941,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
		.write = memcg_swapfile_write,
		.seq_show = memcg_swapfile_read,
	},
+	{
+		.name = "swap.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memcg_swapmax_write,
+		.read_u64 = memcg_swapmax_read,
+	},
 #endif
	{
		.name = "high_async_ratio",
@@ -8094,6 +8127,49 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
	css_put(&memcg->css);
 }

+#ifdef CONFIG_MEMCG_SWAP_QOS
+static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry)
+{
+	struct mem_cgroup *memcg, *target_memcg;
+	unsigned long swap_usage;
+	unsigned long swap_limit;
+	long nr_swap_pages = PAGE_COUNTER_MAX;
+
+	if (cgroup_memory_noswap || !entry.val)
+		return 0;
+
+	target_memcg = page_memcg(page);
+	if (!target_memcg || mem_cgroup_is_root(target_memcg))
+		return 0;
+
+	rcu_read_lock();
+	if (!css_tryget_online(&target_memcg->css)) {
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+
+	for (memcg = target_memcg; memcg != root_mem_cgroup;
+	     memcg = parent_mem_cgroup(memcg)) {
+		swap_limit = READ_ONCE(memcg->swap_dev->limit);
+		swap_usage = page_counter_read(&memcg->memsw) -
+			     page_counter_read(&memcg->memory);
+		nr_swap_pages = min_t(long, nr_swap_pages,
+				      swap_limit - swap_usage);
+	}
+	css_put(&target_memcg->css);
+
+	if (thp_nr_pages(page) > nr_swap_pages)
+		return -ENOMEM;
+	return 0;
+}
+#else
+static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry)
+{
+	return 0;
+}
+#endif
+
 /**
  * mem_cgroup_try_charge_swap - try charging swap space for a page
  * @page: page being added to swap
@@ -8111,7 +8187,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
	unsigned short oldid;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return 0;
+		return mem_cgroup_check_swap_for_v1(page, entry);
memcg = page_memcg(page);