This patch series adds swap control for memory cgroups. Patch [1] adds a page type argument to the memory.reclaim interface to support reclaiming only anon pages. Patch [3] adds a memory.force_swapin interface to proactively swap pages back in. Patches [4-6] add a memory.swapfile interface to limit the swap devices available to a memory cgroup. Patch [7] adds a memory.swap.max interface to limit a memory cgroup's swap usage.
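A rough end-to-end sketch of how the pieces fit together (the cgroup path and zram device are placeholders; note that memory.reclaim's type= key is documented for cgroup v2, while the other files are cgroup v1 knobs):

  echo 1 > /proc/sys/vm/memcg_swap_qos_enable     # turn the feature on
  echo /dev/zram0 > <cgroup>/memory.swapfile      # restrict the cgroup to one swap device
  echo 512M > <cgroup>/memory.swap.max            # cap the cgroup's swap usage
  echo "1G type=anon" > <cgroup>/memory.reclaim   # proactively swap anon pages out
  echo 0 > <cgroup>/memory.force_swapin           # later, swap everything back in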
Jinjiang Tu (1):
  memcg: make sysctl registration more extensible

Liu Shixin (7):
  memcg: add page type to memory.reclaim interface
  memcg: introduce memcg swap qos feature
  memcg: introduce per-memcg swapin interface
  memcg: add restrict to swap to cgroup1
  mm/swapfile: introduce per-memcg swapfile control
  mm: swap_slots: add per-type slot cache
  config: enable memcg swap qos for x86_64 and arm64 by default

zhaoxiaoqiang11 (1):
  memcg/swap: add ability to disable memcg swap
 .../admin-guide/cgroup-v1/memory.rst    |   3 +
 Documentation/admin-guide/cgroup-v2.rst |  10 +-
 arch/arm64/configs/openeuler_defconfig  |   1 +
 arch/x86/configs/openeuler_defconfig    |   1 +
 include/linux/memcontrol.h              |  35 ++
 include/linux/mm.h                      |   4 +
 include/linux/swap.h                    |  12 +-
 include/linux/swap_slots.h              |   2 +-
 init/Kconfig                            |   9 +
 mm/madvise.c                            |  31 +-
 mm/memcontrol.c                         | 497 +++++++++++++++++-
 mm/swap_slots.c                         | 151 +++++-
 mm/swapfile.c                           |  97 +++-
 mm/vmscan.c                             |   9 +
 14 files changed, 813 insertions(+), 49 deletions(-)
From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
Add an anon/file type argument to the memory.reclaim interface to reclaim only one type of pages. The LRU algorithm can reclaim cold pages and balance between file and anon pages, but it does not consider the speed of the backing device. For example, with a zram device, reclaiming anon pages may have less impact on performance. So extend the memory.reclaim interface to reclaim only one type of pages.

Usage:
  "echo <size> type=anon > memory.reclaim"
  "echo <size> type=file > memory.reclaim"
The interface remains compatible with the previous format, as shown below.
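For example, all of the following are accepted after this patch (sizes are arbitrary):

  echo "1G" > memory.reclaim            # previous format, may reclaim both types
  echo "1G type=file" > memory.reclaim  # reclaim page cache only (swap disabled)
  echo "1G type=anon" > memory.reclaim  # reclaim anon only; fails with EAGAIN once
                                        # the cgroup has no swap space left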
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 Documentation/admin-guide/cgroup-v2.rst | 10 ++--
 include/linux/swap.h                    |  1 +
 mm/memcontrol.c                         | 67 +++++++++++++++++++++++--
 mm/vmscan.c                             |  9 ++++
 4 files changed, 80 insertions(+), 7 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index b26b5274eaaf..84cbbeaf0d78 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1247,15 +1247,17 @@ PAGE_SIZE multiple when read back. target cgroup.
This file accepts a single key, the number of bytes to reclaim. - No nested keys are currently supported.
Example::
echo "1G" > memory.reclaim
- The interface can be later extended with nested keys to - configure the reclaim behavior. For example, specify the - type of memory to reclaim from (anon, file, ..). + This file also accepts nested keys, the number of bytes to reclaim + with the type of memory to reclaim. + + Example:: + echo "1G type=file" > memory.reclaim + echo "1G type=anon" > memory.reclaim
Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the diff --git a/include/linux/swap.h b/include/linux/swap.h index fe20c462fecb..1c4c86812e96 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -419,6 +419,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) +#define MEMCG_RECLAIM_NOT_FILE (1 << 3) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e80504a49c0..aab3ecea6847 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -65,6 +65,7 @@ #include <linux/seq_buf.h> #include <linux/memcg_memfs_info.h> #include <linux/sched/isolation.h> +#include <linux/parser.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -7308,6 +7309,62 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; }
+enum { + MEMORY_RECLAIM_TYPE = 0, + MEMORY_RECLAIM_NULL, +}; + +static const match_table_t tokens = { + { MEMORY_RECLAIM_TYPE, "type=%s"}, + { MEMORY_RECLAIM_NULL, NULL }, +}; + +#define RECLAIM_TYPE_SIZE 8 + +static int reclaim_param_parse(char *buf, unsigned long *nr_pages, + unsigned int *reclaim_options) +{ + char *old_buf, *start; + char type[RECLAIM_TYPE_SIZE]; + substring_t args[MAX_OPT_ARGS]; + u64 bytes; + + buf = strstrip(buf); + if (!strcmp(buf, "")) { + *nr_pages = PAGE_COUNTER_MAX; + return 0; + } + + old_buf = buf; + bytes = memparse(buf, &buf); + if (buf == old_buf) + return -EINVAL; + + *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); + + buf = strstrip(buf); + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_TYPE: + match_strlcpy(type, &args[0], RECLAIM_TYPE_SIZE); + if (!strcmp(type, "anon")) + *reclaim_options |= MEMCG_RECLAIM_NOT_FILE; + else if (!strcmp(type, "file")) + *reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; + else + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + return 0; +} + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -7317,18 +7374,22 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, unsigned int reclaim_options; int err;
- buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; + err = reclaim_param_parse(buf, &nr_to_reclaim, &reclaim_options); if (err) return err;
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed;
if (signal_pending(current)) return -EINTR;
+ /* If only reclaim swap pages, check swap space at first. */ + if ((reclaim_options & MEMCG_RECLAIM_NOT_FILE) && + (mem_cgroup_get_nr_swap_pages(memcg) <= 0)) + return -EAGAIN; + /* * This is the final attempt, drain percpu lru caches in the * hope of introducing more evictable pages for diff --git a/mm/vmscan.c b/mm/vmscan.c index 7a676296af30..6461552c81d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -109,6 +109,9 @@ struct scan_control { /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1;
+ /* Should skip file pages? */ + unsigned int not_file:1; + /* Proactive reclaim invoked by userspace through memory.reclaim */ unsigned int proactive:1;
@@ -3035,6 +3038,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long ap, fp; enum lru_list lru;
+ if (sc->not_file) { + scan_balance = SCAN_ANON; + goto out; + } + /* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; @@ -7141,6 +7149,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), + .not_file = !!(reclaim_options & MEMCG_RECLAIM_NOT_FILE), }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
Move the sysctl registration code out of CONFIG_MEMCG_OOM_PRIORITY so that it can be reused by other features.
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 mm/memcontrol.c | 66 +++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 29 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index aab3ecea6847..a707014b644d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4203,6 +4203,7 @@ void memcg_print_bad_task(struct oom_control *oc) } }
+#ifdef CONFIG_SYSCTL static void memcg_oom_prio_reset(void) { struct mem_cgroup *iter; @@ -4232,34 +4233,7 @@ static int sysctl_memcg_oom_prio_handler(struct ctl_table *table, int write,
return ret; } - -static struct ctl_table memcg_oom_prio_sysctls[] = { - { - /* - * This sysctl is used to control memcg oom priority - * feature, the sysctl name is for compatibility. - */ - .procname = "memcg_qos_enable", - .data = &sysctl_memcg_oom_prio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_memcg_oom_prio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; - -static __init int memcg_oom_prio_sysctls_init(void) -{ - register_sysctl_init("vm", memcg_oom_prio_sysctls); - return 0; -} -#else -static inline int memcg_oom_prio_sysctls_init(void) -{ - return 0; -} - +#endif #endif
#ifdef CONFIG_NUMA @@ -8031,6 +8005,40 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) refill_stock(memcg, nr_pages); }
+#ifdef CONFIG_SYSCTL +static struct ctl_table mem_cgroup_sysctls[] = { +#ifdef CONFIG_MEMCG_OOM_PRIORITY + { + /* + * This sysctl is used to control memcg oom priority + * feature, the sysctl name is for compatibility. + */ + .procname = "memcg_qos_enable", + .data = &sysctl_memcg_oom_prio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_memcg_oom_prio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +}; + +static __init int mem_cgroup_sysctls_init(void) +{ + if (mem_cgroup_disabled()) + return 0; + + register_sysctl_init("vm", mem_cgroup_sysctls); + return 0; +} +#else +static __init int mem_cgroup_sysctls_init(void) +{ + return 0; +} +#endif + static int __init cgroup_memory(char *s) { char *token; @@ -8090,7 +8098,7 @@ static int __init mem_cgroup_init(void) }
mem_cgroup_memfs_info_init(); - memcg_oom_prio_sysctls_init(); + mem_cgroup_sysctls_init();
return 0; }
From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
Introduce the memcg swap QoS feature, which groups the sub-features added by the subsequent patches. Add CONFIG_MEMCG_SWAP_QOS and the static key memcg_swap_qos_key.
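A minimal sketch of toggling the feature at runtime through the sysctl added in this patch:

  echo 1 > /proc/sys/vm/memcg_swap_qos_enable   # reset per-memcg state, enable the static key
  echo 0 > /proc/sys/vm/memcg_swap_qos_enable   # disable the static key; checks become no-ops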
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 include/linux/memcontrol.h |  4 ++++
 init/Kconfig               |  9 +++++++
 mm/memcontrol.c            | 48 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 31aff8b9286a..578f43b68392 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -367,6 +367,10 @@ static inline void memcg_print_bad_task(struct oom_control *oc) } #endif
+#ifdef CONFIG_MEMCG_SWAP_QOS +DECLARE_STATIC_KEY_FALSE(memcg_swap_qos_key); +#endif + /* * size of first charge trial. * TODO: maybe necessary to use big numbers in big irons or dynamic based of the diff --git a/init/Kconfig b/init/Kconfig index 869eea4108d0..3d7a992a6b57 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -973,6 +973,15 @@ config MEMCG_OOM_PRIORITY
If unsure, say "n".
+config MEMCG_SWAP_QOS
+	bool "Enable Memory Cgroup Swap Control"
+	depends on MEMCG && SWAP
+	depends on X86 || ARM64
+	default n
+	help
+	  Memcg swap control includes memory force swapin, swapfile control
+	  and swap limit.
+
 config MEMCG_KMEM
 	bool
 	depends on MEMCG
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a707014b644d..98fea5dacd40 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4236,6 +4236,43 @@ static int sysctl_memcg_oom_prio_handler(struct ctl_table *table, int write,
 #endif
 #endif
+#ifdef CONFIG_MEMCG_SWAP_QOS +DEFINE_STATIC_KEY_FALSE(memcg_swap_qos_key); + +#ifdef CONFIG_SYSCTL +static int sysctl_memcg_swap_qos_stat; + +static void memcg_swap_qos_reset(void) +{ +} + +static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + int qos_stat_old = sysctl_memcg_swap_qos_stat; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; + + if (write) { + if (qos_stat_old == sysctl_memcg_swap_qos_stat) + return 0; + + if (sysctl_memcg_swap_qos_stat) { + memcg_swap_qos_reset(); + static_branch_enable(&memcg_swap_qos_key); + } else { + static_branch_disable(&memcg_swap_qos_key); + } + } + + return 0; +} +#endif +#endif + #ifdef CONFIG_NUMA
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) @@ -8022,6 +8059,17 @@ static struct ctl_table mem_cgroup_sysctls[] = { .extra2 = SYSCTL_ONE, }, #endif +#ifdef CONFIG_MEMCG_SWAP_QOS + { + .procname = "memcg_swap_qos_enable", + .data = &sysctl_memcg_swap_qos_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_memcg_swap_qos_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif };
static __init int mem_cgroup_sysctls_init(void)
From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
Add a new per-memcg swapin interface that loads a cgroup's swapped-out pages back into memory in advance, to improve access efficiency.

Usage:
  # echo 0 > memory.force_swapin
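A sketch of the intended flow (the cgroup path is a placeholder), pairing the typed reclaim from the first patch with the new interface, assuming both files are available on the hierarchy in use:

  cd /sys/fs/cgroup/memory/test
  echo "1G type=anon" > memory.reclaim   # push cold anon pages out to swap
  echo 0 > memory.force_swapin           # fault the cgroup's swapped pages back in

Any write triggers the swap-in: the handler scans every task in the cgroup with mem_cgroup_scan_tasks(), walks each task's VMAs, and batches the read I/O under a block plug.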
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 .../admin-guide/cgroup-v1/memory.rst |  1 +
 include/linux/mm.h                   |  4 ++
 mm/madvise.c                         | 31 ++++++++++---
 mm/memcontrol.c                      | 45 +++++++++++++++++++
 4 files changed, 76 insertions(+), 5 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 66ae60dead2e..9bbd489136b2 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -83,6 +83,7 @@ Brief summary of control files. This knob is deprecated and shouldn't be used. memory.force_empty trigger forced page reclaim + memory.force_swapin trigger forced swapin anon page memory.pressure_level set memory pressure notifications memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bacc4da324..f078aa6b493c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3313,6 +3313,10 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
+#ifdef CONFIG_MEMCG_SWAP_QOS +extern void force_swapin_vma(struct vm_area_struct *vma); +#endif + #ifdef CONFIG_MMU extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, diff --git a/mm/madvise.c b/mm/madvise.c index 4dded5d27e7e..2d56815daff2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -185,6 +185,11 @@ static int madvise_update_vma(struct vm_area_struct *vma, return 0; }
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); +} + #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) @@ -273,6 +278,27 @@ static void shmem_swapin_range(struct vm_area_struct *vma, } #endif /* CONFIG_SWAP */
+#ifdef CONFIG_MEMCG_SWAP_QOS +void force_swapin_vma(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + if (!can_madv_lru_vma(vma)) + return; + + if (!file) { + walk_page_vma(vma, &swapin_walk_ops, vma); + lru_add_drain(); + } else if (shmem_mapping(file->f_mapping)) + shmem_swapin_range(vma, vma->vm_start, + vma->vm_end, file->f_mapping); +} +#else +void force_swapin_vma(struct vm_area_struct *vma) +{ +} +#endif + /* * Schedule all required I/O operations. Do not wait for completion. */ @@ -555,11 +581,6 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); }
-static inline bool can_madv_lru_vma(struct vm_area_struct *vma) -{ - return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); -} - static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 98fea5dacd40..deebbfcd5bf6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,6 +66,11 @@ #include <linux/memcg_memfs_info.h> #include <linux/sched/isolation.h> #include <linux/parser.h> + +#ifdef CONFIG_MEMCG_SWAP_QOS +#include <linux/blkdev.h> +#endif + #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -4271,6 +4276,39 @@ static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, return 0; } #endif + +static int mem_cgroup_task_swapin(struct task_struct *task, void *arg) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + struct blk_plug plug; + VMA_ITERATOR(vmi, mm, 0); + + if (__task_is_dying(task)) + return 0; + if (!mm || !mmget_not_zero(mm)) + return 0; + + mmap_read_lock(mm); + blk_start_plug(&plug); + for_each_vma(vmi, vma) + force_swapin_vma(vma); + blk_finish_plug(&plug); + mmap_read_unlock(mm); + mmput(mm); + + return 0; +} + +static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + mem_cgroup_scan_tasks(memcg, mem_cgroup_task_swapin, NULL); + + return nbytes; +} #endif
#ifdef CONFIG_NUMA @@ -5762,6 +5800,13 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memory_ksm_write, .seq_show = memory_ksm_show, }, +#endif +#ifdef CONFIG_MEMCG_SWAP_QOS + { + .name = "force_swapin", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memory_swapin, + }, #endif { }, /* terminate */ };
From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
memsw can't limit the usage of swap space on its own. Add a memory.swap.max interface to limit the difference between memsw.usage and memory.usage. Since a page may occupy both a swap entry and a swap cache page, this value is not exactly equal to the swap usage.
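A usage sketch (the 512M value is arbitrary); the limit is written with the usual size suffixes, read back in bytes, and "max" lifts it:

  echo 512M > memory.swap.max   # cap the cgroup's additional swap-out
  cat memory.swap.max           # reads back as bytes
  echo max > memory.swap.max    # restore the default (unlimited)

The check walks up the hierarchy, so the effective headroom is the minimum over all ancestors of swap_dev->max minus (memsw.usage - memory.usage); a swap-out that would exceed it fails with -ENOMEM.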
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 .../admin-guide/cgroup-v1/memory.rst |   1 +
 include/linux/memcontrol.h           |   8 ++
 mm/memcontrol.c                      | 134 +++++++++++++++++-
 3 files changed, 142 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 9bbd489136b2..f3ce13312604 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -84,6 +84,7 @@ Brief summary of control files. used. memory.force_empty trigger forced page reclaim memory.force_swapin trigger forced swapin anon page + memory.swap.max set/show limit for swap memory.pressure_level set memory pressure notifications memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 578f43b68392..7183a3767bf1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -201,6 +201,10 @@ struct obj_cgroup { }; };
+struct swap_device { + unsigned long max; +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -350,6 +354,10 @@ struct mem_cgroup { bool high_async_reclaim; #endif
+#ifdef CONFIG_MEMCG_SWAP_QOS + struct swap_device *swap_dev; +#endif + struct mem_cgroup_per_node *nodeinfo[]; };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index deebbfcd5bf6..c4eb95b331d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4249,6 +4249,10 @@ static int sysctl_memcg_swap_qos_stat;
static void memcg_swap_qos_reset(void) { + struct mem_cgroup *memcg; + + for_each_mem_cgroup(memcg) + WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); }
static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, @@ -4309,6 +4313,122 @@ static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
return nbytes; } + +static int memcg_alloc_swap_device(struct mem_cgroup *memcg) +{ + memcg->swap_dev = kmalloc(sizeof(struct swap_device), GFP_KERNEL); + if (!memcg->swap_dev) + return -ENOMEM; + return 0; +} + +static void memcg_free_swap_device(struct mem_cgroup *memcg) +{ + if (!memcg->swap_dev) + return; + + kfree(memcg->swap_dev); + memcg->swap_dev = NULL; +} + +static void memcg_swap_device_init(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ + if (!static_branch_likely(&memcg_swap_qos_key) || !parent) + WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); + else + WRITE_ONCE(memcg->swap_dev->max, + READ_ONCE(parent->swap_dev->max)); +} + +u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (!static_branch_likely(&memcg_swap_qos_key)) + return PAGE_COUNTER_MAX * PAGE_SIZE; + + return READ_ONCE(memcg->swap_dev->max) * PAGE_SIZE; +} + +static ssize_t memcg_swapmax_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return -EACCES; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + WRITE_ONCE(memcg->swap_dev->max, max); + + return nbytes; +} + +static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry) +{ + struct mem_cgroup *memcg, *target_memcg; + unsigned long swap_usage; + unsigned long swap_limit; + long nr_swap_pages = PAGE_COUNTER_MAX; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return 0; + + if (!entry.val) + return 0; + + rcu_read_lock(); + target_memcg = folio_memcg(folio); + if (!target_memcg || mem_cgroup_is_root(target_memcg) || + !css_tryget_online(&target_memcg->css)) { + rcu_read_unlock(); + return 0; + } + + rcu_read_unlock(); + + for (memcg = target_memcg; memcg != root_mem_cgroup; + memcg = parent_mem_cgroup(memcg)) { + swap_limit = READ_ONCE(memcg->swap_dev->max); + swap_usage = page_counter_read(&memcg->memsw) - + page_counter_read(&memcg->memory); + nr_swap_pages = min_t(long, nr_swap_pages, + swap_limit - swap_usage); + } + css_put(&target_memcg->css); + + if (folio_nr_pages(folio) > nr_swap_pages) + return -ENOMEM; + + return 0; +} + +#else +static int memcg_alloc_swap_device(struct mem_cgroup *memcg) +{ + return 0; +} + +static void memcg_free_swap_device(struct mem_cgroup *memcg) +{ +} + +static void memcg_swap_device_init(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ +} + +static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry) +{ + return 0; +} #endif
#ifdef CONFIG_NUMA @@ -5807,6 +5927,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .write = memory_swapin, }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memcg_swapmax_write, + .read_u64 = memcg_swapmax_read, + }, #endif { }, /* terminate */ }; @@ -5943,6 +6069,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_node_info(memcg, node); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); + memcg_free_swap_device(memcg); kfree(memcg); }
@@ -5964,6 +6091,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return ERR_PTR(error);
+ if (memcg_alloc_swap_device(memcg)) + goto fail; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); if (memcg->id.id < 0) { @@ -6047,12 +6177,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); + memcg_swap_device_init(memcg, parent); } else { init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); + memcg_swap_device_init(memcg, NULL);
root_mem_cgroup = memcg; return &memcg->css; @@ -8301,7 +8433,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned short oldid;
if (do_memsw_account()) - return 0; + return mem_cgroup_check_swap_for_v1(folio, entry);
memcg = folio_memcg(folio);
From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
With the memory.swapfile interface, the swap devices available to a memcg can be limited. The accepted parameters are 'all', 'none', and the path of a valid swap device.

Usage:
  echo /dev/zram0 > memory.swapfile
If the swap device goes offline, memory.swapfile falls back to 'none'.
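As a usage sketch (the zram device path is an example), the file accepts the following values and reads back the current setting:

  echo /dev/zram0 > memory.swapfile   # only this active swap device may be used
  echo all > memory.swapfile          # default: any swap device may be used
  echo none > memory.swapfile         # this cgroup may not use swap at all
  cat memory.swapfile                 # prints the device path, 'all' or 'none'

Writing a path only succeeds if it matches an active (SWP_WRITEOK) swap device; swapoff on the assigned device resets the cgroup back to 'none' via memcg_remove_swapfile().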
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 .../admin-guide/cgroup-v1/memory.rst |   1 +
 include/linux/memcontrol.h           |  18 +++
 include/linux/swap.h                 |  10 +-
 mm/memcontrol.c                      | 150 +++++++++++++++++-
 mm/swap_slots.c                      |  15 +-
 mm/swapfile.c                        |  95 ++++++++++-
 6 files changed, 278 insertions(+), 11 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index f3ce13312604..b8929b2a7e0b 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -85,6 +85,7 @@ Brief summary of control files. memory.force_empty trigger forced page reclaim memory.force_swapin trigger forced swapin anon page memory.swap.max set/show limit for swap + memory.swapfile set/show available swap file memory.pressure_level set memory pressure notifications memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7183a3767bf1..3c28d6580258 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -54,6 +54,11 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, };
+enum {
+	SWAP_TYPE_ALL = -1,	/* allowed to use all swap files */
+	SWAP_TYPE_NONE = -2,	/* prohibited from using any swap file */
+};
+
 struct mem_cgroup_reclaim_cookie {
 	pg_data_t *pgdat;
 	unsigned int generation;
@@ -203,6 +208,7 @@ struct obj_cgroup {
struct swap_device { unsigned long max; + int type; };
/* @@ -1203,6 +1209,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned);
+int memcg_get_swap_type(struct folio *folio); +void memcg_remove_swapfile(int type); + #else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0 @@ -1639,6 +1648,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, static inline void memcg_print_bad_task(struct oom_control *oc) { } + +static inline int memcg_get_swap_type(struct folio *folio) +{ + return SWAP_TYPE_ALL; +} + +static inline void memcg_remove_swapfile(int type) +{ +} #endif /* CONFIG_MEMCG */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) diff --git a/include/linux/swap.h b/include/linux/swap.h index 1c4c86812e96..c57e4373a095 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -488,7 +488,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size, + int type); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); @@ -516,6 +517,13 @@ static inline void put_swap_device(struct swap_info_struct *si) percpu_ref_put(&si->users); }
+#ifdef CONFIG_MEMCG_SWAP_QOS +extern int write_swapfile_for_memcg(struct address_space *mapping, + int *swap_type); +extern void read_swapfile_for_memcg(struct seq_file *m, int type); +extern long get_nr_swap_pages_type(int type); +#endif + #else /* CONFIG_SWAP */ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c4eb95b331d8..27801bf09e09 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4251,8 +4251,10 @@ static void memcg_swap_qos_reset(void) { struct mem_cgroup *memcg;
- for_each_mem_cgroup(memcg) + for_each_mem_cgroup(memcg) { WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + } }
static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, @@ -4334,11 +4336,15 @@ static void memcg_free_swap_device(struct mem_cgroup *memcg) static void memcg_swap_device_init(struct mem_cgroup *memcg, struct mem_cgroup *parent) { - if (!static_branch_likely(&memcg_swap_qos_key) || !parent) + if (!static_branch_likely(&memcg_swap_qos_key) || !parent) { WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); - else + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + } else { WRITE_ONCE(memcg->swap_dev->max, READ_ONCE(parent->swap_dev->max)); + WRITE_ONCE(memcg->swap_dev->type, + READ_ONCE(parent->swap_dev->type)); + } }
u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -4410,6 +4416,121 @@ static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry) return 0; }
+static int memcg_swapfile_read(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + int type; + + if (!static_branch_likely(&memcg_swap_qos_key)) { + seq_printf(m, "all\n"); + return 0; + } + + type = READ_ONCE(memcg->swap_dev->type); + if (type == SWAP_TYPE_NONE) + seq_printf(m, "none\n"); + else if (type == SWAP_TYPE_ALL) + seq_printf(m, "all\n"); + else + read_swapfile_for_memcg(m, type); + return 0; +} + +static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct filename *pathname; + struct file *swapfile; + int ret; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return -EACCES; + + buf = strstrip(buf); + + if (!strcmp(buf, "none")) { + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE); + return nbytes; + } else if (!strcmp(buf, "all")) { + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + return nbytes; + } + + pathname = getname_kernel(buf); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + + swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swapfile)) { + putname(pathname); + return PTR_ERR(swapfile); + } + ret = write_swapfile_for_memcg(swapfile->f_mapping, + &memcg->swap_dev->type); + filp_close(swapfile, NULL); + putname(pathname); + + return ret < 0 ? ret : nbytes; +} + +int memcg_get_swap_type(struct folio *folio) +{ + struct mem_cgroup *memcg; + int type; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return SWAP_TYPE_ALL; + + if (!folio) + return SWAP_TYPE_ALL; + + rcu_read_lock(); + memcg = folio_memcg(folio); + if (!memcg || mem_cgroup_is_root(memcg)) { + rcu_read_unlock(); + return SWAP_TYPE_ALL; + } + + if (!css_tryget_online(&memcg->css)) { + rcu_read_unlock(); + return SWAP_TYPE_ALL; + } + rcu_read_unlock(); + + type = READ_ONCE(memcg->swap_dev->type); + css_put(&memcg->css); + return type; +} + +void memcg_remove_swapfile(int type) +{ + struct mem_cgroup *memcg; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return; + + for_each_mem_cgroup(memcg) + if (READ_ONCE(memcg->swap_dev->type) == type) + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE); +} + +static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg) +{ + int type; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return mem_cgroup_get_nr_swap_pages(memcg); + + type = READ_ONCE(memcg->swap_dev->type); + if (type == SWAP_TYPE_ALL) + return mem_cgroup_get_nr_swap_pages(memcg); + else if (type == SWAP_TYPE_NONE) + return 0; + else + return get_nr_swap_pages_type(type); +} + #else static int memcg_alloc_swap_device(struct mem_cgroup *memcg) { @@ -4429,6 +4550,21 @@ static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry) { return 0; } + +int memcg_get_swap_type(struct folio *folio) +{ + return SWAP_TYPE_ALL; +} + +void memcg_remove_swapfile(int type) +{ +} + +static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg) +{ + return mem_cgroup_get_nr_swap_pages(memcg); +} + #endif
#ifdef CONFIG_NUMA @@ -5933,6 +6069,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memcg_swapmax_write, .read_u64 = memcg_swapmax_read, }, + { + .name = "swapfile", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memcg_swapfile_write, + .seq_show = memcg_swapfile_read, + }, #endif { }, /* terminate */ }; @@ -7575,7 +7717,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
/* If only reclaim swap pages, check swap space at first. */ if ((reclaim_options & MEMCG_RECLAIM_NOT_FILE) && - (mem_cgroup_get_nr_swap_pages(memcg) <= 0)) + (mem_cgroup_get_nr_swap_pages_type(memcg) <= 0)) return -EAGAIN;
/* diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0bec1f705f8e..203b75ba1b10 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1); + cache->slots, 1, SWAP_TYPE_ALL);
return cache->nr; } @@ -303,12 +303,18 @@ swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; struct swap_slots_cache *cache; + int type;
entry.val = 0;
+ type = memcg_get_swap_type(folio); + if (type == SWAP_TYPE_NONE) + goto out; + + if (folio_test_large(folio)) { if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) - get_swap_pages(1, &entry, folio_nr_pages(folio)); + get_swap_pages(1, &entry, folio_nr_pages(folio), type); goto out; }
@@ -323,7 +329,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio) */ cache = raw_cpu_ptr(&swp_slots);
- if (likely(check_cache_active() && cache->slots)) { + if (likely(check_cache_active() && cache->slots) && + type == SWAP_TYPE_ALL) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: @@ -340,7 +347,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio) goto out; }
- get_swap_pages(1, &entry, 1); + get_swap_pages(1, &entry, 1, type); out: if (mem_cgroup_try_charge_swap(folio, entry)) { put_swap_folio(folio, entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 4bc70f459164..54c3425a3c86 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1044,7 +1044,92 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) swap_range_free(si, offset, SWAPFILE_CLUSTER); }
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type)
+{
+	struct swap_info_struct *si;
+	unsigned int type;
+	int ret = -EINVAL;
+
+	spin_lock(&swap_lock);
+	for (type = 0; type < nr_swapfiles; type++) {
+		si = swap_info[type];
+		if ((si->flags & SWP_WRITEOK) &&
+		    (si->swap_file->f_mapping == mapping)) {
+			WRITE_ONCE(*swap_type, type);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock(&swap_lock);
+	return ret;
+}
+
+void read_swapfile_for_memcg(struct seq_file *m, int type)
+{
+	struct swap_info_struct *si;
+
+	spin_lock(&swap_lock);
+	if (type < nr_swapfiles) {
+		si = swap_info[type];
+		if (si->flags & SWP_WRITEOK) {
+			seq_file_path(m, si->swap_file, " \t\n\\");
+			seq_printf(m, "\n");
+		}
+	}
+	spin_unlock(&swap_lock);
+}
+
+long get_nr_swap_pages_type(int type)
+{
+	struct swap_info_struct *si;
+	long nr_swap_pages = 0;
+
+	spin_lock(&swap_lock);
+	if (type < nr_swapfiles) {
+		si = swap_info[type];
+		if (si->flags & SWP_WRITEOK)
+			nr_swap_pages = si->pages - si->inuse_pages;
+	}
+	spin_unlock(&swap_lock);
+
+	return nr_swap_pages;
+}
+
+static long get_avail_pages(unsigned long size, int type)
+{
+	long avail_pgs = 0;
+
+	if (type == SWAP_TYPE_ALL)
+		return atomic_long_read(&nr_swap_pages) / size;
+
+	spin_unlock(&swap_avail_lock);
+	avail_pgs = get_nr_swap_pages_type(type) / size;
+	spin_lock(&swap_avail_lock);
+	return avail_pgs;
+}
+
+static inline bool should_skip_swap_type(int swap_type, int type)
+{
+	if (type == SWAP_TYPE_ALL)
+		return false;
+
+	return (type != swap_type);
+}
+#else
+static inline long get_avail_pages(unsigned long size, int type)
+{
+	return atomic_long_read(&nr_swap_pages) / size;
+}
+
+static inline bool should_skip_swap_type(int swap_type, int type)
+{
+	return false;
+}
+#endif
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size,
+		   int type)
 {
 	unsigned long size = swap_entry_size(entry_size);
 	struct swap_info_struct *si, *next;
@@ -1057,7 +1142,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
spin_lock(&swap_avail_lock);
- avail_pgs = atomic_long_read(&nr_swap_pages) / size; + avail_pgs = get_avail_pages(size, type); if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; @@ -1074,6 +1159,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); + if (should_skip_swap_type(si->type, type)) { + spin_unlock(&si->lock); + spin_lock(&swap_avail_lock); + goto nextsi; + } if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { @@ -2514,6 +2604,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; + memcg_remove_swapfile(p->type); spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type);
From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
Since we support per-memcg swapfile control, we need per-type swap slot caches to keep allocation fast. To reduce memory waste, the per-type slot caches are only allocated when the feature is enabled or when the corresponding swap device comes online.
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 include/linux/swap.h       |   1 +
 include/linux/swap_slots.h |   2 +-
 mm/memcontrol.c            |   1 +
 mm/swap_slots.c            | 144 +++++++++++++++++++++++++++++++++----
 mm/swapfile.c              |   2 +-
 5 files changed, 136 insertions(+), 14 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h index c57e4373a095..9dc160d6fd43 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -522,6 +522,7 @@ extern int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type); extern void read_swapfile_for_memcg(struct seq_file *m, int type); extern long get_nr_swap_pages_type(int type); +void enable_swap_slots_cache_max(void); #endif
#else /* CONFIG_SWAP */ diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 15adfb8c813a..77521ac11dca 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -23,7 +23,7 @@ struct swap_slots_cache {
void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); -void enable_swap_slots_cache(void); +void enable_swap_slots_cache(int type); void free_swap_slot(swp_entry_t entry);
extern bool swap_slot_cache_enabled; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 27801bf09e09..e4ae23325efc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4274,6 +4274,7 @@ static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, if (sysctl_memcg_swap_qos_stat) { memcg_swap_qos_reset(); static_branch_enable(&memcg_swap_qos_key); + enable_swap_slots_cache_max(); } else { static_branch_disable(&memcg_swap_qos_key); } diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 203b75ba1b10..c7781364fa50 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -36,6 +36,11 @@ #include <linux/mm.h>
static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); +#ifdef CONFIG_MEMCG_SWAP_QOS +static unsigned int nr_swap_slots; +static unsigned int max_swap_slots; +static DEFINE_PER_CPU(struct swap_slots_cache [MAX_SWAPFILES], swp_type_slots); +#endif static bool swap_slot_cache_active; bool swap_slot_cache_enabled; static bool swap_slot_cache_initialized; @@ -110,7 +115,37 @@ static bool check_cache_active(void) return swap_slot_cache_active; }
-static int alloc_swap_slot_cache(unsigned int cpu) +#ifdef CONFIG_MEMCG_SWAP_QOS +static inline struct swap_slots_cache *get_slots_cache(int swap_type) +{ + if (swap_type == SWAP_TYPE_ALL) + return raw_cpu_ptr(&swp_slots); + else + return raw_cpu_ptr(&swp_type_slots[swap_type]); +} + +static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu, + int swap_type) +{ + if (swap_type == SWAP_TYPE_ALL) + return &per_cpu(swp_slots, cpu); + else + return &per_cpu(swp_type_slots, cpu)[swap_type]; +} +#else +static inline struct swap_slots_cache *get_slots_cache(int swap_type) +{ + return raw_cpu_ptr(&swp_slots); +} + +static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu, + int swap_type) +{ + return &per_cpu(swp_slots, cpu); +} +#endif + +static int alloc_swap_slot_cache_cpu_type(unsigned int cpu, int swap_type) { struct swap_slots_cache *cache; swp_entry_t *slots, *slots_ret; @@ -133,7 +168,7 @@ static int alloc_swap_slot_cache(unsigned int cpu) }
mutex_lock(&swap_slots_cache_mutex); - cache = &per_cpu(swp_slots, cpu); + cache = get_slots_cache_cpu(cpu, swap_type); if (cache->slots || cache->slots_ret) { /* cache already allocated */ mutex_unlock(&swap_slots_cache_mutex); @@ -165,13 +200,74 @@ static int alloc_swap_slot_cache(unsigned int cpu) return 0; }
-static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, - bool free_slots) +#ifdef CONFIG_MEMCG_SWAP_QOS +static int __alloc_swap_slot_cache_cpu(unsigned int cpu) +{ + int i, ret; + + ret = alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL); + if (ret) + return ret; + + for (i = 0; i < nr_swap_slots; i++) { + ret = alloc_swap_slot_cache_cpu_type(cpu, i); + if (ret) + return ret; + } + + return ret; +} + +static void alloc_swap_slot_cache_type(int type) +{ + unsigned int cpu; + + if (type >= max_swap_slots) + max_swap_slots = type + 1; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return; + + /* serialize with cpu hotplug operations */ + cpus_read_lock(); + while (type >= nr_swap_slots) { + for_each_online_cpu(cpu) + alloc_swap_slot_cache_cpu_type(cpu, nr_swap_slots); + nr_swap_slots++; + } + cpus_read_unlock(); +} + +void enable_swap_slots_cache_max(void) +{ + mutex_lock(&swap_slots_cache_enable_mutex); + if (max_swap_slots) + alloc_swap_slot_cache_type(max_swap_slots - 1); + mutex_unlock(&swap_slots_cache_enable_mutex); +} +#else +static inline int __alloc_swap_slot_cache_cpu(unsigned int cpu) +{ + return alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL); +} + +static void alloc_swap_slot_cache_type(int type) +{ +} +#endif + +static int alloc_swap_slot_cache(unsigned int cpu) +{ + return __alloc_swap_slot_cache_cpu(cpu); +} + +static void drain_slots_cache_cpu_type(unsigned int cpu, unsigned int type, + bool free_slots, int swap_type) { struct swap_slots_cache *cache; swp_entry_t *slots = NULL;
- cache = &per_cpu(swp_slots, cpu); + cache = get_slots_cache_cpu(cpu, swap_type); if ((type & SLOTS_CACHE) && cache->slots) { mutex_lock(&cache->alloc_lock); swapcache_free_entries(cache->slots + cache->cur, cache->nr); @@ -196,6 +292,30 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, } }
+#ifdef CONFIG_MEMCG_SWAP_QOS +static void __drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + int i; + + drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL); + for (i = 0; i < nr_swap_slots; i++) + drain_slots_cache_cpu_type(cpu, type, free_slots, i); +} +#else +static inline void __drain_slots_cache_cpu(unsigned int cpu, + unsigned int type, bool free_slots) +{ + drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL); +} +#endif + +static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + __drain_slots_cache_cpu(cpu, type, free_slots); +} + static void __drain_swap_slots_cache(unsigned int type) { unsigned int cpu; @@ -235,7 +355,7 @@ static int free_slot_cache(unsigned int cpu) return 0; }
-void enable_swap_slots_cache(void) +void enable_swap_slots_cache(int type) { mutex_lock(&swap_slots_cache_enable_mutex); if (!swap_slot_cache_initialized) { @@ -250,13 +370,14 @@ void enable_swap_slots_cache(void) swap_slot_cache_initialized = true; }
+ alloc_swap_slot_cache_type(type); __reenable_swap_slots_cache(); out_unlock: mutex_unlock(&swap_slots_cache_enable_mutex); }
/* called with swap slot cache's alloc lock held */ -static int refill_swap_slots_cache(struct swap_slots_cache *cache) +static int refill_swap_slots_cache(struct swap_slots_cache *cache, int type) { if (!use_swap_slot_cache) return 0; @@ -264,7 +385,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1, SWAP_TYPE_ALL); + cache->slots, 1, type);
return cache->nr; } @@ -327,10 +448,9 @@ swp_entry_t folio_alloc_swap(struct folio *folio) * The alloc path here does not touch cache->slots_ret * so cache->free_lock is not taken. */ - cache = raw_cpu_ptr(&swp_slots); + cache = get_slots_cache(type);
- if (likely(check_cache_active() && cache->slots) && - type == SWAP_TYPE_ALL) { + if (likely(check_cache_active() && cache->slots)) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: @@ -338,7 +458,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio) entry = cache->slots[cache->cur]; cache->slots[cache->cur++].val = 0; cache->nr--; - } else if (refill_swap_slots_cache(cache)) { + } else if (refill_swap_slots_cache(cache, type)) { goto repeat; } } diff --git a/mm/swapfile.c b/mm/swapfile.c index 54c3425a3c86..68859289f19e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3325,7 +3325,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (inode) inode_unlock(inode); if (!error) - enable_swap_slots_cache(); + enable_swap_slots_cache(p->type); return error; }
From: zhaoxiaoqiang11 <zhaoxiaoqiang11@jd.com>
jingdong inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
In cloud-native environments, we need to disable swap for containers and enable it only in special cases.
Usage:

  echo 2 > /proc/sys/vm/memcg_swap_qos_enable

to enable the new mode.

Note that some write transitions are invalid (see the example below).

Valid changes:
  0 => 1 (enable swap qos, SWAP_TYPE_ALL)
  0 => 2 (enable swap qos, SWAP_TYPE_NONE)
  1 => 0 (disable swap qos)
  2 => 0 (disable swap qos)

Invalid changes:
  1 => 2 (SWAP_TYPE_ALL => SWAP_TYPE_NONE)
  2 => 1 (SWAP_TYPE_NONE => SWAP_TYPE_ALL)

An invalid write returns -EINVAL.
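A sketch of exercising these transitions from the shell:

  echo 2 > /proc/sys/vm/memcg_swap_qos_enable   # 0 => 2: enabled; every memcg starts with swap disabled
  echo 1 > /proc/sys/vm/memcg_swap_qos_enable   # 2 => 1: rejected, the write returns EINVAL
  echo 0 > /proc/sys/vm/memcg_swap_qos_enable   # 2 => 0: feature disabled again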
Signed-off-by: zhaoxiaoqiang11 <zhaoxiaoqiang11@jd.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 include/linux/memcontrol.h |  5 +++++
 mm/memcontrol.c            | 26 ++++++++++++++++++++------
 2 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3c28d6580258..287d130ee969 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -383,6 +383,11 @@ static inline void memcg_print_bad_task(struct oom_control *oc)
#ifdef CONFIG_MEMCG_SWAP_QOS DECLARE_STATIC_KEY_FALSE(memcg_swap_qos_key); + +#define MEMCG_SWAP_STAT_DISABLE 0 +#define MEMCG_SWAP_STAT_ALL 1 +#define MEMCG_SWAP_STAT_NONE 2 +#define MAX_MEMCG_SWAP_TYPE MEMCG_SWAP_STAT_NONE #endif
/* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e4ae23325efc..5f2d37bc7c45 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4246,14 +4246,15 @@ DEFINE_STATIC_KEY_FALSE(memcg_swap_qos_key);
#ifdef CONFIG_SYSCTL static int sysctl_memcg_swap_qos_stat; +static int swap_qos_type_max = MAX_MEMCG_SWAP_TYPE;
-static void memcg_swap_qos_reset(void) +static void memcg_swap_qos_reset(int type) { struct mem_cgroup *memcg;
for_each_mem_cgroup(memcg) { WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); - WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + WRITE_ONCE(memcg->swap_dev->type, type); } }
@@ -4262,6 +4263,7 @@ static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, { int ret; int qos_stat_old = sysctl_memcg_swap_qos_stat; + int swap_type;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (ret) @@ -4271,12 +4273,24 @@ static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, if (qos_stat_old == sysctl_memcg_swap_qos_stat) return 0;
- if (sysctl_memcg_swap_qos_stat) { - memcg_swap_qos_reset(); + switch (sysctl_memcg_swap_qos_stat) { + case MEMCG_SWAP_STAT_DISABLE: + static_branch_disable(&memcg_swap_qos_key); + return 0; + case MEMCG_SWAP_STAT_ALL: + swap_type = SWAP_TYPE_ALL; + break; + case MEMCG_SWAP_STAT_NONE: + swap_type = SWAP_TYPE_NONE; + break; + } + + if (qos_stat_old == MEMCG_SWAP_STAT_DISABLE) { + memcg_swap_qos_reset(swap_type); static_branch_enable(&memcg_swap_qos_key); enable_swap_slots_cache_max(); } else { - static_branch_disable(&memcg_swap_qos_key); + return -EINVAL; } }
@@ -8387,7 +8401,7 @@ static struct ctl_table mem_cgroup_sysctls[] = { .mode = 0644, .proc_handler = sysctl_memcg_swap_qos_handler, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = &swap_qos_type_max, }, #endif };
From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q
CVE: NA
--------------------------------
Enable memcg swap QoS (CONFIG_MEMCG_SWAP_QOS) by default on x86_64 and arm64 platforms.
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 86370e8686f5..9d16157e539b 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -161,6 +161,7 @@ CONFIG_MEMCG=y CONFIG_MEMCG_V1_RECLAIM=y CONFIG_MEMCG_MEMFS_INFO=y CONFIG_MEMCG_OOM_PRIORITY=y +CONFIG_MEMCG_SWAP_QOS=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 1ef398404878..1a61f114f14d 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -182,6 +182,7 @@ CONFIG_MEMCG=y CONFIG_MEMCG_V1_RECLAIM=y CONFIG_MEMCG_MEMFS_INFO=y CONFIG_MEMCG_OOM_PRIORITY=y +CONFIG_MEMCG_SWAP_QOS=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y