From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QK6Q CVE: NA
--------------------------------
With the memory.swapfile interface, the swap devices available to a memcg can be restricted. The accepted values are 'all', 'none', or a valid swap device. Usage: echo /dev/zram0 > memory.swapfile
If the configured swap device is taken offline (swapoff), the memcg's swapfile setting falls back to 'none'.
Signed-off-by: Liu Shixin liushixin2@huawei.com Signed-off-by: Jinjiang Tu tujinjiang@huawei.com --- .../admin-guide/cgroup-v1/memory.rst | 1 + include/linux/memcontrol.h | 18 +++ include/linux/swap.h | 10 +- mm/memcontrol.c | 150 +++++++++++++++++- mm/swap_slots.c | 15 +- mm/swapfile.c | 95 ++++++++++- 6 files changed, 278 insertions(+), 11 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index f3ce13312604..b8929b2a7e0b 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -85,6 +85,7 @@ Brief summary of control files. memory.force_empty trigger forced page reclaim memory.force_swapin trigger forced swapin anon page memory.swap.max set/show limit for swap + memory.swapfile set/show available swap file memory.pressure_level set memory pressure notifications memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7183a3767bf1..3c28d6580258 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -54,6 +54,11 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, };
+enum { + SWAP_TYPE_ALL = -1, /* allowed to use all swap files */ + SWAP_TYPE_NONE = -2, /* prohibited from using any swap file */ +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; unsigned int generation; @@ -203,6 +208,7 @@ struct obj_cgroup {
struct swap_device { unsigned long max; + int type; };
/* @@ -1203,6 +1209,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned);
+int memcg_get_swap_type(struct folio *folio); +void memcg_remove_swapfile(int type); + #else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0 @@ -1639,6 +1648,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, static inline void memcg_print_bad_task(struct oom_control *oc) { } + +static inline int memcg_get_swap_type(struct folio *folio) +{ + return SWAP_TYPE_ALL; +} + +static inline void memcg_remove_swapfile(int type) +{ +} #endif /* CONFIG_MEMCG */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) diff --git a/include/linux/swap.h b/include/linux/swap.h index 1c4c86812e96..c57e4373a095 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -488,7 +488,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size, + int type); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); @@ -516,6 +517,13 @@ static inline void put_swap_device(struct swap_info_struct *si) percpu_ref_put(&si->users); }
+#ifdef CONFIG_MEMCG_SWAP_QOS +extern int write_swapfile_for_memcg(struct address_space *mapping, + int *swap_type); +extern void read_swapfile_for_memcg(struct seq_file *m, int type); +extern long get_nr_swap_pages_type(int type); +#endif + #else /* CONFIG_SWAP */ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c4eb95b331d8..27801bf09e09 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4251,8 +4251,10 @@ static void memcg_swap_qos_reset(void) { struct mem_cgroup *memcg;
- for_each_mem_cgroup(memcg) + for_each_mem_cgroup(memcg) { WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + } }
static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, @@ -4334,11 +4336,15 @@ static void memcg_free_swap_device(struct mem_cgroup *memcg) static void memcg_swap_device_init(struct mem_cgroup *memcg, struct mem_cgroup *parent) { - if (!static_branch_likely(&memcg_swap_qos_key) || !parent) + if (!static_branch_likely(&memcg_swap_qos_key) || !parent) { WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); - else + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + } else { WRITE_ONCE(memcg->swap_dev->max, READ_ONCE(parent->swap_dev->max)); + WRITE_ONCE(memcg->swap_dev->type, + READ_ONCE(parent->swap_dev->type)); + } }
u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -4410,6 +4416,121 @@ static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry) return 0; }
+static int memcg_swapfile_read(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + int type; + + if (!static_branch_likely(&memcg_swap_qos_key)) { + seq_printf(m, "all\n"); + return 0; + } + + type = READ_ONCE(memcg->swap_dev->type); + if (type == SWAP_TYPE_NONE) + seq_printf(m, "none\n"); + else if (type == SWAP_TYPE_ALL) + seq_printf(m, "all\n"); + else + read_swapfile_for_memcg(m, type); + return 0; +} + +static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct filename *pathname; + struct file *swapfile; + int ret; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return -EACCES; + + buf = strstrip(buf); + + if (!strcmp(buf, "none")) { + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE); + return nbytes; + } else if (!strcmp(buf, "all")) { + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + return nbytes; + } + + pathname = getname_kernel(buf); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + + swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swapfile)) { + putname(pathname); + return PTR_ERR(swapfile); + } + ret = write_swapfile_for_memcg(swapfile->f_mapping, + &memcg->swap_dev->type); + filp_close(swapfile, NULL); + putname(pathname); + + return ret < 0 ? 
ret : nbytes; +} + +int memcg_get_swap_type(struct folio *folio) +{ + struct mem_cgroup *memcg; + int type; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return SWAP_TYPE_ALL; + + if (!folio) + return SWAP_TYPE_ALL; + + rcu_read_lock(); + memcg = folio_memcg(folio); + if (!memcg || mem_cgroup_is_root(memcg)) { + rcu_read_unlock(); + return SWAP_TYPE_ALL; + } + + if (!css_tryget_online(&memcg->css)) { + rcu_read_unlock(); + return SWAP_TYPE_ALL; + } + rcu_read_unlock(); + + type = READ_ONCE(memcg->swap_dev->type); + css_put(&memcg->css); + return type; +} + +void memcg_remove_swapfile(int type) +{ + struct mem_cgroup *memcg; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return; + + for_each_mem_cgroup(memcg) + if (READ_ONCE(memcg->swap_dev->type) == type) + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE); +} + +static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg) +{ + int type; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return mem_cgroup_get_nr_swap_pages(memcg); + + type = READ_ONCE(memcg->swap_dev->type); + if (type == SWAP_TYPE_ALL) + return mem_cgroup_get_nr_swap_pages(memcg); + else if (type == SWAP_TYPE_NONE) + return 0; + else + return get_nr_swap_pages_type(type); +} + #else static int memcg_alloc_swap_device(struct mem_cgroup *memcg) { @@ -4429,6 +4550,21 @@ static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry) { return 0; } + +int memcg_get_swap_type(struct folio *folio) +{ + return SWAP_TYPE_ALL; +} + +void memcg_remove_swapfile(int type) +{ +} + +static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg) +{ + return mem_cgroup_get_nr_swap_pages(memcg); +} + #endif
#ifdef CONFIG_NUMA @@ -5933,6 +6069,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memcg_swapmax_write, .read_u64 = memcg_swapmax_read, }, + { + .name = "swapfile", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memcg_swapfile_write, + .seq_show = memcg_swapfile_read, + }, #endif { }, /* terminate */ }; @@ -7575,7 +7717,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
/* If only reclaim swap pages, check swap space at first. */ if ((reclaim_options & MEMCG_RECLAIM_NOT_FILE) && - (mem_cgroup_get_nr_swap_pages(memcg) <= 0)) + (mem_cgroup_get_nr_swap_pages_type(memcg) <= 0)) return -EAGAIN;
/* diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0bec1f705f8e..203b75ba1b10 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1); + cache->slots, 1, SWAP_TYPE_ALL);
return cache->nr; } @@ -303,12 +303,18 @@ swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; struct swap_slots_cache *cache; + int type;
entry.val = 0;
+ type = memcg_get_swap_type(folio); + if (type == SWAP_TYPE_NONE) + goto out; + + if (folio_test_large(folio)) { if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) - get_swap_pages(1, &entry, folio_nr_pages(folio)); + get_swap_pages(1, &entry, folio_nr_pages(folio), type); goto out; }
@@ -323,7 +329,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio) */ cache = raw_cpu_ptr(&swp_slots);
- if (likely(check_cache_active() && cache->slots)) { + if (likely(check_cache_active() && cache->slots) && + type == SWAP_TYPE_ALL) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: @@ -340,7 +347,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio) goto out; }
- get_swap_pages(1, &entry, 1); + get_swap_pages(1, &entry, 1, type); out: if (mem_cgroup_try_charge_swap(folio, entry)) { put_swap_folio(folio, entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 4bc70f459164..54c3425a3c86 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1044,7 +1044,92 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) swap_range_free(si, offset, SWAPFILE_CLUSTER); }
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) +#ifdef CONFIG_MEMCG_SWAP_QOS +int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) +{ + struct swap_info_struct *si; + unsigned int type; + int ret = -EINVAL; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + si = swap_info[type]; + if ((si->flags & SWP_WRITEOK) && + (si->swap_file->f_mapping == mapping)) { + WRITE_ONCE(*swap_type, type); + ret = 0; + break; + } + } + spin_unlock(&swap_lock); + return ret; +} + +void read_swapfile_for_memcg(struct seq_file *m, int type) +{ + struct swap_info_struct *si; + + spin_lock(&swap_lock); + if (type < nr_swapfiles) { + si = swap_info[type]; + if (si->flags & SWP_WRITEOK) { + seq_file_path(m, si->swap_file, "\t\n\\"); + seq_printf(m, "\n"); + } + } + spin_unlock(&swap_lock); +} + +long get_nr_swap_pages_type(int type) +{ + struct swap_info_struct *si; + long nr_swap_pages = 0; + + spin_lock(&swap_lock); + if (type < nr_swapfiles) { + si = swap_info[type]; + if (si->flags & SWP_WRITEOK) + nr_swap_pages = si->pages - si->inuse_pages; + } + spin_unlock(&swap_lock); + + return nr_swap_pages; +} + +static long get_avail_pages(unsigned long size, int type) +{ + long avail_pgs = 0; + + if (type == SWAP_TYPE_ALL) + return atomic_long_read(&nr_swap_pages) / size; + + spin_unlock(&swap_avail_lock); + avail_pgs = get_nr_swap_pages_type(type) / size; + spin_lock(&swap_avail_lock); + return avail_pgs; +} + +static inline bool should_skip_swap_type(int swap_type, int type) +{ + if (type == SWAP_TYPE_ALL) + return false; + + return (type != swap_type); +} +#else +static inline long get_avail_pages(unsigned long size, int type) +{ + return atomic_long_read(&nr_swap_pages) / size; +} + +static inline bool should_skip_swap_type(int swap_type, int type) +{ + return false; +} +#endif + +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size, + int type) { unsigned long size = swap_entry_size(entry_size);
struct swap_info_struct *si, *next; @@ -1057,7 +1142,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
spin_lock(&swap_avail_lock);
- avail_pgs = atomic_long_read(&nr_swap_pages) / size; + avail_pgs = get_avail_pages(size, type); if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; @@ -1074,6 +1159,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); + if (should_skip_swap_type(si->type, type)) { + spin_unlock(&si->lock); + spin_lock(&swap_avail_lock); + goto nextsi; + } if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { @@ -2514,6 +2604,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; + memcg_remove_swapfile(p->type); spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type);