hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7CGGT CVE: NA
--------------------------------
With the memory.swapfile interface, the swap devices available to a memcg can be restricted. The accepted values are 'all', 'none', or the path of a valid swap device. Usage: echo /dev/zram0 > memory.swapfile
If the swap device goes offline, memory.swapfile falls back to 'none'.
Signed-off-by: Liu Shixin liushixin2@huawei.com --- .../admin-guide/cgroup-v1/memory.rst | 1 + include/linux/memcontrol.h | 18 +++ include/linux/swap.h | 10 +- mm/memcontrol.c | 150 +++++++++++++++++- mm/swap_slots.c | 14 +- mm/swapfile.c | 100 +++++++++++- 6 files changed, 282 insertions(+), 11 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 962a80d44744..3891916a0671 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -80,6 +80,7 @@ Brief summary of control files. memory.force_empty trigger forced page reclaim memory.force_swapin trigger forced swapin anon page memory.swap.max set/show limit for swap + memory.swapfile set/show available swap file memory.pressure_level set memory pressure notifications memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 354ef36c4fe5..16b3666393f2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -50,6 +50,11 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, };
+enum { /* special values of swap_device.type; other values are swap device indexes */ + SWAP_TYPE_ALL = -1, /* allowed to use all swap files */ + SWAP_TYPE_NONE = -2, /* prohibited from using any swap file */ +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; unsigned int generation; @@ -242,6 +247,7 @@ struct obj_cgroup {
struct swap_device { unsigned long max; + int type; };
/* @@ -1307,6 +1313,9 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
int mem_cgroup_force_empty(struct mem_cgroup *memcg);
+int memcg_get_swap_type(struct page *page); +void memcg_remove_swapfile(int type); + #else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0 @@ -1714,6 +1723,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, static inline void memcg_print_bad_task(struct oom_control *oc) { } + +static inline int memcg_get_swap_type(struct page *page) +{ + return SWAP_TYPE_ALL; +} + +static inline void memcg_remove_swapfile(int type) +{ +} #endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 65626521ae2b..c0be56d09b0b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -511,11 +511,14 @@ static inline long get_nr_swap_pages(void) return atomic_long_read(&nr_swap_pages); }
+extern long get_nr_swap_pages_type(int type); + extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size, + int type); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); @@ -547,6 +550,11 @@ static inline void put_swap_device(struct swap_info_struct *si) percpu_ref_put(&si->sei->users); }
+#ifdef CONFIG_MEMCG_SWAP_QOS +extern int write_swapfile_for_memcg(struct address_space *mapping, + int *swap_type); +extern void read_swapfile_for_memcg(struct seq_file *m, int type); +#endif #else /* CONFIG_SWAP */
static inline int swap_readpage(struct page *page, bool do_poll) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce59453f9d81..c26128013384 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4065,8 +4065,10 @@ static void memcg_swap_qos_reset(void) { struct mem_cgroup *memcg;
- for_each_mem_cgroup(memcg) + for_each_mem_cgroup(memcg) { WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + } }
static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write, @@ -4157,11 +4159,15 @@ static void memcg_free_swap_device(struct mem_cgroup *memcg) static void memcg_swap_device_init(struct mem_cgroup *memcg, struct mem_cgroup *parent) { - if (!static_branch_likely(&memcg_swap_qos_key) || !parent) + if (!static_branch_likely(&memcg_swap_qos_key) || !parent) { WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX); - else + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + } else { WRITE_ONCE(memcg->swap_dev->max, READ_ONCE(parent->swap_dev->max)); + WRITE_ONCE(memcg->swap_dev->type, + READ_ONCE(parent->swap_dev->type)); + } }
u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -4235,6 +4241,121 @@ static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry) return 0; }
+static int memcg_swapfile_read(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + int type; /* SWAP_TYPE_* or a swap device index */ + + if (!static_branch_likely(&memcg_swap_qos_key)) { /* swap QoS disabled: always report "all" */ + seq_puts(m, "all\n"); + return 0; + } + + type = READ_ONCE(memcg->swap_dev->type); + if (type == SWAP_TYPE_NONE) + seq_puts(m, "none\n"); + else if (type == SWAP_TYPE_ALL) + seq_puts(m, "all\n"); + else + read_swapfile_for_memcg(m, type); /* prints the path of the bound swap file */ + return 0; +} + +static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct filename *pathname; + struct file *swapfile; + int ret; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return -EACCES; + + buf = strstrip(buf); /* trim leading/trailing whitespace before matching keywords */ + + if (!strcmp(buf, "none")) { + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE); + return nbytes; + } else if (!strcmp(buf, "all")) { + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL); + return nbytes; + } + + pathname = getname_kernel(buf); /* any other input is treated as a path */ + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + + swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swapfile)) { + putname(pathname); + return PTR_ERR(swapfile); + } + ret = write_swapfile_for_memcg(swapfile->f_mapping, + &memcg->swap_dev->type); /* stores the matching swap type, or fails with a negative errno */ + filp_close(swapfile, NULL); + putname(pathname); + + return ret < 0 ? 
ret : nbytes; +} + +int memcg_get_swap_type(struct page *page) +{ + struct mem_cgroup *memcg; + int type; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return SWAP_TYPE_ALL; + + if (!page) + return SWAP_TYPE_ALL; + + rcu_read_lock(); + memcg = page_memcg(page); + if (!memcg || mem_cgroup_is_root(memcg)) { /* no memcg or root memcg: unrestricted */ + rcu_read_unlock(); + return SWAP_TYPE_ALL; + } + + if (!css_tryget_online(&memcg->css)) { /* could not pin an online memcg: unrestricted */ + rcu_read_unlock(); + return SWAP_TYPE_ALL; + } + rcu_read_unlock(); + + type = READ_ONCE(memcg->swap_dev->type); + css_put(&memcg->css); + return type; +} + +void memcg_remove_swapfile(int type) +{ + struct mem_cgroup *memcg; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return; + + for_each_mem_cgroup(memcg) /* every memcg bound to this swap type falls back to "none" */ + if (READ_ONCE(memcg->swap_dev->type) == type) + WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE); +} + +static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg) +{ + int type; + + if (!static_branch_likely(&memcg_swap_qos_key)) + return mem_cgroup_get_nr_swap_pages(memcg); + + type = READ_ONCE(memcg->swap_dev->type); + if (type == SWAP_TYPE_ALL) + return mem_cgroup_get_nr_swap_pages(memcg); + else if (type == SWAP_TYPE_NONE) + return 0; + else + return get_nr_swap_pages_type(type); /* free pages of the single bound device */ +} + #else static int memcg_alloc_swap_device(struct mem_cgroup *memcg) { @@ -4254,6 +4375,21 @@ static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry) { return 0; } + +int memcg_get_swap_type(struct page *page) +{ + return SWAP_TYPE_ALL; +} + +void memcg_remove_swapfile(int type) +{ +} + +static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg) +{ + return mem_cgroup_get_nr_swap_pages(memcg); +} + #endif
#ifdef CONFIG_NUMA @@ -5523,7 +5659,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
/* If only reclaim swap pages, check swap space at first. */ if ((reclaim_options & MEMCG_RECLAIM_NOT_FILE) && - (mem_cgroup_get_nr_swap_pages(memcg) <= 0)) + (mem_cgroup_get_nr_swap_pages_type(memcg) <= 0)) return -EAGAIN;
/* This is the final attempt, drain percpu lru caches in the @@ -5960,6 +6096,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memcg_swapmax_write, .read_u64 = memcg_swapmax_read, }, + { + .name = "swapfile", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memcg_swapfile_write, + .seq_show = memcg_swapfile_read, + }, #endif { .name = "high_async_ratio", diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0357fbe70645..3d4e4c230305 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -266,7 +266,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1); + cache->slots, 1, SWAP_TYPE_ALL);
return cache->nr; } @@ -307,12 +307,17 @@ swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry; struct swap_slots_cache *cache; + int type;
entry.val = 0;
+ type = memcg_get_swap_type(page); + if (type == SWAP_TYPE_NONE) + goto out; + if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, &entry, HPAGE_PMD_NR); + get_swap_pages(1, &entry, HPAGE_PMD_NR, type); goto out; }
@@ -327,7 +332,8 @@ swp_entry_t get_swap_page(struct page *page) */ cache = raw_cpu_ptr(&swp_slots);
- if (likely(check_cache_active() && cache->slots)) { + if (likely(check_cache_active() && cache->slots) && + type == SWAP_TYPE_ALL) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: @@ -344,7 +350,7 @@ swp_entry_t get_swap_page(struct page *page) goto out; }
- get_swap_pages(1, &entry, 1); + get_swap_pages(1, &entry, 1, type); out: if (mem_cgroup_try_charge_swap(page, entry)) { put_swap_page(page, entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 14e2396fa8a3..2134e1b83ccb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1056,7 +1056,97 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) +#ifdef CONFIG_MEMCG_SWAP_QOS +int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) +{ + struct swap_info_struct *si; + unsigned int type; + int ret = -EINVAL; /* returned when no writable swap device uses this mapping */ + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + si = swap_info[type]; + if ((si->flags & SWP_WRITEOK) && + (si->swap_file->f_mapping == mapping)) { + WRITE_ONCE(*swap_type, type); + ret = 0; + break; + } + } + spin_unlock(&swap_lock); + return ret; +} + +void read_swapfile_for_memcg(struct seq_file *m, int type) +{ + struct swap_info_struct *si; + + spin_lock(&swap_lock); + if (type < nr_swapfiles) { + si = swap_info[type]; + if (si->flags & SWP_WRITEOK) { /* nothing is printed for a device that is not writable */ + seq_file_path(m, si->swap_file, "\t\n\\"); /* escape tab, newline and backslash in the path */ + seq_putc(m, '\n'); + } + } + spin_unlock(&swap_lock); +} + +long get_nr_swap_pages_type(int type) +{ + struct swap_info_struct *si; + long nr_swap_pages = 0; + + spin_lock(&swap_lock); + if (type < nr_swapfiles) { + si = swap_info[type]; + if (si->flags & SWP_WRITEOK) + nr_swap_pages = si->pages - si->inuse_pages; + } + spin_unlock(&swap_lock); + + return nr_swap_pages; +} + +static long get_avail_pages(unsigned long size, int type) +{ + long avail_pgs = 0; + + if (type == SWAP_TYPE_ALL) + return atomic_long_read(&nr_swap_pages) / size; + + spin_unlock(&swap_avail_lock); /* drop swap_avail_lock while get_nr_swap_pages_type() takes swap_lock */ + avail_pgs = get_nr_swap_pages_type(type) / size; + spin_lock(&swap_avail_lock); + return avail_pgs; +} + +static inline bool should_skip_swap_type(int swap_type, int type) +{ + if (type == SWAP_TYPE_ALL) + return false; + + return (type != swap_type); +} +#else +long get_nr_swap_pages_type(int type) +{ + return 0; +} + +static inline long get_avail_pages(unsigned long size, int type) +{ + return atomic_long_read(&nr_swap_pages) / size; +} + +static inline bool should_skip_swap_type(int swap_type, int type) +{ + return false; +} +#endif + +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size, +
int type) { unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; @@ -1069,7 +1159,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
spin_lock(&swap_avail_lock);
- avail_pgs = atomic_long_read(&nr_swap_pages) / size; + avail_pgs = get_avail_pages(size, type); if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; @@ -1086,6 +1176,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); + if (should_skip_swap_type(si->type, type)) { + spin_unlock(&si->lock); + spin_lock(&swap_avail_lock); + goto nextsi; + } if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { @@ -2703,6 +2798,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) cluster_info = p->cluster_info; p->cluster_info = NULL; frontswap_map = frontswap_map_get(p); + memcg_remove_swapfile(p->type); spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type);