From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 186182, https://gitee.com/openeuler/kernel/issues/I4SBQX CVE: NA
--------------------------------
Support printing rootfs files and tmpfs files that have pages charged to a given memory cgroup. The file information can be printed through the interface "memory.memfs_files_info" or when OOM is triggered.
To avoid flooding the log, the maximum number of files printed when OOM is triggered is limited through the interface "max_print_files_in_oom". To filter out small files, the minimum size of files that can be printed is limited through the interface "size_threshold".
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/vm/memcg_memfs_info.rst | 40 ++++ include/linux/memcg_memfs_info.h | 21 ++ init/Kconfig | 10 + mm/Makefile | 1 + mm/memcg_memfs_info.c | 321 ++++++++++++++++++++++++++ mm/memcontrol.c | 11 + 6 files changed, 404 insertions(+) create mode 100644 Documentation/vm/memcg_memfs_info.rst create mode 100644 include/linux/memcg_memfs_info.h create mode 100644 mm/memcg_memfs_info.c
diff --git a/Documentation/vm/memcg_memfs_info.rst b/Documentation/vm/memcg_memfs_info.rst new file mode 100644 index 0000000000000..aff432d125e52 --- /dev/null +++ b/Documentation/vm/memcg_memfs_info.rst @@ -0,0 +1,40 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +================ +Memcg Memfs Info +================ + +Overview +======== + +Support printing rootfs files and tmpfs files that have pages charged +to a given memory cgroup. The file information can be printed through +the interface "memory.memfs_files_info" or when OOM is triggered. + +User control +============ + +1. /sys/kernel/mm/memcg_memfs_info/enable +----------------------------------------- + +Boolean type. The default value is 0, set it to 1 to enable the feature. + +2. /sys/kernel/mm/memcg_memfs_info/max_print_files_in_oom +--------------------------------------------------------- + +Unsigned long type. The default value is 500, which is the maximum number of +files that can be printed to the console when OOM is triggered. + +3. /sys/kernel/mm/memcg_memfs_info/size_threshold +------------------------------------------------- + +Unsigned long type. The default value is 0. It is the minimum charged memory +size (in bytes) a file must have in the cgroup in order to be printed. + +4. /sys/fs/cgroup/memory/<memory>/memory.memfs_files_info +--------------------------------------------------------- + +Outputs the files that use memory in this memory cgroup. 
+ +--- +Liu Shixin, Jan 2022 diff --git a/include/linux/memcg_memfs_info.h b/include/linux/memcg_memfs_info.h new file mode 100644 index 0000000000000..658a91e22bd7e --- /dev/null +++ b/include/linux/memcg_memfs_info.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_MEMCG_MEMFS_INFO_H +#define _LINUX_MEMCG_MEMFS_INFO_H + +#include <linux/memcontrol.h> +#include <linux/seq_file.h> + +#ifdef CONFIG_MEMCG_MEMFS_INFO +void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, struct seq_file *m); +int mem_cgroup_memfs_files_show(struct seq_file *m, void *v); +void mem_cgroup_memfs_info_init(void); +#else +static inline void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, + struct seq_file *m) +{ +} +static inline void mem_cgroup_memfs_info_init(void) +{ +} +#endif +#endif diff --git a/init/Kconfig b/init/Kconfig index a338519692d54..1a0b15c5a82b9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -733,6 +733,16 @@ config MEMCG_KMEM depends on MEMCG && !SLOB default y
+config MEMCG_MEMFS_INFO + bool "Show memfs files that have pages charged in given memory cgroup" + depends on MEMCG + default n + help + Support to print rootfs files and tmpfs files that having pages + charged in given memory cgroup. The files infomations can be printed + through interface "memory.memfs_files_info" or printed when OOM is + triggered. + config BLK_CGROUP bool "IO controller" depends on BLOCK diff --git a/mm/Makefile b/mm/Makefile index deee05d22a853..8fba091be3868 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -108,3 +108,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o +obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o diff --git a/mm/memcg_memfs_info.c b/mm/memcg_memfs_info.c new file mode 100644 index 0000000000000..346175026cae6 --- /dev/null +++ b/mm/memcg_memfs_info.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <linux/memcg_memfs_info.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/kobject.h> +#include <linux/slab.h> +#include "../fs/mount.h" + +#define SEQ_printf(m, x...) 
\ +do { \ + if (m) \ + seq_printf(m, x); \ + else \ + pr_info(x); \ +} while (0) + +struct print_files_control { + struct mem_cgroup *memcg; + struct seq_file *m; + unsigned long size_threshold; + unsigned long max_print_files; + + char *pathbuf; + unsigned long pathbuf_size; + + const char *fs_type_name; + struct vfsmount *vfsmnt; + unsigned long total_print_files; + unsigned long total_files_size; +}; + +static bool memfs_enable; +static unsigned long memfs_size_threshold; +static unsigned long memfs_max_print_files = 500; + +static const char *const fs_type_names[] = { + "rootfs", + "tmpfs", +}; + +static struct vfsmount *memfs_get_vfsmount(struct super_block *sb) +{ + struct mount *mnt; + struct vfsmount *vfsmnt; + + lock_mount_hash(); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + /* + * There may be multiple mount points for a super_block, + * just need to print one of these mount points to determine + * the file path. + */ + vfsmnt = mntget(&mnt->mnt); + unlock_mount_hash(); + return vfsmnt; + } + unlock_mount_hash(); + + return NULL; +} + +static unsigned long memfs_count_in_mem_cgroup(struct mem_cgroup *memcg, + struct address_space *mapping) +{ + struct radix_tree_iter iter; + unsigned long size = 0; + struct page *page, *head; + void __rcu **slot; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, 0) { + page = radix_tree_deref_slot(slot); + + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + slot = radix_tree_iter_retry(&iter); + continue; + } + + head = compound_head(page); + if (memcg == head->mem_cgroup) + size += PAGE_SIZE; + } + rcu_read_unlock(); + return size; +} + +static void memfs_show_file_in_mem_cgroup(void *data, struct inode *inode) +{ + struct print_files_control *pfc = data; + struct dentry *dentry; + unsigned long size; + struct path path; + char *filepath; + + size = memfs_count_in_mem_cgroup(pfc->memcg, inode->i_mapping); + if (!size || 
size < pfc->size_threshold) + return; + + dentry = d_find_alias(inode); + if (!dentry) + return; + path.mnt = pfc->vfsmnt; + path.dentry = dentry; + filepath = d_absolute_path(&path, pfc->pathbuf, pfc->pathbuf_size); + if (!filepath || IS_ERR(filepath)) + filepath = "(too long)"; + pfc->total_print_files++; + pfc->total_files_size += size; + dput(dentry); + + /* + * To prevent excessive logs, limit the amount of data + * that can be output to logs. + */ + if (!pfc->m && pfc->total_print_files > pfc->max_print_files) + return; + + SEQ_printf(pfc->m, "%lukB %llukB %s\n", + size >> 10, inode->i_size >> 10, filepath); +} + +static void memfs_show_files_in_mem_cgroup(struct super_block *sb, void *data) +{ + struct print_files_control *pfc = data; + struct inode *inode, *toput_inode = NULL; + + if (strncmp(sb->s_type->name, + pfc->fs_type_name, strlen(pfc->fs_type_name))) + return; + + pfc->vfsmnt = memfs_get_vfsmount(sb); + if (!pfc->vfsmnt) + return; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0 && !need_resched())) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + memfs_show_file_in_mem_cgroup(pfc, inode); + + iput(toput_inode); + toput_inode = inode; + + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + iput(toput_inode); + mntput(pfc->vfsmnt); +} + +void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, struct seq_file *m) +{ + struct print_files_control pfc = { + .memcg = memcg, + .m = m, + .max_print_files = memfs_max_print_files, + .size_threshold = memfs_size_threshold, + }; + char *pathbuf; + int i; + + if (!memfs_enable || !memcg) + return; + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathbuf) { + SEQ_printf(m, "Show memfs failed due to 
OOM\n"); + return; + } + pfc.pathbuf = pathbuf; + pfc.pathbuf_size = PATH_MAX; + + for (i = 0; i < ARRAY_SIZE(fs_type_names); i++) { + pfc.fs_type_name = fs_type_names[i]; + pfc.total_print_files = 0; + pfc.total_files_size = 0; + + SEQ_printf(m, "Show %s files (memory-size > %lukB):\n", + pfc.fs_type_name, pfc.size_threshold >> 10); + SEQ_printf(m, "<memory-size> <file-size> <path>\n"); + iterate_supers(memfs_show_files_in_mem_cgroup, &pfc); + + SEQ_printf(m, "total files: %lu, total memory-size: %lukB\n", + pfc.total_print_files, pfc.total_files_size >> 10); + } + + kfree(pfc.pathbuf); +} + +int mem_cgroup_memfs_files_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + mem_cgroup_print_memfs_info(memcg, m); + return 0; +} + +static ssize_t memfs_size_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", memfs_size_threshold); +} + +static ssize_t memfs_size_threshold_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned long count; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + memfs_size_threshold = count; + return len; +} + +static struct kobj_attribute memfs_size_threshold_attr = { + .attr = {"size_threshold", 0644}, + .show = &memfs_size_threshold_show, + .store = &memfs_size_threshold_store, +}; + +static ssize_t memfs_max_print_files_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", memfs_max_print_files); +} + +static ssize_t memfs_max_print_files_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned long count; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + memfs_max_print_files = count; + return len; +} + +static struct kobj_attribute memfs_max_print_files_attr = { + .attr = {"max_print_files_in_oom", 0644}, + .show = 
&memfs_max_print_files_show, + .store = &memfs_max_print_files_store, +}; + +static ssize_t memfs_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", memfs_enable); +} + +static ssize_t memfs_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + bool enable; + int err; + + err = kstrtobool(buf, &enable); + if (err) + return err; + + memfs_enable = enable; + return len; +} + +static struct kobj_attribute memfs_enable_attr = { + .attr = {"enable", 0644}, + .show = &memfs_enable_show, + .store = &memfs_enable_store, +}; + +static struct attribute *memfs_attr[] = { + &memfs_size_threshold_attr.attr, + &memfs_max_print_files_attr.attr, + &memfs_enable_attr.attr, + NULL, +}; + +static struct attribute_group memfs_attr_group = { + .attrs = memfs_attr, +}; + +void mem_cgroup_memfs_info_init(void) +{ + struct kobject *memcg_memfs_kobj; + + if (mem_cgroup_disabled()) + return; + + memcg_memfs_kobj = kobject_create_and_add("memcg_memfs_info", mm_kobj); + if (unlikely(!memcg_memfs_kobj)) { + pr_err("failed to create memcg_memfs kobject\n"); + return; + } + + if (sysfs_create_group(memcg_memfs_kobj, &memfs_attr_group)) { + pr_err("failed to register memcg_memfs group\n"); + kobject_put(memcg_memfs_kobj); + } +} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 974fc5dc6dc81..18b5660dc2459 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,6 +66,7 @@ #include <linux/lockdep.h> #include <linux/file.h> #include <linux/tracehook.h> +#include <linux/memcg_memfs_info.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -1484,6 +1485,8 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_cont("\n"); } + + mem_cgroup_print_memfs_info(memcg, NULL); }
/* @@ -4705,6 +4708,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .write_s64 = memcg_qos_write, }, #endif +#ifdef CONFIG_MEMCG_MEMFS_INFO + { + .name = "memfs_files_info", + .seq_show = mem_cgroup_memfs_files_show, + }, +#endif #ifdef CONFIG_NUMA { .name = "numa_stat", @@ -6916,6 +6925,8 @@ static int __init mem_cgroup_init(void) soft_limit_tree.rb_tree_per_node[node] = rtpn; }
+ mem_cgroup_memfs_info_init(); + return 0; } subsys_initcall(mem_cgroup_init);