euleros inclusion
category: feature
feature: Add pages to swapcache and swap them out proactively
bugzilla: NA
CVE: NA
-------------------------------------------------
This patch introduces the etmem swap feature. etmem swap adds the target pages to the swap cache so that they can be further reclaimed by kswapd and then dwell in swap space.
Signed-off-by: yanxiaodan yanxiaodan@huawei.com
Signed-off-by: linmiaohe linmiaohe@huawei.com
Signed-off-by: louhongxiang louhongxiang@huawei.com
Signed-off-by: liubo liubo254@huawei.com
Signed-off-by: geruijun geruijun@huawei.com
Signed-off-by: liangchenshu liangchenshu@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
Signed-off-by: Yuchen Tang tangyuchen5@huawei.com
---
 fs/proc/Makefile      |   5 +-
 fs/proc/base.c        |   2 +
 fs/proc/etmem_proc.c  |  94 ++++++++++++++++++++++++++++++++++++
 fs/proc/etmem_swap.c  | 109 ++++++++++++++++++++++++++++++++++++++++++
 fs/proc/internal.h    |   1 +
 include/linux/etmem.h |  13 +++++
 mm/Kconfig            |  10 ++++
 mm/Makefile           |   1 +
 mm/etmem.c            |  64 +++++++++++++++++++++++++
 9 files changed, 297 insertions(+), 2 deletions(-)
 create mode 100644 fs/proc/etmem_swap.c
 create mode 100644 mm/etmem.c
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index b9a7bc7d8a75..fe283f354d61 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -34,5 +34,6 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o
 proc-$(CONFIG_PRINTK) += kmsg.o
 proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
 proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
-obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o
-proc-${CONFIG_ETMEM} += etmem_proc.o
+obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o
+obj-$(CONFIG_ETMEM_SWAP) += etmem_swap.o
+proc-${CONFIG_ETMEM} += etmem_proc.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 06cec4f623b5..8525117bc452 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3357,6 +3357,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 #ifdef CONFIG_ETMEM
 	REG("idle_pages", 0600, proc_mm_idle_operations),
+	REG("swap_pages", 0600, proc_mm_swap_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3709,6 +3710,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
 #ifdef CONFIG_ETMEM
 	REG("idle_pages", 0600, proc_mm_idle_operations),
+	REG("swap_pages", 0600, proc_mm_swap_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/etmem_proc.c b/fs/proc/etmem_proc.c
index edacb9260345..2e6712cc43b2 100644
--- a/fs/proc/etmem_proc.c
+++ b/fs/proc/etmem_proc.c
@@ -120,3 +120,121 @@ const struct file_operations proc_mm_idle_operations = {
 	.release	= mm_idle_release,
 	.unlocked_ioctl = mm_idle_ioctl,
 };
+
+/* Serializes updates of proc_swap_pages_operations by the swap module. */
+static DEFINE_SPINLOCK(swap_lock);
+
+/*
+ * Take (is_lock != 0) or drop (is_lock == 0) swap_lock.  Published as the
+ * .flock hook below so the swap module can lock the table it fills in;
+ * @file and @flock are unused.
+ */
+static int page_swap_lock(struct file *file, int is_lock, struct file_lock *flock)
+{
+	if (is_lock)
+		spin_lock(&swap_lock);
+	else
+		spin_unlock(&swap_lock);
+
+	return 0;
+}
+
+/* Callbacks supplied by the etmem swap module; only .flock is set here. */
+struct file_operations proc_swap_pages_operations = {
+	.flock = page_swap_lock,
+};
+EXPORT_SYMBOL_GPL(proc_swap_pages_operations);
+
+/* Forward write() to the swap module's handler, if one is registered. */
+static ssize_t mm_swap_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	if (proc_swap_pages_operations.write)
+		return proc_swap_pages_operations.write(file, buf, count, ppos);
+
+	return -1;
+}
+
+/*
+ * Open handler of the "swap_pages" proc file.  Requires CAP_SYS_ADMIN and a
+ * registered swap module: pins the module, stores the target mm in
+ * file->private_data, then calls the module's open hook.
+ * Returns 0 on success or a negative errno.
+ */
+static int mm_swap_open(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = NULL;
+	struct module *module = NULL;
+	int ret = -1;
+
+	if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	page_swap_lock(NULL, 1, NULL);
+	module = proc_swap_pages_operations.owner;
+	if (module != NULL && try_module_get(module))
+		ret = 0;
+	page_swap_lock(NULL, 0, NULL);
+	if (ret != 0) {
+		/* no swap ko installed, avoid to return valid file */
+		return -ENODEV;
+	}
+
+	mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR(mm)) {
+		module_put(module);
+		return PTR_ERR(mm);
+	}
+
+	file->private_data = mm;
+
+	if (proc_swap_pages_operations.open)
+		ret = proc_swap_pages_operations.open(inode, file);
+
+	if (ret != 0) {
+		/*
+		 * mm_swap_release() will not run for a failed open, so drop
+		 * the mm reference it would have dropped, not just the module.
+		 */
+		file->private_data = NULL;
+		if (mm)
+			mmdrop(mm);
+		module_put(module);
+	}
+
+	return ret;
+}
+
+/* Undo mm_swap_open(): drop the mm, call the module's release, unpin it. */
+static int mm_swap_release(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = file->private_data;
+	int ret = 0;
+
+	if (mm)
+		mmdrop(mm);
+
+	if (proc_swap_pages_operations.release)
+		ret = proc_swap_pages_operations.release(inode, file);
+
+	if (proc_swap_pages_operations.owner)
+		module_put(proc_swap_pages_operations.owner);
+
+	return ret;
+}
+
+/* Forward ioctl() to the swap module; a missing handler reports success. */
+static long mm_swap_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	if (proc_swap_pages_operations.unlocked_ioctl)
+		return proc_swap_pages_operations.unlocked_ioctl(filp, cmd, arg);
+	return 0;
+}
+
+/* File operations behind the per-task "swap_pages" proc entry. */
+const struct file_operations proc_mm_swap_operations = {
+	.llseek = mem_lseek,
+	.write = mm_swap_write,
+	.open = mm_swap_open,
+	.release = mm_swap_release,
+	.unlocked_ioctl = mm_swap_ioctl,
+};
diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c
new file mode 100644
index 000000000000..4aad6b9db9a6
--- /dev/null
+++ b/fs/proc/etmem_swap.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/proc_fs.h>
+#include <linux/sched/mm.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/mempolicy.h>
+#include <linux/uaccess.h>
+#include <linux/delay.h>
+#include <linux/etmem.h>
+
+/*
+ * write() handler: @buf holds newline-separated hexadecimal virtual
+ * addresses in the mm stored in file->private_data.  Every address that
+ * resolves to a page is queued and the queued pages are handed to
+ * reclaim_pages(); unparsable or unresolvable entries are skipped.
+ * Returns @count, -ESRCH if the mm is gone, or the memdup_user_nul() error.
+ */
+static ssize_t swap_pages_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char *p, *data, *data_ptr_res;
+	unsigned long vaddr;
+	struct mm_struct *mm = file->private_data;
+	struct page *page;
+	LIST_HEAD(pagelist);
+	ssize_t ret = 0;
+
+	if (!mm || !mmget_not_zero(mm)) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	data = memdup_user_nul(buf, count);
+	if (IS_ERR(data)) {
+		ret = PTR_ERR(data);
+		goto out_mm;
+	}
+
+	/* strsep() advances data, so remember the allocation for kfree() */
+	data_ptr_res = data;
+	while ((p = strsep(&data, "\n")) != NULL) {
+		if (!*p)
+			continue;
+
+		if (kstrtoul(p, 16, &vaddr) != 0)
+			continue;
+
+		/* If get page struct failed, ignore it, get next page */
+		page = get_page_from_vaddr(mm, vaddr);
+		if (!page)
+			continue;
+
+		add_page_for_swap(page, &pagelist);
+	}
+
+	if (!list_empty(&pagelist))
+		reclaim_pages(&pagelist);
+
+	ret = count;
+	kfree(data_ptr_res);
+out_mm:
+	mmput(mm);
+out:
+	return ret;
+}
+
+/* Module open hook: pin this module while the proc file is open. */
+static int swap_pages_open(struct inode *inode, struct file *file)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -EBUSY;
+
+	return 0;
+}
+
+/* Module release hook: drop the reference taken in swap_pages_open(). */
+static int swap_pages_release(struct inode *inode, struct file *file)
+{
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+extern struct file_operations proc_swap_pages_operations;
+
+/* Module init: register this module's callbacks under the table's lock. */
+static int swap_pages_entry(void)
+{
+	proc_swap_pages_operations.flock(NULL, 1, NULL);
+	proc_swap_pages_operations.owner = THIS_MODULE;
+	proc_swap_pages_operations.write = swap_pages_write;
+	proc_swap_pages_operations.open = swap_pages_open;
+	proc_swap_pages_operations.release = swap_pages_release;
+	proc_swap_pages_operations.flock(NULL, 0, NULL);
+
+	return 0;
+}
+
+/* Module exit: unregister the callbacks under the table's lock. */
+static void swap_pages_exit(void)
+{
+	proc_swap_pages_operations.flock(NULL, 1, NULL);
+	proc_swap_pages_operations.owner = NULL;
+	proc_swap_pages_operations.write = NULL;
+	proc_swap_pages_operations.open = NULL;
+	proc_swap_pages_operations.release = NULL;
+	proc_swap_pages_operations.flock(NULL, 0, NULL);
+}
+
+MODULE_LICENSE("GPL");
+module_init(swap_pages_entry);
+module_exit(swap_pages_exit);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a9615455b709..be6d5dfc330c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -305,6 +305,7 @@ extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
 #ifdef CONFIG_ETMEM
 extern const struct file_operations proc_mm_idle_operations;
+extern const struct file_operations proc_mm_swap_operations;
 #endif
 
 extern unsigned long task_vsize(struct mm_struct *);
diff --git a/include/linux/etmem.h b/include/linux/etmem.h
index e8a2585f3891..5ebd1c3274b7 100644
--- a/include/linux/etmem.h
+++ b/include/linux/etmem.h
@@ -22,6 +22,19 @@ static inline struct kvm *mm_kvm(struct mm_struct *mm)
 }
 #endif
 
+extern int add_page_for_swap(struct page *page, struct list_head *pagelist);
+extern struct page *get_page_from_vaddr(struct mm_struct *mm,
+					unsigned long vaddr);
+#else /* !CONFIG_ETMEM */
+static inline int add_page_for_swap(struct page *page, struct list_head *pagelist)
+{
+	return 0;
+}
 
+static inline struct page *get_page_from_vaddr(struct mm_struct *mm,
+					       unsigned long vaddr)
+{
+	return NULL;
+}
 #endif /* #ifdef CONFIG_ETMEM */
 #endif /* define __MM_ETMEM_H_ */
diff --git a/mm/Kconfig b/mm/Kconfig
index c95f551109f9..65c99bff21ad 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -914,6 +914,16 @@ config ETMEM_SCAN
 	  scan results back to user space. etmem scan also supports virtual
 	  machines.
 
+config ETMEM_SWAP
+	tristate "module: etmem page swap for etmem support"
+	depends on ETMEM
+	help
+	  etmem swap is a critical component of the etmem feature.
+	  When using etmem slide engine, etmem_swap.ko will add appointed pages
+	  (ideally all of which are rarely used, "cold" pages) to swapcache
+	  proactively, which will later be reclaimed and added to swap space,
+	  making room for more frequently used, "hot" pages.
+
 config ETMEM
 	bool "Enable etmem feature"
 	depends on MMU
diff --git a/mm/Makefile b/mm/Makefile
index 6759053ed782..3d15ba814dd7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -142,5 +142,6 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
 obj-$(CONFIG_SHARE_POOL) += share_pool.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
+obj-$(CONFIG_ETMEM) += etmem.o
 obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
diff --git a/mm/etmem.c b/mm/etmem.c
new file mode 100644
index 000000000000..9a89bfcc1058
--- /dev/null
+++ b/mm/etmem.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/memcontrol.h>
+#include <linux/gfp.h>
+#include <linux/mm_inline.h>
+#include <linux/sysctl.h>
+#include <linux/etmem.h>
+#include "internal.h"
+
+/*
+ * Isolate @page from the LRU and queue it on @pagelist for reclaim.
+ *
+ * The caller passes in a page reference (get_page_from_vaddr() takes one
+ * with FOLL_GET); it is dropped on every path, including the rejections.
+ * Returns 0 on success, -EACCES for a page mapped more than once or a
+ * hugetlb page, and -EBUSY if the page could not be isolated.
+ */
+int add_page_for_swap(struct page *page, struct list_head *pagelist)
+{
+	int err = -EBUSY;
+	struct page *head;
+
+	/* If the page is mapped by more than one process, do not swap it */
+	if (page_mapcount(page) > 1 || PageHuge(page)) {
+		err = -EACCES;
+		goto out_put;
+	}
+
+	head = compound_head(page);
+	if (!folio_isolate_lru(page_folio(head)))
+		goto out_put;
+
+	if (PageUnevictable(page))
+		putback_lru_page(page);
+	else
+		list_add_tail(&head->lru, pagelist);
+
+	err = 0;
+out_put:
+	put_page(page);
+	return err;
+}
+EXPORT_SYMBOL_GPL(add_page_for_swap);
+
+/*
+ * Look up the page mapped at @vaddr in @mm, under the mmap read lock.
+ *
+ * Returns the page with a reference held (FOLL_GET), or NULL when no VMA
+ * covers @vaddr, the VMA is VM_LOCKED, or follow_page() yields no page.
+ */
+struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr)
+{
+	struct page *page = NULL;
+	struct vm_area_struct *vma;
+
+	mmap_read_lock(mm);
+
+	vma = find_vma(mm, vaddr);
+	if (!vma || vaddr < vma->vm_start || (vma->vm_flags & VM_LOCKED))
+		goto out_unlock;
+
+	page = follow_page(vma, vaddr, FOLL_GET | FOLL_DUMP);
+	if (IS_ERR(page))
+		page = NULL;
+
+out_unlock:
+	mmap_read_unlock(mm);
+	return page;
+}
+EXPORT_SYMBOL_GPL(get_page_from_vaddr);