From: liubo <liubo254@huawei.com>
euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4QVXW
CVE: NA
-------------------------------------------------
etmem, the memory vertical expansion technology, uses DRAM together with new high-performance storage media to form multi-level memory storage. By grading the stored data, etmem migrates the classified cold data out of DRAM to the high-performance storage medium, so as to expand memory capacity and reduce memory cost.

When the memory expansion function etmem is running, the kernel's native swap needs to be disabled in certain scenarios to avoid interference from kernel swap.

This patch provides that capability.
The /sys/kernel/mm/swap/ directory provides a kernel_swap_enable sysfs interface to enable or disable the kernel's native swap.
/sys/kernel/mm/swap/kernel_swap_enable defaults to true, that is, kernel swap is enabled by default.
Turn on kernel swap: echo true > /sys/kernel/mm/swap/kernel_swap_enable
Turn off kernel swap: echo false > /sys/kernel/mm/swap/kernel_swap_enable
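For completeness, a minimal C sketch of the same toggle, as a management daemon might use it (only the sysfs path above is taken from this patch; the helper name is illustrative):

#include <stdio.h>

/* Write "true" or "false" to the kernel_swap_enable switch; 0 on success, -1 on failure. */
static int set_kernel_swap(const char *val)
{
    FILE *f = fopen("/sys/kernel/mm/swap/kernel_swap_enable", "w");

    if (!f)
        return -1;
    if (fputs(val, f) == EOF) {
        fclose(f);
        return -1;
    }
    return fclose(f) == 0 ? 0 : -1;
}

/* e.g. set_kernel_swap("false") before an etmem migration window, set_kernel_swap("true") afterwards. */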
Signed-off-by: liubo <liubo254@huawei.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/swap.h |  1 +
 mm/swap_state.c      | 29 +++++++++++++++++++++++++++++
 mm/vmscan.c          | 18 ++++++++++++++++++
 3 files changed, 48 insertions(+)
diff --git a/include/linux/swap.h b/include/linux/swap.h index b7cfad35987a2..23549741336a4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -476,6 +476,7 @@ extern struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); extern struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); +extern bool kernel_swap_enabled(void);
/* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; diff --git a/mm/swap_state.c b/mm/swap_state.c index 2137e2d571965..1527ac72928b6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -40,6 +40,7 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; +static bool enable_kernel_swap __read_mostly = true;
#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -326,6 +327,11 @@ static inline bool swap_use_vma_readahead(void) return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); }
+bool kernel_swap_enabled(void) +{ + return READ_ONCE(enable_kernel_swap); +} + /* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel @@ -828,8 +834,31 @@ static struct kobj_attribute vma_ra_enabled_attr = __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, vma_ra_enabled_store);
+static ssize_t kernel_swap_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", enable_kernel_swap ? "true" : "false"); +} +static ssize_t kernel_swap_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + WRITE_ONCE(enable_kernel_swap, true); + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + WRITE_ONCE(enable_kernel_swap, false); + else + return -EINVAL; + + return count; +} +static struct kobj_attribute kernel_swap_enable_attr = + __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, + kernel_swap_enable_store); + static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, + &kernel_swap_enable_attr.attr, NULL, };
diff --git a/mm/vmscan.c b/mm/vmscan.c index e8befe70d2800..2676d6cf2ccac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3248,6 +3248,16 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, return false; }
+/* + * Check if original kernel swap is enabled + * turn off kernel swap,but leave page cache reclaim on + */ +static inline void kernel_swap_check(struct scan_control *sc) +{ + if (sc != NULL && !kernel_swap_enabled()) + sc->may_swap = 0; +} + unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { @@ -3264,6 +3274,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_swap = 1, };
+ kernel_swap_check(&sc); /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. @@ -3548,6 +3559,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
count_vm_event(PAGEOUTRUN);
+ kernel_swap_check(&sc); + #ifdef CONFIG_SHRINK_PAGECACHE if (vm_cache_limit_mbytes && page_cache_over_limit()) shrink_page_cache(GFP_KERNEL); @@ -3963,6 +3976,8 @@ static unsigned long __shrink_page_cache(gfp_t mask)
struct zonelist *zonelist = node_zonelist(numa_node_id(), mask);
+ kernel_swap_check(&sc); + return do_try_to_free_pages(zonelist, &sc); }
@@ -4282,6 +4297,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
cond_resched(); fs_reclaim_acquire(sc.gfp_mask); + + kernel_swap_check(&sc); + /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE
From: liubo <liubo254@huawei.com>
euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4QVXW
CVE: NA
-------------------------------------------------

etmem, the memory vertical expansion technology, uses DRAM together with new high-performance storage media to form multi-level memory storage. By grading the stored data, etmem migrates the classified cold data out of DRAM to the high-performance storage medium, so as to expand memory capacity and reduce memory cost.
In the current etmem process, memory page swapping is implemented by invoking shrink_page_list. When this interface is invoked for the first time, pages are added to the swap cache and written to disk. The swap cache page is reclaimed only when the interface is invoked a second time and no process has accessed the page in the meantime. However, in the etmem process, the user-mode tool scans for pages that have been accessed, and no further migration is delivered for pages that are not accessed by any process. Therefore, the swap cache may remain occupied indefinitely.
To solve this problem, add logic for actively reclaiming the swap cache. When the swap cache occupies a large amount of memory, the kernel proactively scans the LRU lists and reclaims swap cache pages to keep memory usage within the specified range.
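As a rough illustration of how a user-mode tool might drive this reclaim path, the sketch below uses the ioctl commands defined later in this patch (RECLAIM_SWAPCACHE_MAGIC 0x77, SET_SWAPCACHE_WMARK, RECLAIM_SWAPCACHE_ON). The /proc/<pid>/swap_pages path, the placeholder pid, and the open mode are assumptions about how the etmem swap file is exposed, not something this patch guarantees:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

/* ioctl numbers copied from the etmem_swap.c hunk below */
#define RECLAIM_SWAPCACHE_MAGIC 0x77
#define RECLAIM_SWAPCACHE_ON    _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x01, unsigned int)
#define SET_SWAPCACHE_WMARK     _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x02, unsigned int)

int main(void)
{
    /* Path and pid are placeholders for the per-process swap file used by etmem. */
    int fd = open("/proc/1234/swap_pages", O_WRONLY);
    /* Low watermark 30% of RAM in bits 0-7, high watermark 50% in bits 8-15. */
    unsigned int ratio = 30 | (50 << 8);

    if (fd < 0)
        return 1;
    if (ioctl(fd, SET_SWAPCACHE_WMARK, &ratio) < 0)
        perror("SET_SWAPCACHE_WMARK");
    /* Wake the background thread; it scans the LRU lists and reclaims swap cache
     * down toward the low watermark once usage exceeds the high watermark. */
    if (ioctl(fd, RECLAIM_SWAPCACHE_ON, 0) < 0)
        perror("RECLAIM_SWAPCACHE_ON");
    close(fd);
    return 0;
}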
Signed-off-by: liubo <liubo254@huawei.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/proc/etmem_scan.c |  11 +-
 fs/proc/etmem_swap.c | 195 +++++++++++++++++++++++++++--
 fs/proc/task_mmu.c   |  75 +++++++++++-
 include/linux/list.h |  15 +++
 include/linux/swap.h |   9 +-
 mm/swap_state.c      |   1 +
 mm/vmscan.c          | 284 ++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 575 insertions(+), 15 deletions(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 2bac5ecd53164..1650208bad4c8 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -1029,17 +1029,24 @@ extern struct file_operations proc_page_scan_operations;
static int page_scan_entry(void) { + proc_page_scan_operations.flock(NULL, 1, NULL); proc_page_scan_operations.owner = THIS_MODULE; proc_page_scan_operations.read = page_scan_read; proc_page_scan_operations.open = page_scan_open; proc_page_scan_operations.release = page_scan_release; + proc_page_scan_operations.flock(NULL, 0, NULL); + return 0; }
static void page_scan_exit(void) { - memset(&proc_page_scan_operations, 0, - sizeof(proc_page_scan_operations)); + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = NULL; + proc_page_scan_operations.read = NULL; + proc_page_scan_operations.open = NULL; + proc_page_scan_operations.release = NULL; + proc_page_scan_operations.flock(NULL, 0, NULL); }
MODULE_LICENSE("GPL"); diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c index b24c706c3b2a3..aef9f9952848c 100644 --- a/fs/proc/etmem_swap.c +++ b/fs/proc/etmem_swap.c @@ -10,6 +10,24 @@ #include <linux/mempolicy.h> #include <linux/uaccess.h> #include <linux/delay.h> +#include <linux/numa.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/mm_inline.h> + +#define RECLAIM_SWAPCACHE_MAGIC 0X77 +#define SET_SWAPCACHE_WMARK _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x02, unsigned int) +#define RECLAIM_SWAPCACHE_ON _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x01, unsigned int) +#define RECLAIM_SWAPCACHE_OFF _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x00, unsigned int) + +#define WATERMARK_MAX 100 +#define SWAP_SCAN_NUM_MAX 32 + +static struct task_struct *reclaim_swapcache_tk; +static bool enable_swapcache_reclaim; +static unsigned long swapcache_watermark[ETMEM_SWAPCACHE_NR_WMARK]; + +static DECLARE_WAIT_QUEUE_HEAD(reclaim_queue);
static ssize_t swap_pages_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -45,7 +63,7 @@ static ssize_t swap_pages_write(struct file *file, const char __user *buf, ret = kstrtoul(p, 16, &vaddr); if (ret != 0) continue; - /*If get page struct failed, ignore it, get next page*/ + /* If get page struct failed, ignore it, get next page */ page = get_page_from_vaddr(mm, vaddr); if (!page) continue; @@ -78,23 +96,184 @@ static int swap_pages_release(struct inode *inode, struct file *file) return 0; }
+/* check if swapcache meet requirements */ +static bool swapcache_balanced(void) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH]; +} + +/* the flag present if swapcache reclaim is started */ +static bool swapcache_reclaim_enabled(void) +{ + return READ_ONCE(enable_swapcache_reclaim); +} + +static void start_swapcache_reclaim(void) +{ + if (swapcache_balanced()) + return; + /* RECLAIM_SWAPCACHE_ON trigger the thread to start running. */ + if (!waitqueue_active(&reclaim_queue)) + return; + + WRITE_ONCE(enable_swapcache_reclaim, true); + wake_up_interruptible(&reclaim_queue); +} + +static void stop_swapcache_reclaim(void) +{ + WRITE_ONCE(enable_swapcache_reclaim, false); +} + +static bool should_goto_sleep(void) +{ + if (swapcache_balanced()) + stop_swapcache_reclaim(); + + if (swapcache_reclaim_enabled()) + return false; + + return true; +} + +static int get_swapcache_watermark(unsigned int ratio) +{ + unsigned int low_watermark; + unsigned int high_watermark; + + low_watermark = ratio & 0xFF; + high_watermark = (ratio >> 8) & 0xFF; + if (low_watermark > WATERMARK_MAX || + high_watermark > WATERMARK_MAX || + low_watermark > high_watermark) + return -EPERM; + + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] = totalram_pages * + low_watermark / WATERMARK_MAX; + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH] = totalram_pages * + high_watermark / WATERMARK_MAX; + + return 0; +}
extern struct file_operations proc_swap_pages_operations;
+static void reclaim_swapcache_try_to_sleep(void) +{ + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&reclaim_queue, &wait, TASK_INTERRUPTIBLE); + if (should_goto_sleep()) { + if (!kthread_should_stop()) + schedule(); + } + finish_wait(&reclaim_queue, &wait); +} + +static void etmem_reclaim_swapcache(void) +{ + do_swapcache_reclaim(swapcache_watermark, + ARRAY_SIZE(swapcache_watermark)); + stop_swapcache_reclaim(); +} + +static int reclaim_swapcache_proactive(void *para) +{ + set_freezable(); + + while (1) { + bool ret; + + reclaim_swapcache_try_to_sleep(); + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + if (ret) + continue; + + etmem_reclaim_swapcache(); + } + + return 0; +} + +static int reclaim_swapcache_run(void) +{ + int ret = 0; + + reclaim_swapcache_tk = kthread_run(reclaim_swapcache_proactive, NULL, + "etmem_recalim_swapcache"); + if (IS_ERR(reclaim_swapcache_tk)) { + ret = PTR_ERR(reclaim_swapcache_tk); + reclaim_swapcache_tk = NULL; + } + return ret; +} + +static long swap_page_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int ratio; + + switch (cmd) { + case RECLAIM_SWAPCACHE_ON: + if (swapcache_reclaim_enabled()) + return 0; + start_swapcache_reclaim(); + break; + case RECLAIM_SWAPCACHE_OFF: + stop_swapcache_reclaim(); + break; + case SET_SWAPCACHE_WMARK: + if (get_user(ratio, (unsigned int __user *)argp)) + return -EFAULT; + + if (get_swapcache_watermark(ratio) != 0) + return -EFAULT; + break; + default: + return -EPERM; + } + + return 0; +} + static int swap_pages_entry(void) { - proc_swap_pages_operations.owner = THIS_MODULE; - proc_swap_pages_operations.write = swap_pages_write; - proc_swap_pages_operations.open = swap_pages_open; - proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = THIS_MODULE; + proc_swap_pages_operations.write = swap_pages_write; + proc_swap_pages_operations.open = swap_pages_open; + proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.unlocked_ioctl = swap_page_ioctl; + proc_swap_pages_operations.flock(NULL, 0, NULL);
- return 0; + enable_swapcache_reclaim = false; + reclaim_swapcache_run(); + + return 0; }
static void swap_pages_exit(void) { - memset(&proc_swap_pages_operations, 0, - sizeof(proc_swap_pages_operations)); + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = NULL; + proc_swap_pages_operations.write = NULL; + proc_swap_pages_operations.open = NULL; + proc_swap_pages_operations.release = NULL; + proc_swap_pages_operations.unlocked_ioctl = NULL; + proc_swap_pages_operations.flock(NULL, 0, NULL); + + if (!IS_ERR(reclaim_swapcache_tk)) { + kthread_stop(reclaim_swapcache_tk); + reclaim_swapcache_tk = NULL; + } + return; }
MODULE_LICENSE("GPL"); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8b8129d658e04..4b324d102f512 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -19,6 +19,7 @@ #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> +#include <linux/module.h>
#include <asm/elf.h> #include <asm/tlb.h> @@ -1741,8 +1742,21 @@ const struct file_operations proc_pagemap_operations = { .release = pagemap_release, };
+static DEFINE_SPINLOCK(scan_lock); + +static int page_scan_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&scan_lock); + else + spin_unlock(&scan_lock); + + return 0; +} + /* will be filled when kvm_ept_idle module loads */ struct file_operations proc_page_scan_operations = { + .flock = page_scan_lock, }; EXPORT_SYMBOL_GPL(proc_page_scan_operations);
@@ -1766,10 +1780,23 @@ static ssize_t mm_idle_read(struct file *file, char __user *buf, static int mm_idle_open(struct inode *inode, struct file *file) { struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1;
if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM;
+ page_scan_lock(NULL, 1, NULL); + module = proc_page_scan_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + + page_scan_lock(NULL, 0, NULL); + if (ret != 0) { + /* no scan ko installed, avoid to return valid file */ + return -ENODEV; + } + mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(mm)) return PTR_ERR(mm); @@ -1785,6 +1812,7 @@ static int mm_idle_open(struct inode *inode, struct file *file) static int mm_idle_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; + int ret = 0;
if (mm) { if (!mm_kvm(mm)) @@ -1793,9 +1821,12 @@ static int mm_idle_release(struct inode *inode, struct file *file) }
if (proc_page_scan_operations.release) - return proc_page_scan_operations.release(inode, file); + ret = proc_page_scan_operations.release(inode, file);
- return 0; + if (proc_page_scan_operations.owner) + module_put(proc_page_scan_operations.owner); + + return ret; }
const struct file_operations proc_mm_idle_operations = { @@ -1805,8 +1836,21 @@ const struct file_operations proc_mm_idle_operations = { .release = mm_idle_release, };
+static DEFINE_SPINLOCK(swap_lock); + +static int page_swap_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&swap_lock); + else + spin_unlock(&swap_lock); + + return 0; +} + /*swap pages*/ struct file_operations proc_swap_pages_operations = { + .flock = page_swap_lock, }; EXPORT_SYMBOL_GPL(proc_swap_pages_operations);
@@ -1822,10 +1866,23 @@ static ssize_t mm_swap_write(struct file *file, const char __user *buf, static int mm_swap_open(struct inode *inode, struct file *file) { struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1;
if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM;
+ page_swap_lock(NULL, 1, NULL); + module = proc_swap_pages_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + + page_swap_lock(NULL, 0, NULL); + if (ret != 0) { + /* no swap ko installed, avoid to return valid file */ + return -ENODEV; + } + mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(mm)) return PTR_ERR(mm); @@ -1841,13 +1898,23 @@ static int mm_swap_open(struct inode *inode, struct file *file) static int mm_swap_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; + int ret = 0;
if (mm) mmdrop(mm);
if (proc_swap_pages_operations.release) - return proc_swap_pages_operations.release(inode, file); + ret = proc_swap_pages_operations.release(inode, file);
+ if (proc_swap_pages_operations.owner) + module_put(proc_swap_pages_operations.owner); + return ret; +} + +static long mm_swap_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_swap_pages_operations.unlocked_ioctl) + return proc_swap_pages_operations.unlocked_ioctl(filp, cmd, arg); return 0; }
@@ -1856,7 +1923,9 @@ const struct file_operations proc_mm_swap_operations = { .write = mm_swap_write, .open = mm_swap_open, .release = mm_swap_release, + .unlocked_ioctl = mm_swap_ioctl, }; + #endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA diff --git a/include/linux/list.h b/include/linux/list.h index fc0e87f94d286..1c3fed7bd684f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -640,6 +640,21 @@ static inline void list_splice_tail_init(struct list_head *list, &pos->member != (head); \ pos = n, n = list_prev_entry(n, member))
+/** + * list_for_each_entry_safe_reverse_from - iterate backwards over list from current point safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type from current point, safe against + * removal of list entry. + */ +#define list_for_each_entry_safe_reverse_from(pos, n, head, member) \ + for (n = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_prev_entry(n, member)) + /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop diff --git a/include/linux/swap.h b/include/linux/swap.h index 23549741336a4..959a2e3811f28 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -31,6 +31,12 @@ struct bio; SWAP_FLAG_DISCARD_PAGES) #define SWAP_BATCH 64
+enum etmem_swapcache_watermark_en { + ETMEM_SWAPCACHE_WMARK_LOW, + ETMEM_SWAPCACHE_WMARK_HIGH, + ETMEM_SWAPCACHE_NR_WMARK +}; + static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; @@ -388,7 +394,8 @@ extern unsigned long reclaim_pages(struct list_head *page_list); extern int add_page_for_swap(struct page *page, struct list_head *pagelist); extern struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr); - +extern int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr); #ifdef CONFIG_SHRINK_PAGECACHE extern unsigned long vm_cache_limit_ratio; extern unsigned long vm_cache_limit_ratio_min; diff --git a/mm/swap_state.c b/mm/swap_state.c index 1527ac72928b6..493d30b02a3fe 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -95,6 +95,7 @@ unsigned long total_swapcache_pages(void) } return ret; } +EXPORT_SYMBOL_GPL(total_swapcache_pages);
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
diff --git a/mm/vmscan.c b/mm/vmscan.c index 2676d6cf2ccac..67f72f4d9daef 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4517,7 +4517,7 @@ int add_page_for_swap(struct page *page, struct list_head *pagelist) int err = -EBUSY; struct page *head;
- /*If the page is mapped by more than one process, do not swap it */ + /* If the page is mapped by more than one process, do not swap it */ if (page_mapcount(page) > 1) return -EACCES;
@@ -4565,3 +4565,285 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) return page; } EXPORT_SYMBOL_GPL(get_page_from_vaddr); + +static int add_page_for_reclaim_swapcache(struct page *page, + struct list_head *pagelist, struct lruvec *lruvec, enum lru_list lru) +{ + struct list_head *src = &lruvec->lists[lru]; + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EACCES; + + if (PageHuge(page)) + return -EACCES; + + head = compound_head(page); + + switch (__isolate_lru_page(head, 0)) { + case 0: + list_move(&head->lru, pagelist); + update_lru_size(lruvec, lru, page_zonenum(head), -hpage_nr_pages(head)); + break; + case -EBUSY: + list_move(&head->lru, src); + return -1; + default: + break; + } + + return 0; +} + +static unsigned long reclaim_swapcache_pages_from_list(int nid, + struct list_head *page_list, unsigned long reclaim_num, bool putback_flag) +{ + struct scan_control sc = { + .may_unmap = 1, + .may_swap = 1, + .may_writepage = 1, + .gfp_mask = GFP_KERNEL, + }; + unsigned long nr_reclaimed = 0; + unsigned long nr_moved = 0; + struct page *page, *next; + LIST_HEAD(swap_pages); + struct pglist_data *pgdat = NULL; + + pgdat = NODE_DATA(nid); + + if (putback_flag) + goto putback_list; + + if (reclaim_num == 0) + return 0; + + list_for_each_entry_safe(page, next, page_list, lru) { + if (!page_is_file_cache(page) && !__PageMovable(page) + && PageSwapCache(page)) { + ClearPageActive(page); + list_move(&page->lru, &swap_pages); + nr_moved++; + } + + if (nr_moved >= reclaim_num) + break; + } + + /* swap the pages */ + if (pgdat) + nr_reclaimed = shrink_page_list(&swap_pages, + pgdat, + &sc, + TTU_IGNORE_ACCESS, + NULL, true); + + while (!list_empty(&swap_pages)) { + page = lru_to_page(&swap_pages); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; + +putback_list: + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; +} + +#define SWAP_SCAN_NUM_MAX 32 + +static bool swapcache_below_watermark(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]; +} + +static unsigned long get_swapcache_reclaim_num(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() > + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ? + (total_swapcache_pages() - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0; +} + +/* + * The main function to reclaim swapcache, the whole reclaim process is + * divided into 3 steps. + * 1. get the total_swapcache_pages num to reclaim. + * 2. scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. + * 3. reclaim the swapcache page until the requirements are meet. 
+ */ +int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + int err = -EINVAL; + unsigned long swapcache_to_reclaim = 0; + unsigned long nr_reclaimed = 0; + unsigned long nr[MAX_NUMNODES] = {0}; + unsigned long nr_to_reclaim[MAX_NUMNODES] = {0}; + unsigned long swapcache_total_reclaimable = 0; + unsigned long reclaim_page_count = 0; + + struct list_head swapcache_list[MAX_NUMNODES]; + + int nid = 0; + struct lruvec *lruvec = NULL; + struct list_head *src = NULL; + struct page *page = NULL; + struct page *next = NULL; + struct page *pos = NULL; + + s8 priority; + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *root = NULL; + struct mem_cgroup_reclaim_cookie reclaim; + + pg_data_t *pgdat = NULL; + unsigned int scan_count = 0; + int nid_num = 0; + + if (swapcache_watermark == NULL || + watermark_nr < ETMEM_SWAPCACHE_NR_WMARK) + return err; + + /* get the total_swapcache_pages num to reclaim. */ + swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark); + if (swapcache_to_reclaim <= 0) + return err; + + /* + * scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. + */ + for_each_node_state(nid, N_MEMORY) { + INIT_LIST_HEAD(&swapcache_list[nid_num]); + cond_resched(); + + pgdat = NODE_DATA(nid); + priority = 0; + reclaim.pgdat = pgdat; + reclaim.priority = priority; + + root = NULL; + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + cond_resched(); + pos = NULL; + lruvec = mem_cgroup_lruvec(pgdat, memcg); + src = &(lruvec->lists[LRU_INACTIVE_ANON]); + spin_lock_irq(&pgdat->lru_lock); + scan_count = 0; + + /* + * Scan the swapcache pages that are not mapped from + * the end of the LRU linked list, scan SWAP_SCAN_NUM_MAX + * pages each time, and record the scan end point page. + */ + + pos = list_last_entry(src, struct page, lru); + spin_unlock_irq(&pgdat->lru_lock); +do_scan: + cond_resched(); + scan_count = 0; + spin_lock_irq(&pgdat->lru_lock); + + /* + * check if pos page is been released or not in LRU list, if true, + * cancel the subsequent page scanning of the current node. + */ + if (!pos) { + spin_unlock_irq(&pgdat->lru_lock); + continue; + } + + if (!PageLRU(pos) || page_lru(pos) != LRU_INACTIVE_ANON) { + spin_unlock_irq(&pgdat->lru_lock); + continue; + } + + page = pos; + pos = NULL; + /* Continue to scan down from the last scan breakpoint */ + list_for_each_entry_safe_reverse_from(page, next, src, lru) { + scan_count++; + pos = next; + if (scan_count >= SWAP_SCAN_NUM_MAX) + break; + + if (!PageSwapCache(page)) + continue; + + if (page_mapped(page)) + continue; + + if (add_page_for_reclaim_swapcache(page, + &swapcache_list[nid_num], + lruvec, LRU_INACTIVE_ANON) != 0) + continue; + + nr[nid_num]++; + swapcache_total_reclaimable++; + } + spin_unlock_irq(&pgdat->lru_lock); + + /* + * Check whether the scanned pages meet + * the reclaim requirements. + */ + if (swapcache_total_reclaimable <= swapcache_to_reclaim || + scan_count >= SWAP_SCAN_NUM_MAX) + goto do_scan; + + } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); + + /* Start reclaiming the next memory node. */ + nid_num++; + } + + /* reclaim the swapcache page until the requirements are meet. */ + do { + nid_num = 0; + reclaim_page_count = 0; + + /* start swapcache page reclaim for each node. 
*/ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + + nr_to_reclaim[nid_num] = (swapcache_to_reclaim / (swapcache_total_reclaimable / nr[nid_num])); + reclaim_page_count += reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], nr_to_reclaim[nid_num], false); + nid_num++; + } + + nr_reclaimed += reclaim_page_count; + + /* + * Check whether the swapcache page reaches the reclaim requirement or + * the number of the swapcache page reclaimd is 0. Stop reclaim. + */ + if (nr_reclaimed >= swapcache_to_reclaim || reclaim_page_count == 0) + goto exit; + } while (!swapcache_below_watermark(swapcache_watermark) || + nr_reclaimed < swapcache_to_reclaim); +exit: + nid_num = 0; + /* + * Repopulate the swapcache pages that are not reclaimd back + * to the LRU linked list. + */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], 0, true); + nid_num++; + } + + return 0; +} +EXPORT_SYMBOL_GPL(do_swapcache_reclaim);
From: liubo <liubo254@huawei.com>
euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4QVXW
CVE: NA
-------------------------------------------------

etmem, the memory vertical expansion technology, uses DRAM together with new high-performance storage media to form multi-level memory storage. By grading the stored data, etmem migrates the classified cold data out of DRAM to the high-performance storage medium, so as to expand memory capacity and reduce memory cost.
By default, the existing memory expansion tool etmem swaps out every page of a process that can be swapped out, unless the page is marked with the lock flag.
This patch adds the ability to swap out only specified pages: the process marks the regions whose pages should be swapped out with the VM_SWAPFLAG flag, and etmem adds a filter to its scanning module so that only those pages are scanned and swapped out.
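A hedged sketch of how the two sides fit together: the target process marks a region with madvise(MADV_SWAPFLAG), and the scanner turns on the VMA filter through the new ioctl. The constant values are copied from the hunks below; the /proc/<pid>/idle_pages path and the helper names are illustrative assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <linux/ioctl.h>

/* Values copied from the etmem_scan.h and mman-common.h hunks below. */
#define MADV_SWAPFLAG       203
#define IDLE_SCAN_MAGIC     0x66
#define VMA_SCAN_ADD_FLAGS  _IOW(IDLE_SCAN_MAGIC, 0x2, unsigned int)
#define VMA_SCAN_FLAG       0x1000

/* In the target process: mark a region whose pages may be swapped out. */
static int mark_for_swap(void *addr, size_t len)
{
    return madvise(addr, len, MADV_SWAPFLAG);
}

/* In the etmem scanner (proc path assumed): restrict scanning to marked VMAs. */
static int restrict_scan_to_marked(pid_t pid)
{
    char path[64];
    unsigned int flags = VMA_SCAN_FLAG;
    int fd, ret;

    snprintf(path, sizeof(path), "/proc/%d/idle_pages", (int)pid);
    fd = open(path, O_RDONLY);
    if (fd < 0)
        return -1;
    ret = ioctl(fd, VMA_SCAN_ADD_FLAGS, &flags);
    close(fd);
    return ret;
}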
Signed-off-by: liubo <liubo254@huawei.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/proc/etmem_scan.c                   | 32 ++++++++++++++++++++++++++
 fs/proc/etmem_scan.h                   |  9 ++++++++
 fs/proc/etmem_swap.c                   |  1 +
 fs/proc/task_mmu.c                     |  9 ++++++++
 include/linux/mm.h                     |  2 ++
 include/uapi/asm-generic/mman-common.h |  3 +++
 mm/madvise.c                           |  9 +++++++-
 7 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 1650208bad4c8..a436fa9280bb8 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -923,6 +923,11 @@ static int mm_idle_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; + struct page_idle_ctrl *pic = walk->private; + + /* If the specified page swapout is set, the untagged vma is skipped. */ + if ((pic->flags & VMA_SCAN_FLAG) && !(vma->vm_flags & VM_SWAPFLAG)) + return 1;
if (vma->vm_file) { if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) @@ -1025,6 +1030,31 @@ static ssize_t mm_idle_read(struct file *file, char *buf, return ret; }
+static long page_scan_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int flags; + + if (get_user(flags, (unsigned int __user *)argp)) + return -EFAULT; + + flags &= ALL_SCAN_FLAGS; + + switch (cmd) { + case VMA_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case VMA_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + extern struct file_operations proc_page_scan_operations;
static int page_scan_entry(void) @@ -1034,6 +1064,7 @@ static int page_scan_entry(void) proc_page_scan_operations.read = page_scan_read; proc_page_scan_operations.open = page_scan_open; proc_page_scan_operations.release = page_scan_release; + proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl; proc_page_scan_operations.flock(NULL, 0, NULL);
return 0; @@ -1046,6 +1077,7 @@ static void page_scan_exit(void) proc_page_scan_operations.read = NULL; proc_page_scan_operations.open = NULL; proc_page_scan_operations.release = NULL; + proc_page_scan_operations.unlocked_ioctl = NULL; proc_page_scan_operations.flock(NULL, 0, NULL); }
diff --git a/fs/proc/etmem_scan.h b/fs/proc/etmem_scan.h index 305739f92eef2..5deb7fb02f059 100644 --- a/fs/proc/etmem_scan.h +++ b/fs/proc/etmem_scan.h @@ -6,6 +6,15 @@ #define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ #define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */
+/* define to not used file flags */ +#define VMA_SCAN_FLAG 0x1000 /* scan the specifics vma with flag */ + +#define ALL_SCAN_FLAGS (SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | VMA_SCAN_FLAG) + +#define IDLE_SCAN_MAGIC 0x66 +#define VMA_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x2, unsigned int) +#define VMA_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x3, unsigned int) + enum ProcIdlePageType { PTE_ACCESSED, /* 4k page */ PMD_ACCESSED, /* 2M page */ diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c index aef9f9952848c..6eb422f5f3a34 100644 --- a/fs/proc/etmem_swap.c +++ b/fs/proc/etmem_swap.c @@ -63,6 +63,7 @@ static ssize_t swap_pages_write(struct file *file, const char __user *buf, ret = kstrtoul(p, 16, &vaddr); if (ret != 0) continue; + /* If get page struct failed, ignore it, get next page */ page = get_page_from_vaddr(mm, vaddr); if (!page) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4b324d102f512..495044e1990bd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1829,11 +1829,20 @@ static int mm_idle_release(struct inode *inode, struct file *file) return ret; }
+static long mm_idle_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_page_scan_operations.unlocked_ioctl) + return proc_page_scan_operations.unlocked_ioctl(filp, cmd, arg); + + return 0; +} + const struct file_operations proc_mm_idle_operations = { .llseek = mem_lseek, /* borrow this */ .read = mm_idle_read, .open = mm_idle_open, .release = mm_idle_release, + .unlocked_ioctl = mm_idle_ioctl, };
static DEFINE_SPINLOCK(swap_lock); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b724d39e6ee0..be0be448c3f19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -251,6 +251,8 @@ extern unsigned int kobjsize(const void *objp); #define VM_USWAP 0x2000000000000000 #endif
+#define VM_SWAPFLAG 0x400000000000000 /* memory swap out flag in vma */ + #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index e7ee32861d51d..58e55857258f0 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -66,6 +66,9 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_SWAPFLAG 203 /* memory swap flag, for memory to be swap out */ +#define MADV_SWAPFLAG_REMOVE 204 + /* compatibility flags */ #define MAP_FILE 0
diff --git a/mm/madvise.c b/mm/madvise.c index 1317267807b19..242a88ae3acf1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -129,6 +129,12 @@ static long madvise_behavior(struct vm_area_struct *vma, goto out; } break; + case MADV_SWAPFLAG: + new_flags |= VM_SWAPFLAG; + break; + case MADV_SWAPFLAG_REMOVE: + new_flags &= ~VM_SWAPFLAG; + break; }
if (new_flags == vma->vm_flags) { @@ -740,8 +746,9 @@ madvise_behavior_valid(int behavior) case MADV_SOFT_OFFLINE: case MADV_HWPOISON: #endif + case MADV_SWAPFLAG: + case MADV_SWAPFLAG_REMOVE: return true; - default: return false; }