euleros inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q
CVE: NA
----------------------------------------------------
During the upgrade from Linux 5.10 to Linux 6.4, several kernel structures changed.
The etmem feature therefore needs to be adapted to these changes when the etmem patch is applied to Linux 6.4.
Move part of the etmem feature code into mm/etmem.c, to avoid intrusive modification of the original code paths by the etmem feature.
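
The main interface adaptations, all taken from the diff below, are (5.10 form on the left, 6.4 form on the right):

    mmu->mmu_role.base.ad_disabled      ->  mmu->cpu_role.base.ad_disabled
    mmu->root_hpa                       ->  mmu->root.hpa
    mmu->shadow_root_level              ->  mmu->root_role.level
    spin_lock(&kvm->mmu_lock)           ->  read_lock/write_lock(&kvm->mmu_lock)
    down_read(&mm->mmap_lock)           ->  mmap_read_lock(mm)
    kvm_for_each_memslot(slot, slots)   ->  kvm_for_each_memslot(slot, bkt, slots)
    stage2_pgd_addr_end(kvm, addr, end) ->  stage2_range_addr_end(addr, end)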
Signed-off-by: liubo <liubo254@huawei.com>
---
 fs/proc/etmem_scan.c |  58 ++++---
 include/linux/swap.h |   2 +
 mm/Makefile          |   1 +
 mm/etmem.c           | 383 ++++++++++++++++++++++++++++++++++++++++++
 mm/internal.h        |   1 -
 mm/swap_state.c      |  34 ----
 mm/vmscan.c          | 391 +------------------------------------------
 7 files changed, 425 insertions(+), 445 deletions(-)
 create mode 100644 mm/etmem.c
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 7c1af58bbf21..06c202dcf1fe 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -43,7 +43,7 @@ #endif
# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) -# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +# define kvm_mmu_ad_disabled(mmu) (mmu->cpu_role.base.ad_disabled) #endif /*CONFIG_X86_64*/
#ifdef CONFIG_ARM64 @@ -314,13 +314,13 @@ static int vm_walk_host_range(unsigned long long start, unsigned long tmp_gpa_to_hva = pic->gpa_to_hva;
pic->gpa_to_hva = 0; - spin_unlock_irq(&pic->kvm->mmu_lock); - down_read(&walk->mm->mmap_lock); + read_unlock(&pic->kvm->mmu_lock); + mmap_read_lock(walk->mm); local_irq_disable(); ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, walk->ops, walk->private); local_irq_enable(); - up_read(&walk->mm->mmap_lock); + mmap_read_unlock(walk->mm); pic->gpa_to_hva = tmp_gpa_to_hva; if (pic->flags & VM_SCAN_HOST) { pic->restart_gpa -= tmp_gpa_to_hva; @@ -537,28 +537,28 @@ static int ept_page_range(struct page_idle_ctrl *pic,
WARN_ON(addr >= end);
- spin_lock_irq(&pic->kvm->mmu_lock); + read_lock(&pic->kvm->mmu_lock);
vcpu = kvm_get_vcpu(pic->kvm, 0); if (!vcpu) { pic->gpa_to_hva = 0; set_restart_gpa(TASK_SIZE, "NO-VCPU"); - spin_unlock_irq(&pic->kvm->mmu_lock); + read_unlock(&pic->kvm->mmu_lock); return -EINVAL; }
mmu = kvm_arch_mmu_pointer(vcpu); - if (!VALID_PAGE(mmu->root_hpa)) { + if (!VALID_PAGE(mmu->root.hpa)) { pic->gpa_to_hva = 0; set_restart_gpa(TASK_SIZE, "NO-HPA"); - spin_unlock_irq(&pic->kvm->mmu_lock); + read_unlock(&pic->kvm->mmu_lock); return -EINVAL; }
- ept_root = __va(mmu->root_hpa); + ept_root = __va(mmu->root.hpa);
/* Walk start at p4d when vm has 4 level table pages */ - if (mmu->shadow_root_level != 4) + if (mmu->root_role.level != 4) err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); else err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); @@ -567,7 +567,7 @@ static int ept_page_range(struct page_idle_ctrl *pic, * and RET_RESCAN_FLAG will be set in ret value */ if (!(err & RET_RESCAN_FLAG)) - spin_unlock_irq(&pic->kvm->mmu_lock); + read_unlock(&pic->kvm->mmu_lock); else err &= ~RET_RESCAN_FLAG;
@@ -584,23 +584,31 @@ static int ept_idle_supports_cpu(struct kvm *kvm) if (!vcpu) return -EINVAL;
- spin_lock(&kvm->mmu_lock); + read_lock(&kvm->mmu_lock); mmu = kvm_arch_mmu_pointer(vcpu); if (kvm_mmu_ad_disabled(mmu)) { pr_notice("CPU does not support EPT A/D bits tracking\n"); ret = -EINVAL; - } else if (mmu->shadow_root_level < 4 || - (mmu->shadow_root_level == 5 && !pgtable_l5_enabled())) { - pr_notice("Unsupported EPT level %d\n", mmu->shadow_root_level); + } else if (mmu->root_role.level < 4 || + (mmu->root_role.level == 5 && !pgtable_l5_enabled())) { + pr_notice("Unsupported EPT level %d\n", mmu->root_role.level); ret = -EINVAL; } else ret = 0; - spin_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock);
return ret; }
#else +static inline phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? boundary : end; +} + static int arm_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -724,13 +732,13 @@ static int arm_page_range(struct page_idle_ctrl *pic,
WARN_ON(addr >= end);
- spin_lock(&pic->kvm->mmu_lock); + read_lock(&pic->kvm->mmu_lock); pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); - spin_unlock(&pic->kvm->mmu_lock); + read_unlock(&pic->kvm->mmu_lock);
local_irq_disable(); do { - next = stage2_pgd_addr_end(kvm, addr, end); + next = stage2_range_addr_end(addr, end); if (!pgd_present(*pgd)) { set_restart_gpa(next, "PGD_HOLE"); continue; @@ -773,11 +781,12 @@ static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, struct kvm_memory_slot *memslot; unsigned long hva_end; gfn_t gfn; + int bkt;
*addr_range = ~0UL; mutex_lock(&kvm->slots_lock); slots = kvm_memslots(pic->kvm); - kvm_for_each_memslot(memslot, slots) { + kvm_for_each_memslot(memslot, bkt, slots) { hva_end = memslot->userspace_addr + (memslot->npages << PAGE_SHIFT);
@@ -1045,9 +1054,9 @@ static int page_scan_release(struct inode *inode, struct file *file) goto out; } #ifdef CONFIG_X86_64 - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); #endif
out: @@ -1217,7 +1226,7 @@ static int mm_idle_walk_range(struct page_idle_ctrl *pic, return ret;
for (; start < end;) { - down_read(&walk->mm->mmap_lock); + mmap_read_lock(walk->mm); vma = find_vma(walk->mm, start); if (vma) { if (end > vma->vm_start) { @@ -1229,8 +1238,7 @@ static int mm_idle_walk_range(struct page_idle_ctrl *pic, set_restart_gpa(vma->vm_start, "VMA-HOLE"); } else set_restart_gpa(TASK_SIZE, "EOF"); - up_read(&walk->mm->mmap_lock); - + mmap_read_unlock(walk->mm); WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) pic->restart_gpa = end; diff --git a/include/linux/swap.h b/include/linux/swap.h index 45cbd02c909f..f620decea34e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -449,6 +449,7 @@ enum etmem_swapcache_watermark_en { ETMEM_SWAPCACHE_NR_WMARK };
+extern struct kobj_attribute kernel_swap_enable_attr; extern int add_page_for_swap(struct page *page, struct list_head *pagelist); extern struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr); @@ -752,6 +753,7 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) return vm_swap_full(); } #endif +extern unsigned long reclaim_pages(struct list_head *folio_list);
#endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/Makefile b/mm/Makefile index 0824907eab98..cc147c0d7ca0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_ETMEM) += etmem.o diff --git a/mm/etmem.c b/mm/etmem.c new file mode 100644 index 000000000000..4187fe7eef0c --- /dev/null +++ b/mm/etmem.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/memcontrol.h> +#include <linux/gfp.h> +#include <linux/mm_inline.h> + +#include "internal.h" + +static bool enable_kernel_swap __read_mostly = true; + +bool kernel_swap_enabled(void) +{ + return READ_ONCE(enable_kernel_swap); +} + +static ssize_t kernel_swap_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", enable_kernel_swap ? "true" : "false"); +} +static ssize_t kernel_swap_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + WRITE_ONCE(enable_kernel_swap, true); + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + WRITE_ONCE(enable_kernel_swap, false); + else + return -EINVAL; + + return count; +} + +struct kobj_attribute kernel_swap_enable_attr = + __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, + kernel_swap_enable_store); + +int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + int err = -EBUSY; + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EACCES; + + if (PageHuge(page)) + return -EACCES; + + head = compound_head(page); + if (!folio_isolate_lru(page_folio(head))) { + put_page(page); + return err; + } + put_page(page); + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add_tail(&head->lru, pagelist); + + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(add_page_for_swap); + +struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) +{ + struct page *page; + struct vm_area_struct *vma; + unsigned int follflags; + + mmap_read_lock(mm); + + vma = find_vma(mm, vaddr); + if (!vma || vaddr < vma->vm_start || vma->vm_flags & VM_LOCKED) { + mmap_read_unlock(mm); + return NULL; + } + + follflags = FOLL_GET | FOLL_DUMP | FOLL_FORCE; + page = follow_page(vma, vaddr, follflags); + if (IS_ERR(page) || !page) { + mmap_read_unlock(mm); + return NULL; + } + + mmap_read_unlock(mm); + return page; +} +EXPORT_SYMBOL_GPL(get_page_from_vaddr); + +static int add_page_for_reclaim_swapcache(struct page *page, + struct list_head *pagelist, struct lruvec *lruvec, enum lru_list lru) +{ + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EINVAL; + + if (PageHuge(page)) + return -EINVAL; + + head = compound_head(page); + if (!PageLRU(head) || PageUnevictable(head)) + return -EBUSY; + + if (unlikely(!get_page_unless_zero(page))) + return -EBUSY; + + if (!TestClearPageLRU(page)) { + /* + * This page may in other isolation path, + * but we still hold lru_lock. 
+ */ + put_page(page); + return -EBUSY; + } + + list_move(&head->lru, pagelist); + update_lru_size(lruvec, lru, page_zonenum(head), -thp_nr_pages(head)); + + return 0; +} + +static unsigned long reclaim_swapcache_pages_from_list(int nid, + struct list_head *page_list, unsigned long reclaim_num, bool putback_flag) +{ + unsigned long nr_reclaimed = 0; + unsigned long nr_moved = 0; + struct page *page, *next; + LIST_HEAD(swap_pages); + struct pglist_data *pgdat = NULL; + + pgdat = NODE_DATA(nid); + + if (putback_flag) + goto putback_list; + + if (reclaim_num == 0) + return 0; + + list_for_each_entry_safe(page, next, page_list, lru) { + if (!page_is_file_lru(page) && !__PageMovable(page) + && PageSwapCache(page)) { + ClearPageActive(page); + list_move(&page->lru, &swap_pages); + nr_moved++; + } + + if (nr_moved >= reclaim_num) + break; + } + + /* swap the pages */ + if (pgdat) + nr_reclaimed = reclaim_pages(&swap_pages); + + return nr_reclaimed; + +putback_list: + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; +} + +#define SWAP_SCAN_NUM_MAX 32 + +static bool swapcache_below_watermark(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]; +} + +static unsigned long get_swapcache_reclaim_num(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() > + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ? + (total_swapcache_pages() - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0; +} + +/* + * The main function to reclaim swapcache, the whole reclaim process is + * divided into 3 steps. + * 1. get the total_swapcache_pages num to reclaim. + * 2. scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. + * 3. reclaim the swapcache page until the requirements are meet. + */ +int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + int err = -EINVAL; + unsigned long swapcache_to_reclaim = 0; + unsigned long nr_reclaimed = 0; + unsigned long swapcache_total_reclaimable = 0; + unsigned long reclaim_page_count = 0; + + unsigned long *nr = NULL; + unsigned long *nr_to_reclaim = NULL; + struct list_head *swapcache_list = NULL; + + int nid = 0; + struct lruvec *lruvec = NULL; + struct list_head *src = NULL; + struct page *page = NULL; + struct page *next = NULL; + struct page *pos = NULL; + + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = NULL; + + pg_data_t *pgdat = NULL; + unsigned int scan_count = 0; + int nid_num = 0; + + if (swapcache_watermark == NULL || + watermark_nr < ETMEM_SWAPCACHE_NR_WMARK) + return err; + + /* get the total_swapcache_pages num to reclaim. */ + swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark); + if (swapcache_to_reclaim <= 0) + return err; + + nr = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr == NULL) + return -ENOMEM; + + nr_to_reclaim = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr_to_reclaim == NULL) { + kfree(nr); + return -ENOMEM; + } + + swapcache_list = kcalloc(MAX_NUMNODES, sizeof(struct list_head), GFP_KERNEL); + if (swapcache_list == NULL) { + kfree(nr); + kfree(nr_to_reclaim); + return -ENOMEM; + } + + /* + * scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. 
+ */ + for_each_node_state(nid, N_MEMORY) { + INIT_LIST_HEAD(&swapcache_list[nid_num]); + cond_resched(); + + pgdat = NODE_DATA(nid); + + memcg = mem_cgroup_iter(target_memcg, NULL, NULL); + do { + cond_resched(); + pos = NULL; + lruvec = mem_cgroup_lruvec(memcg, pgdat); + src = &(lruvec->lists[LRU_INACTIVE_ANON]); + spin_lock_irq(&lruvec->lru_lock); + scan_count = 0; + + /* + * Scan the swapcache pages that are not mapped from + * the end of the LRU linked list, scan SWAP_SCAN_NUM_MAX + * pages each time, and record the scan end point page. + */ + + pos = list_last_entry(src, struct page, lru); + spin_unlock_irq(&lruvec->lru_lock); +do_scan: + cond_resched(); + scan_count = 0; + spin_lock_irq(&lruvec->lru_lock); + + /* + * check if pos page is been released or not in LRU list, if true, + * cancel the subsequent page scanning of the current node. + */ + if (!pos || list_entry_is_head(pos, src, lru)) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + if (!PageLRU(pos) || folio_lru_list(page_folio(pos)) != LRU_INACTIVE_ANON) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + page = pos; + pos = NULL; + /* Continue to scan down from the last scan breakpoint */ + list_for_each_entry_safe_reverse_from(page, next, src, lru) { + scan_count++; + pos = next; + if (scan_count >= SWAP_SCAN_NUM_MAX) + break; + + if (!PageSwapCache(page)) + continue; + + if (page_mapped(page)) + continue; + + if (add_page_for_reclaim_swapcache(page, + &swapcache_list[nid_num], + lruvec, LRU_INACTIVE_ANON) != 0) + continue; + + nr[nid_num]++; + swapcache_total_reclaimable++; + } + spin_unlock_irq(&lruvec->lru_lock); + + /* + * Check whether the scanned pages meet + * the reclaim requirements. + */ + if (swapcache_total_reclaimable <= swapcache_to_reclaim || + scan_count >= SWAP_SCAN_NUM_MAX) + goto do_scan; + + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); + + /* Start reclaiming the next memory node. */ + nid_num++; + } + + /* reclaim the swapcache page until the requirements are meet. */ + do { + nid_num = 0; + reclaim_page_count = 0; + + /* start swapcache page reclaim for each node. */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + + nr_to_reclaim[nid_num] = (swapcache_total_reclaimable == 0) ? 0 : + ((swapcache_to_reclaim * nr[nid_num]) / + swapcache_total_reclaimable); + + reclaim_page_count += reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], + nr_to_reclaim[nid_num], false); + nid_num++; + } + + nr_reclaimed += reclaim_page_count; + + /* + * Check whether the swapcache page reaches the reclaim requirement or + * the number of the swapcache page reclaimd is 0. Stop reclaim. + */ + if (nr_reclaimed >= swapcache_to_reclaim || reclaim_page_count == 0) + goto exit; + } while (!swapcache_below_watermark(swapcache_watermark) || + nr_reclaimed < swapcache_to_reclaim); +exit: + nid_num = 0; + /* + * Repopulate the swapcache pages that are not reclaimd back + * to the LRU linked list. + */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], 0, true); + nid_num++; + } + + kfree(nr); + kfree(nr_to_reclaim); + kfree(swapcache_list); + + return 0; +} +EXPORT_SYMBOL_GPL(do_swapcache_reclaim); diff --git a/mm/internal.h b/mm/internal.h index 68410c6d97ac..ba568b48072c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -783,7 +783,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long);
extern void set_pageblock_order(void); -unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 48a34514b235..4ce292e2aea3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -40,9 +40,6 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; -#ifdef CONFIG_ETMEM -static bool enable_kernel_swap __read_mostly = true; -#endif
#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -325,13 +322,6 @@ static inline bool swap_use_vma_readahead(void) return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); }
-#ifdef CONFIG_ETMEM -bool kernel_swap_enabled(void) -{ - return READ_ONCE(enable_kernel_swap); -} -#endif - /* * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel @@ -879,30 +869,6 @@ static ssize_t vma_ra_enabled_store(struct kobject *kobj, } static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
-#ifdef CONFIG_ETMEM -static ssize_t kernel_swap_enable_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", enable_kernel_swap ? "true" : "false"); -} -static ssize_t kernel_swap_enable_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - WRITE_ONCE(enable_kernel_swap, true); - else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - WRITE_ONCE(enable_kernel_swap, false); - else - return -EINVAL; - - return count; -} -static struct kobj_attribute kernel_swap_enable_attr = - __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, - kernel_swap_enable_store); -#endif - static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, #ifdef CONFIG_ETMEM diff --git a/mm/vmscan.c b/mm/vmscan.c index dda21e824349..0a8cd99f78a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,7 +33,6 @@ #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> -#include <linux/mempolicy.h> #include <linux/compaction.h> #include <linux/notifier.h> #include <linux/rwsem.h> @@ -6983,17 +6982,17 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, return false; }
-#ifdef CONFIG_ETMEM /* * Check if original kernel swap is enabled * turn off kernel swap,but leave page cache reclaim on */ -static inline void kernel_swap_check(struct scan_control *sc) +static inline void kernel_force_no_swap(struct scan_control *sc) { +#ifdef CONFIG_ETMEM if (sc != NULL && !kernel_swap_enabled()) sc->may_swap = 0; -} #endif +}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) @@ -7011,9 +7010,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_swap = 1, };
-#ifdef CONFIG_ETMEM - kernel_swap_check(&sc); -#endif + kernel_force_no_swap(&sc); /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. @@ -7451,9 +7448,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) sc.may_writepage = !laptop_mode && !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim;
-#ifdef CONFIG_ETMEM - kernel_swap_check(&sc); -#endif + kernel_force_no_swap(&sc);
/* * Do some background aging, to give pages a chance to be @@ -7833,9 +7828,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state);
-#ifdef CONFIG_ETMEM - kernel_swap_check(&sc); -#endif + kernel_force_no_swap(&sc);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -7994,9 +7987,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in cond_resched(); psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); -#ifdef CONFIG_ETMEM - kernel_swap_check(&sc); -#endif /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ @@ -8133,372 +8123,3 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); - -#ifdef CONFIG_ETMEM -int add_page_for_swap(struct page *page, struct list_head *pagelist) -{ - int err = -EBUSY; - struct page *head; - - /* If the page is mapped by more than one process, do not swap it */ - if (page_mapcount(page) > 1) - return -EACCES; - - if (PageHuge(page)) - return -EACCES; - - head = compound_head(page); - err = isolate_lru_page(head); - if (err) { - put_page(page); - return err; - } - put_page(page); - if (PageUnevictable(page)) - putback_lru_page(page); - else - list_add_tail(&head->lru, pagelist); - - err = 0; - return err; -} -EXPORT_SYMBOL_GPL(add_page_for_swap); - -struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) -{ - struct page *page; - struct vm_area_struct *vma; - unsigned int follflags; - - down_read(&mm->mmap_lock); - - vma = find_vma(mm, vaddr); - if (!vma || vaddr < vma->vm_start || vma->vm_flags & VM_LOCKED) { - up_read(&mm->mmap_lock); - return NULL; - } - - follflags = FOLL_GET | FOLL_DUMP; - page = follow_page(vma, vaddr, follflags); - if (IS_ERR(page) || !page) { - up_read(&mm->mmap_lock); - return NULL; - } - - up_read(&mm->mmap_lock); - return page; -} -EXPORT_SYMBOL_GPL(get_page_from_vaddr); - -static int add_page_for_reclaim_swapcache(struct page *page, - struct list_head *pagelist, struct lruvec *lruvec, enum lru_list lru) -{ - struct page *head; - - /* If the page is mapped by more than one process, do not swap it */ - if (page_mapcount(page) > 1) - return -EACCES; - - if (PageHuge(page)) - return -EACCES; - - head = compound_head(page); - - switch (__isolate_lru_page_prepare(head, 0)) { - case 0: - if (unlikely(!get_page_unless_zero(page))) - return -1; - - if (!TestClearPageLRU(page)) { - /* - * This page may in other isolation path, - * but we still hold lru_lock. 
- */ - put_page(page); - return -1; - } - - list_move(&head->lru, pagelist); - update_lru_size(lruvec, lru, page_zonenum(head), -thp_nr_pages(head)); - break; - - case -EBUSY: - return -1; - default: - break; - } - - return 0; -} - -static unsigned long reclaim_swapcache_pages_from_list(int nid, - struct list_head *page_list, unsigned long reclaim_num, bool putback_flag) -{ - struct scan_control sc = { - .may_unmap = 1, - .may_swap = 1, - .may_writepage = 1, - .gfp_mask = GFP_KERNEL, - }; - unsigned long nr_reclaimed = 0; - unsigned long nr_moved = 0; - struct page *page, *next; - LIST_HEAD(swap_pages); - struct pglist_data *pgdat = NULL; - struct reclaim_stat stat; - - pgdat = NODE_DATA(nid); - - if (putback_flag) - goto putback_list; - - if (reclaim_num == 0) - return 0; - - list_for_each_entry_safe(page, next, page_list, lru) { - if (!page_is_file_lru(page) && !__PageMovable(page) - && PageSwapCache(page)) { - ClearPageActive(page); - list_move(&page->lru, &swap_pages); - nr_moved++; - } - - if (nr_moved >= reclaim_num) - break; - } - - /* swap the pages */ - if (pgdat) - nr_reclaimed = shrink_page_list(&swap_pages, - pgdat, - &sc, - &stat, true); - - while (!list_empty(&swap_pages)) { - page = lru_to_page(&swap_pages); - list_del(&page->lru); - putback_lru_page(page); - } - - return nr_reclaimed; - -putback_list: - while (!list_empty(page_list)) { - page = lru_to_page(page_list); - list_del(&page->lru); - putback_lru_page(page); - } - - return nr_reclaimed; -} - -#define SWAP_SCAN_NUM_MAX 32 - -static bool swapcache_below_watermark(unsigned long *swapcache_watermark) -{ - return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]; -} - -static unsigned long get_swapcache_reclaim_num(unsigned long *swapcache_watermark) -{ - return total_swapcache_pages() > - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ? - (total_swapcache_pages() - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0; -} - -/* - * The main function to reclaim swapcache, the whole reclaim process is - * divided into 3 steps. - * 1. get the total_swapcache_pages num to reclaim. - * 2. scan the LRU linked list of each memory node to obtain the - * swapcache pages that can be reclaimd. - * 3. reclaim the swapcache page until the requirements are meet. - */ -int do_swapcache_reclaim(unsigned long *swapcache_watermark, - unsigned int watermark_nr) -{ - int err = -EINVAL; - unsigned long swapcache_to_reclaim = 0; - unsigned long nr_reclaimed = 0; - unsigned long swapcache_total_reclaimable = 0; - unsigned long reclaim_page_count = 0; - - unsigned long *nr = NULL; - unsigned long *nr_to_reclaim = NULL; - struct list_head *swapcache_list = NULL; - - int nid = 0; - struct lruvec *lruvec = NULL; - struct list_head *src = NULL; - struct page *page = NULL; - struct page *next = NULL; - struct page *pos = NULL; - - struct mem_cgroup *memcg = NULL; - struct mem_cgroup *target_memcg = NULL; - - pg_data_t *pgdat = NULL; - unsigned int scan_count = 0; - int nid_num = 0; - - if (swapcache_watermark == NULL || - watermark_nr < ETMEM_SWAPCACHE_NR_WMARK) - return err; - - /* get the total_swapcache_pages num to reclaim. 
*/ - swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark); - if (swapcache_to_reclaim <= 0) - return err; - - nr = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); - if (nr == NULL) - return -ENOMEM; - - nr_to_reclaim = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); - if (nr_to_reclaim == NULL) { - kfree(nr); - return -ENOMEM; - } - - swapcache_list = kcalloc(MAX_NUMNODES, sizeof(struct list_head), GFP_KERNEL); - if (swapcache_list == NULL) { - kfree(nr); - kfree(nr_to_reclaim); - return -ENOMEM; - } - - /* - * scan the LRU linked list of each memory node to obtain the - * swapcache pages that can be reclaimd. - */ - for_each_node_state(nid, N_MEMORY) { - INIT_LIST_HEAD(&swapcache_list[nid_num]); - cond_resched(); - - pgdat = NODE_DATA(nid); - - memcg = mem_cgroup_iter(target_memcg, NULL, NULL); - do { - cond_resched(); - pos = NULL; - lruvec = mem_cgroup_lruvec(memcg, pgdat); - src = &(lruvec->lists[LRU_INACTIVE_ANON]); - spin_lock_irq(&lruvec->lru_lock); - scan_count = 0; - - /* - * Scan the swapcache pages that are not mapped from - * the end of the LRU linked list, scan SWAP_SCAN_NUM_MAX - * pages each time, and record the scan end point page. - */ - - pos = list_last_entry(src, struct page, lru); - spin_unlock_irq(&lruvec->lru_lock); -do_scan: - cond_resched(); - scan_count = 0; - spin_lock_irq(&lruvec->lru_lock); - - /* - * check if pos page is been released or not in LRU list, if true, - * cancel the subsequent page scanning of the current node. - */ - if (!pos || list_entry_is_head(pos, src, lru)) { - spin_unlock_irq(&lruvec->lru_lock); - continue; - } - - if (!PageLRU(pos) || page_lru(pos) != LRU_INACTIVE_ANON) { - spin_unlock_irq(&lruvec->lru_lock); - continue; - } - - page = pos; - pos = NULL; - /* Continue to scan down from the last scan breakpoint */ - list_for_each_entry_safe_reverse_from(page, next, src, lru) { - scan_count++; - pos = next; - if (scan_count >= SWAP_SCAN_NUM_MAX) - break; - - if (!PageSwapCache(page)) - continue; - - if (page_mapped(page)) - continue; - - if (add_page_for_reclaim_swapcache(page, - &swapcache_list[nid_num], - lruvec, LRU_INACTIVE_ANON) != 0) - continue; - - nr[nid_num]++; - swapcache_total_reclaimable++; - } - spin_unlock_irq(&lruvec->lru_lock); - - /* - * Check whether the scanned pages meet - * the reclaim requirements. - */ - if (swapcache_total_reclaimable <= swapcache_to_reclaim || - scan_count >= SWAP_SCAN_NUM_MAX) - goto do_scan; - - } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); - - /* Start reclaiming the next memory node. */ - nid_num++; - } - - /* reclaim the swapcache page until the requirements are meet. */ - do { - nid_num = 0; - reclaim_page_count = 0; - - /* start swapcache page reclaim for each node. */ - for_each_node_state(nid, N_MEMORY) { - cond_resched(); - - nr_to_reclaim[nid_num] = (swapcache_total_reclaimable == 0) ? 0 : - ((swapcache_to_reclaim * nr[nid_num]) / - swapcache_total_reclaimable); - - reclaim_page_count += reclaim_swapcache_pages_from_list(nid, - &swapcache_list[nid_num], - nr_to_reclaim[nid_num], false); - nid_num++; - } - - nr_reclaimed += reclaim_page_count; - - /* - * Check whether the swapcache page reaches the reclaim requirement or - * the number of the swapcache page reclaimd is 0. Stop reclaim. 
- */ - if (nr_reclaimed >= swapcache_to_reclaim || reclaim_page_count == 0) - goto exit; - } while (!swapcache_below_watermark(swapcache_watermark) || - nr_reclaimed < swapcache_to_reclaim); -exit: - nid_num = 0; - /* - * Repopulate the swapcache pages that are not reclaimd back - * to the LRU linked list. - */ - for_each_node_state(nid, N_MEMORY) { - cond_resched(); - reclaim_swapcache_pages_from_list(nid, - &swapcache_list[nid_num], 0, true); - nid_num++; - } - - kfree(nr); - kfree(nr_to_reclaim); - kfree(swapcache_list); - - return 0; -} -EXPORT_SYMBOL_GPL(do_swapcache_reclaim); -#endif