This patch set is used to backport etmem feature patches to version 23.09.
etmem is a tiered memory extension technology that combines DRAM with memory compression or high-performance storage media to form tiered memory storage.
Memory data is tiered, and cold data is migrated from memory media to high-performance storage media to release memory space and reduce memory costs.
v5 -> v6 - Make the sysctl_hugetlb_* variables static and delete unnecessary extern declarations.
v4 -> v5 - address review comments from https://gitee.com/openeuler/kernel/pulls/1772
v3 -> v4 - fix the commit message format error
v2 -> v3 - fix the build failure on the riscv64 architecture.
v1 -> v2 - miscellaneous cleanup fixes
Kemeng Shi (11): etmem: add ioctl for mm idle scan etmem: x86: support scan hugetlb of vm etmem_scan: x86: support scan 4 level ept under 5 level host page table etmem scan: fix memleak in vm_idle_read etmem: fix concurrent access to export file operations etmem: fix potential UAF when walk ept page table etmem_scan: release CPU after scan walk_step size etmem_scan: add pte_hole callback acpi/numa: memorize node type from SRAT table hugepage: add sysctl for hugepage alloc and mig x86: hugepage: use nt copy hugepage to AEP in x86
liubo (14): memig: add memig-scan feature to openEuler memig: add memig-swap feature to openEuler memig: fix compile error when CONFIG_NUMA is turned off etmem: Modify the memig feature name to etmem move ETMEM feature CONFIG to mm/Kconfig and add architecture dependency etmem: etmem scan module Replace WARN_ONCE() with debug_printk for "nothing read" add ETMEM feature CONFIG to mm/Kconfig config: enable CONFIG_ETMEM by default etmem: add CONFIG_ETMEM macro definition for etmem feature etmem: add original kernel swap enabled options etmem: add swapcache reclaim to etmem etmem: Add a scan flag to support specified page swap-out etmem: fix the div 0 problem in swapcache reclaim process etmem: fix some problem during backport etmem patches
arch/arm64/configs/openeuler_defconfig | 3 + arch/x86/configs/openeuler_defconfig | 3 + arch/x86/include/asm/page_64.h | 7 + arch/x86/lib/Makefile | 1 + arch/x86/lib/copy_highpages.c | 107 ++ arch/x86/lib/copy_page_64.S | 73 ++ drivers/acpi/numa/srat.c | 5 + fs/proc/Makefile | 2 + fs/proc/base.c | 8 + fs/proc/etmem_scan.c | 1382 ++++++++++++++++++++++++ fs/proc/etmem_scan.h | 149 +++ fs/proc/etmem_swap.c | 282 +++++ fs/proc/internal.h | 4 + fs/proc/task_mmu.c | 194 ++++ include/linux/highmem.h | 17 + include/linux/list.h | 17 + include/linux/mm.h | 4 + include/linux/mm_types.h | 17 +- include/linux/numa.h | 12 + include/linux/swap.h | 39 + include/uapi/asm-generic/mman-common.h | 4 + mm/Kconfig | 25 + mm/Makefile | 1 + mm/etmem.c | 383 +++++++ mm/huge_memory.c | 1 + mm/hugetlb.c | 27 +- mm/internal.h | 1 - mm/madvise.c | 17 +- mm/page_alloc.c | 13 + mm/pagewalk.c | 1 + mm/swap_state.c | 3 + mm/util.c | 11 +- mm/vmscan.c | 18 + virt/kvm/kvm_main.c | 6 + 34 files changed, 2824 insertions(+), 13 deletions(-) create mode 100644 arch/x86/lib/copy_highpages.c create mode 100644 fs/proc/etmem_scan.c create mode 100644 fs/proc/etmem_scan.h create mode 100644 fs/proc/etmem_swap.c create mode 100644 mm/etmem.c
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q Reference: https://gitee.com/openeuler/kernel/commit/c13e5b6a937c6865c461484b07530be3c3...
-------------------------------------------------
reason: This patch adds the memig scan feature to the openEuler system. memig_scan.ko scans the virtual addresses of the target process and returns the address access information to user mode, where it is used to classify pages as cold or hot.
Signed-off-by: Fengguang Wu fengguang.wu@intel.com Signed-off-by: yanxiaodan yanxiaodan@huawei.com Signed-off-by: Feilong Lin linfeilong@huawei.com Signed-off-by: geruijun geruijun@huawei.com Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + fs/proc/Makefile | 1 + fs/proc/base.c | 2 + fs/proc/internal.h | 1 + fs/proc/memig_scan.c | 1079 ++++++++++++++++++++++++ fs/proc/memig_scan.h | 132 +++ fs/proc/task_mmu.c | 66 ++ include/linux/mm_types.h | 17 +- lib/Kconfig | 6 + mm/huge_memory.c | 1 + mm/pagewalk.c | 1 + virt/kvm/kvm_main.c | 6 + 13 files changed, 1313 insertions(+), 1 deletion(-) create mode 100644 fs/proc/memig_scan.c create mode 100644 fs/proc/memig_scan.h
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 081a223bc65b..fc6af3dbf1c8 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7810,6 +7810,7 @@ CONFIG_FUNCTION_ERROR_INJECTION=y CONFIG_ARCH_HAS_KCOV=y # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y +CONFIG_MEMIG_SCAN_MODULE=m # CONFIG_MEMTEST is not set # end of Kernel Testing and Coverage
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index f6140635690e..6fb2f9270251 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -9046,6 +9046,7 @@ CONFIG_ARCH_HAS_KCOV=y # CONFIG_KCOV is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y +CONFIG_MEMIG_SCAN_MODULE=m # CONFIG_MEMTEST is not set # CONFIG_HYPERV_TESTING is not set # end of Kernel Testing and Coverage diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8ba..50c6de6f4979 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -34,3 +34,4 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o +obj-$(CONFIG_MEMIG_SCAN_MODULE) += memig_scan.o diff --git a/fs/proc/base.c b/fs/proc/base.c index 7183f338404d..ac108995d68b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3353,6 +3353,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), + REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -3701,6 +3702,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), + REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9dda7e54b2d0..09867fbcbafc 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -305,6 +305,7 @@ extern const struct 
file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_mm_idle_operations;
extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/memig_scan.c b/fs/proc/memig_scan.c new file mode 100644 index 000000000000..3964e7652127 --- /dev/null +++ b/fs/proc/memig_scan.c @@ -0,0 +1,1079 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/pagemap.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include <linux/uaccess.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/bitmap.h> +#include <linux/sched/mm.h> +#include <linux/version.h> +#include <linux/module.h> +#include <linux/io.h> +#include <linux/pagewalk.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <linux/huge_mm.h> +#ifdef CONFIG_ARM64 +#include <asm/pgtable-types.h> +#include <asm/memory.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_arm.h> +#include <asm/stage2_pgtable.h> +#endif +#include "memig_scan.h" + +#ifdef CONFIG_X86_64 +/* + * Fallback to false for kernel doens't support KVM_INVALID_SPTE + * ept_idle can sitll work in this situation but the scan accuracy may drop, + * depends on the access frequences of the workload. 
+ */ +#ifdef KVM_INVALID_SPTE +#define KVM_CHECK_INVALID_SPTE(val) ((val) == KVM_INVALID_SPTE) +#else +#define KVM_CHECK_INVALID_SPTE(val) (0) +#endif + +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +#endif /*CONFIG_X86_64*/ + +#ifdef CONFIG_ARM64 +#define if_pmd_thp_or_huge(pmd) (if_pmd_huge(pmd) || pmd_trans_huge(pmd)) +#endif /* CONFIG_ARM64 */ + +#ifdef DEBUG + +#define debug_printk trace_printk + +#define set_restart_gpa(val, note) ({ \ + unsigned long old_val = pic->restart_gpa; \ + pic->restart_gpa = (val); \ + trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ + (val), (pic->restart_gpa - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#define set_next_hva(val, note) ({ \ + unsigned long old_val = pic->next_hva; \ + pic->next_hva = (val); \ + trace_printk(" next_hva=%lx %luK %s %s %d\n", \ + (val), (pic->next_hva - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#else + +#define debug_printk(...) 
+ +#define set_restart_gpa(val, note) ({ \ + pic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note) ({ \ + pic->next_hva = (val); \ +}) + +#endif + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY_M] = PAGE_SIZE, + [PMD_DIRTY_M] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_pic(struct page_idle_ctrl *pic) +{ + debug_printk("page_idle_ctrl: pie_read=%d pie_read_max=%d", + pic->pie_read, + pic->pie_read_max); + debug_printk(" buf_size=%d bytes_copied=%d next_hva=%pK", + pic->buf_size, + pic->bytes_copied, + pic->next_hva); + debug_printk(" restart_gpa=%pK pa_to_hva=%pK\n", + pic->restart_gpa, + pic->gpa_to_hva); +} + +#ifdef CONFIG_ARM64 +static int if_pmd_huge(pmd_t pmd) +{ + return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); +} + +static int if_pud_huge(pud_t pud) +{ +#ifndef __PAGETABLE_PMD_FOLDED + return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT); +#else + return 0; +#endif +} +#endif + +static void pic_report_addr(struct page_idle_ctrl *pic, unsigned long addr) +{ + unsigned long hva; + + pic->kpie[pic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &pic->kpie[pic->pie_read]); + pic->pie_read += sizeof(uint64_t); + dump_pic(pic); +} + +static int pic_add_page(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + unsigned long page_size = pagetype_size[page_type]; + + dump_pic(pic); + + /* align kernel/user vision of cursor position */ + next = round_up(next, 
page_size); + + if (!pic->pie_read || + addr + pic->gpa_to_hva != pic->next_hva) { + /* merge hole */ + if (page_type == PTE_HOLE || + page_type == PMD_HOLE) { + set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); + return 0; + } + + if (addr + pic->gpa_to_hva < pic->next_hva) { + debug_printk("page_idle: addr moves backwards\n"); + WARN_ONCE(1, "page_idle: addr moves backwards"); + } + + if (pic->pie_read + sizeof(uint64_t) + 2 >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + + pic_report_addr(pic, round_down(addr, page_size) + + pic->gpa_to_hva); + } else { + if (PIP_TYPE(pic->kpie[pic->pie_read - 1]) == page_type && + PIP_SIZE(pic->kpie[pic->pie_read - 1]) < 0xF) { + set_next_hva(next + pic->gpa_to_hva, "IN-PLACE INC"); + set_restart_gpa(next, "IN-PLACE INC"); + pic->kpie[pic->pie_read - 1]++; + WARN_ONCE(page_size < next-addr, "next-addr too large"); + return 0; + } + if (pic->pie_read >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + } + + set_next_hva(next + pic->gpa_to_hva, "NEW-ITEM"); + set_restart_gpa(next, "NEW-ITEM"); + pic->kpie[pic->pie_read] = PIP_COMPOSE(page_type, 1); + pic->pie_read++; + + return 0; +} + +static int init_page_idle_ctrl_buffer(struct page_idle_ctrl *pic) +{ + pic->pie_read = 0; + pic->pie_read_max = min(PAGE_IDLE_KBUF_SIZE, + pic->buf_size - pic->bytes_copied); + /* reserve space for PIP_CMD_SET_HVA in the end */ + pic->pie_read_max -= sizeof(uint64_t) + 1; + + /* + * Align with PAGE_IDLE_KBUF_FULL + * logic in pic_add_page(), to avoid pic->pie_read = 0 when + * PAGE_IDLE_KBUF_FULL happened. 
+ */ + if (pic->pie_read_max <= sizeof(uint64_t) + 2) + return PAGE_IDLE_KBUF_FULL; + + memset(pic->kpie, 0, sizeof(pic->kpie)); + return 0; +} + +static void setup_page_idle_ctrl(struct page_idle_ctrl *pic, void *buf, + int buf_size, unsigned int flags) +{ + pic->buf = buf; + pic->buf_size = buf_size; + pic->bytes_copied = 0; + pic->next_hva = 0; + pic->gpa_to_hva = 0; + pic->restart_gpa = 0; + pic->last_va = 0; + pic->flags = flags; +} + +static int page_idle_copy_user(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end) +{ + int bytes_read; + int lc = 0; /* last copy? */ + int ret; + + dump_pic(pic); + + /* Break out of loop on no more progress. */ + if (!pic->pie_read) { + lc = 1; + if (start < end) + start = end; + } + + if (start >= end && start > pic->next_hva) { + set_next_hva(start, "TAIL-HOLE"); + pic_report_addr(pic, start); + } + + bytes_read = pic->pie_read; + if (!bytes_read) + return 1; + + ret = copy_to_user(pic->buf, pic->kpie, bytes_read); + if (ret) + return -EFAULT; + + pic->buf += bytes_read; + pic->bytes_copied += bytes_read; + if (pic->bytes_copied >= pic->buf_size) + return PAGE_IDLE_BUF_FULL; + if (lc) + return lc; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + cond_resched(); + return 0; +} + +#ifdef CONFIG_X86_64 +static int ept_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (KVM_CHECK_INVALID_SPTE(pte->pte)) { + page_type = PTE_IDLE; + } else if (!ept_pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + if (pic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) &pte->pte)) + page_type = PTE_DIRTY_M; + } + } + + err = pic_add_page(pic, addr, addr + 
PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + + +static int ept_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) + page_type = PMD_IDLE; + else if (!ept_pmd_present(*pmd)) + page_type = PMD_HOLE; /* likely won't hit here */ + else if (!pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else { + page_type = PMD_ACCESSED; + if ((pic->flags & SCAN_DIRTY_PAGE) && + test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) pmd)) + page_type = PMD_DIRTY_M; + } + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = ept_pte_range(pic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + + +static int ept_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (!ept_pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (pud_large(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = ept_pmd_range(pic, pud, addr, next); + + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int ept_p4d_range(struct page_idle_ctrl *pic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do 
{ + next = p4d_addr_end(addr, end); + if (!ept_p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = ept_pud_range(pic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + + +static int ept_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + pgd_t *ept_root; + pgd_t *pgd; + unsigned long next; + int err = 0; + + WARN_ON(addr >= end); + + spin_lock(&pic->kvm->mmu_lock); + + vcpu = kvm_get_vcpu(pic->kvm, 0); + if (!vcpu) { + spin_unlock(&pic->kvm->mmu_lock); + return -EINVAL; + } + + mmu = kvm_arch_mmu_pointer(vcpu); + if (!VALID_PAGE(mmu->root_hpa)) { + spin_unlock(&pic->kvm->mmu_lock); + return -EINVAL; + } + + ept_root = __va(mmu->root_hpa); + + spin_unlock(&pic->kvm->mmu_lock); + local_irq_disable(); + pgd = pgd_offset_pgd(ept_root, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = ept_p4d_range(pic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + local_irq_enable(); + return err; +} + +static int ept_idle_supports_cpu(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + int ret; + + vcpu = kvm_get_vcpu(kvm, 0); + if (!vcpu) + return -EINVAL; + + spin_lock(&kvm->mmu_lock); + mmu = kvm_arch_mmu_pointer(vcpu); + if (kvm_mmu_ad_disabled(mmu)) { + pr_notice("CPU does not support EPT A/D bits tracking\n"); + ret = -EINVAL; + } else if (mmu->shadow_root_level != 4 + (!!pgtable_l5_enabled())) { + pr_notice("Unsupported EPT level %d\n", mmu->shadow_root_level); + ret = -EINVAL; + } else + ret = 0; + spin_unlock(&kvm->mmu_lock); + + return ret; +} + +#else +static int arm_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); 
+ do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else + page_type = PTE_ACCESSED; + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static int arm_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!if_pmd_thp_or_huge(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = arm_pte_range(pic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int arm_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (if_pud_huge(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = arm_pmd_range(pic, pud, addr, next); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int arm_p4d_range(struct page_idle_ctrl *pic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { 
+ next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = arm_pud_range(pic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int arm_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + struct kvm *kvm = pic->kvm; + int err = 0; + + WARN_ON(addr >= end); + + spin_lock(&pic->kvm->mmu_lock); + pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); + spin_unlock(&pic->kvm->mmu_lock); + + local_irq_disable(); + do { + next = stage2_pgd_addr_end(kvm, addr, end); + if (!pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = arm_p4d_range(pic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + local_irq_enable(); + return err; +} +#endif + +/* + * Depending on whether hva falls in a memslot: + * + * 1) found => return gpa and remaining memslot size in *addr_range + * + * |<----- addr_range --------->| + * [ mem slot ] + * ^hva + * + * 2) not found => return hole size in *addr_range + * + * |<----- addr_range --------->| + * [first mem slot above hva ] + * ^hva + * + * If hva is above all mem slots, *addr_range will be ~0UL. + * We can finish read(2). 
+ */ +static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, + unsigned long hva, + unsigned long *addr_range) +{ + struct kvm *kvm = pic->kvm; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + unsigned long hva_end; + gfn_t gfn; + + *addr_range = ~0UL; + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(pic->kvm); + kvm_for_each_memslot(memslot, slots) { + hva_end = memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT); + + if (hva >= memslot->userspace_addr && hva < hva_end) { + gpa_t gpa; + + gfn = hva_to_gfn_memslot(hva, memslot); + *addr_range = hva_end - hva; + gpa = gfn_to_gpa(gfn); + mutex_unlock(&kvm->slots_lock); + return gpa; + } + + if (memslot->userspace_addr > hva) + *addr_range = min(*addr_range, + memslot->userspace_addr - hva); + } + mutex_unlock(&kvm->slots_lock); + return INVALID_PAGE; +} + +static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end) +{ + unsigned long gpa_addr; + unsigned long addr_range; + unsigned long va_end; + int ret; + +#ifdef CONFIG_X86_64 + ret = ept_idle_supports_cpu(pic->kvm); + if (ret) + return ret; +#endif + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + gpa_addr = vm_idle_find_gpa(pic, start, &addr_range); + + if (gpa_addr == INVALID_PAGE) { + pic->gpa_to_hva = 0; + if (addr_range == ~0UL) { + set_restart_gpa(TASK_SIZE, "EOF"); + va_end = end; + } else { + start += addr_range; + set_restart_gpa(start, "OUT-OF-SLOT"); + va_end = start; + } + } else { + pic->gpa_to_hva = start - gpa_addr; +#ifdef CONFIG_ARM64 + arm_page_range(pic, gpa_addr, gpa_addr + addr_range); +#else + ept_page_range(pic, gpa_addr, gpa_addr + addr_range); +#endif + va_end = pic->gpa_to_hva + gpa_addr + addr_range; + } + + start = pic->restart_gpa + pic->gpa_to_hva; + ret = page_idle_copy_user(pic, start, va_end); + if (ret) + break; + } + + if (pic->bytes_copied) + ret = 0; + return ret; +} + +static ssize_t 
vm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct page_idle_ctrl *pic; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + int ret; + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + pic->kvm = mm_kvm(mm); + + ret = vm_idle_walk_hva_range(pic, hva_start, hva_end); + if (ret) + goto out_kvm; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_kvm: + return ret; + +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + +static ssize_t page_scan_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + + if ((hva_start >= TASK_SIZE) || (hva_end >= TASK_SIZE)) { + debug_printk("page_idle_read past TASK_SIZE: %pK %pK %lx\n", + hva_start, hva_end, TASK_SIZE); + return 0; + } + if (hva_end <= hva_start) { + debug_printk("page_idle_read past EOF: %pK %pK\n", + hva_start, hva_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("page_idle_read unaligned ppos: %pK\n", + hva_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("page_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + if (!mm_kvm(mm)) + return mm_idle_read(file, buf, count, ppos); + + return vm_idle_read(file, buf, count, ppos); +} + +static int page_scan_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int page_scan_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + struct kvm *kvm; + int ret = 0; + + if (!mm) { + ret = -EBADF; + goto out; + } + + kvm = mm_kvm(mm); + if (!kvm) { + ret = -EINVAL; + 
goto out; + } +#ifdef CONFIG_X86_64 + spin_lock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); +#endif + +out: + module_put(THIS_MODULE); + return ret; +} + +static int mm_idle_pmd_large(pmd_t pmd) +{ +#ifdef CONFIG_ARM64 + return if_pmd_thp_or_huge(pmd); +#else + return pmd_large(pmd); +#endif +} + +static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, + unsigned long addr, unsigned long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* + * Skip duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, + * walk_page_range() can call on the same PMD twice. 
+ */ + if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK)) { + debug_printk("ignore duplicate addr %pK %pK\n", + addr, pic->last_va); + return 0; + } + pic->last_va = addr; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!mm_idle_pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = mm_idle_pte_range(pic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + if ((addr & PUD_MASK) != (pic->last_va & PUD_MASK)) { + pic_add_page(pic, addr, next, PUD_PRESENT); + pic->last_va = addr; + } + spin_unlock(ptl); + return 1; + } + + return 0; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + if (vma->vm_file) { + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct page_idle_ctrl *pic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret = 0; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + down_read(&walk->mm->mmap_lock); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = walk_page_range(walk->mm, start, end, + walk->ops, walk->private); + local_irq_enable(); + } else + set_restart_gpa(vma->vm_start, "VMA-HOLE"); + } else + set_restart_gpa(TASK_SIZE, "EOF"); + 
up_read(&walk->mm->mmap_lock); + + WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); + start = pic->restart_gpa; + ret = page_idle_copy_user(pic, start, end); + if (ret) + break; + } + + if (pic->bytes_copied) { + if (ret != PAGE_IDLE_BUF_FULL && pic->next_hva < end) + debug_printk("partial scan: next_hva=%pK end=%pK\n", + pic->next_hva, end); + ret = 0; + } else + WARN_ONCE(1, "nothing read"); + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk_ops *mm_walk_ops = NULL; + struct mm_walk mm_walk = {}; + struct page_idle_ctrl *pic; + unsigned long va_start = *ppos; + unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (va_end <= va_start) { + debug_printk("%s past EOF: %pK %pK\n", + __func__, va_start, va_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("%s unaligned ppos: %pK\n", + __func__, va_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("%s small count: %lx\n", + __func__, (unsigned long)count); + return -EINVAL; + } + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + mm_walk_ops = kzalloc(sizeof(struct mm_walk_ops), GFP_KERNEL); + if (!mm_walk_ops) { + kfree(pic); + return -ENOMEM; + } + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + + mm_walk_ops->pmd_entry = mm_idle_pmd_entry; + mm_walk_ops->pud_entry = mm_idle_pud_entry; + mm_walk_ops->test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = mm_walk_ops; + mm_walk.private = pic; + mm_walk.pgd = NULL; + mm_walk.no_vma = false; + ret = mm_idle_walk_range(pic, va_start, va_end, &mm_walk); + if (ret) + goto out_free; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_free: + kfree(pic); + kfree(mm_walk_ops); + return ret; +} + +extern struct file_operations proc_page_scan_operations; + +static int page_scan_entry(void) +{ + proc_page_scan_operations.owner 
= THIS_MODULE; + proc_page_scan_operations.read = page_scan_read; + proc_page_scan_operations.open = page_scan_open; + proc_page_scan_operations.release = page_scan_release; + return 0; +} + +static void page_scan_exit(void) +{ + memset(&proc_page_scan_operations, 0, + sizeof(proc_page_scan_operations)); +} + +MODULE_LICENSE("GPL"); +module_init(page_scan_entry); +module_exit(page_scan_exit); diff --git a/fs/proc/memig_scan.h b/fs/proc/memig_scan.h new file mode 100644 index 000000000000..305739f92eef --- /dev/null +++ b/fs/proc/memig_scan.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PAGE_IDLE_H +#define _PAGE_IDLE_H + +#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ +#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ +#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ + +enum ProcIdlePageType { + PTE_ACCESSED, /* 4k page */ + PMD_ACCESSED, /* 2M page */ + PUD_PRESENT, /* 1G page */ + + PTE_DIRTY_M, + PMD_DIRTY_M, + + PTE_IDLE, + PMD_IDLE, + PMD_IDLE_PTES, /* all PTE idle */ + + PTE_HOLE, + PMD_HOLE, + + PIP_CMD, + + IDLE_PAGE_TYPE_MAX +}; + +#define PIP_TYPE(a) (0xf & (a >> 4)) +#define PIP_SIZE(a) (0xf & a) +#define PIP_COMPOSE(type, nr) ((type << 4) | nr) + +#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0) + +#ifndef INVALID_PAGE +#define INVALID_PAGE ~0UL +#endif + +#ifdef CONFIG_ARM64 +#define _PAGE_MM_BIT_ACCESSED 10 +#else +#define _PAGE_MM_BIT_ACCESSED _PAGE_BIT_ACCESSED +#endif + +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_EPT_ACCESSED 8 +#define _PAGE_BIT_EPT_DIRTY 9 +#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED) +#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY) + +#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7)) + +static inline int ept_pte_present(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pmd_present(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pud_present(pud_t a) +{ + return 
pud_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_p4d_present(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pgd_present(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pte_accessed(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pmd_accessed(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pud_accessed(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_p4d_accessed(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pgd_accessed(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_ACCESSED; +} +#endif + +extern struct file_operations proc_page_scan_operations; + +#define PAGE_IDLE_KBUF_FULL 1 +#define PAGE_IDLE_BUF_FULL 2 +#define PAGE_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3) + +#define PAGE_IDLE_KBUF_SIZE 8000 + +struct page_idle_ctrl { + struct mm_struct *mm; + struct kvm *kvm; + + uint8_t kpie[PAGE_IDLE_KBUF_SIZE]; + int pie_read; + int pie_read_max; + + void __user *buf; + int buf_size; + int bytes_copied; + + unsigned long next_hva; /* GPA for EPT; VA for PT */ + unsigned long gpa_to_hva; + unsigned long restart_gpa; + unsigned long last_va; + + unsigned int flags; +}; + +#endif diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 420510f6a545..8fdd14fb8ff9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1770,6 +1770,72 @@ const struct file_operations proc_pagemap_operations = { .open = pagemap_open, .release = pagemap_release, }; + +/* will be filled when kvm_ept_idle module loads */ +struct file_operations proc_page_scan_operations = { +}; +EXPORT_SYMBOL_GPL(proc_page_scan_operations); + +static ssize_t mm_idle_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + return ret; + } + if 
(proc_page_scan_operations.read) + ret = proc_page_scan_operations.read(file, buf, count, ppos); + + mmput(mm); + return ret; +} + +static int mm_idle_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + + if (proc_page_scan_operations.open) + return proc_page_scan_operations.open(inode, file); + + return 0; +} + +static int mm_idle_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) { + if (!mm_kvm(mm)) + flush_tlb_mm(mm); + mmdrop(mm); + } + + if (proc_page_scan_operations.release) + return proc_page_scan_operations.release(inode, file); + + return 0; +} + +const struct file_operations proc_mm_idle_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = mm_idle_read, + .open = mm_idle_open, + .release = mm_idle_release, +}; + + #endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cac73ccf7367..98a13fb411bf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -35,7 +35,7 @@
struct address_space; struct mem_cgroup; - +struct kvm; /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -849,6 +849,9 @@ struct mm_struct { #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_GMEM gm_as_t *gm_as; +#endif +#if IS_ENABLED(CONFIG_KVM) + struct kvm *kvm; #endif } __randomize_layout;
@@ -863,6 +866,18 @@ struct mm_struct { MT_FLAGS_USE_RCU) extern struct mm_struct init_mm;
+#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return mm->kvm; +} +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return NULL; +} +#endif + /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { diff --git a/lib/Kconfig b/lib/Kconfig index 5c2da561c516..d6ab862f7040 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -728,6 +728,12 @@ config PARMAN config OBJAGG tristate "objagg" if COMPILE_TEST
+config MEMIG_SCAN_MODULE + tristate "module: memig page scan for memig support" + help + memig page scan feature + used to scan the virtual address of the target process + endmenu
config GENERIC_IOREMAP diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a55c88ba305d..7f90f8fb6b0c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2051,6 +2051,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) spin_unlock(ptl); return NULL; } +EXPORT_SYMBOL_GPL(__pud_trans_huge_lock);
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index cb23f8a15c13..0d39aebb432e 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -481,6 +481,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } while (start = next, start < end); return err; } +EXPORT_SYMBOL_GPL(walk_page_range);
/** * walk_page_range_novma - walk a range of pagetables not backed by a vma diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 65f94f592ff8..905da44be082 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1297,6 +1297,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); +#if IS_ENABLED(CONFIG_KVM) + mm->kvm = NULL; +#endif kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); @@ -5054,6 +5057,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) * cases it will be called by the final fput(file) and will take * care of doing kvm_put_kvm(kvm). */ +#if IS_ENABLED(CONFIG_KVM) + kvm->mm->kvm = kvm; +#endif kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
fd_install(fd, file);
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q Reference: https://gitee.com/openeuler/kernel/commit/8a655676e63680d2c02acf479f4cfaa923...
-------------------------------------------------
reason: This patch adds the memig swap feature to the openEuler system. memig_swap.ko takes the virtual addresses passed in from user space and swaps out (reclaims) the corresponding pages.
Signed-off-by: yanxiaodan yanxiaodan@huawei.com Signed-off-by: linmiaohe linmiaohe@huawei.com Signed-off-by: louhongxiang louhongxiang@huawei.com Signed-off-by: liubo liubo254@huawei.com Signed-off-by: geruijun geruijun@huawei.com Signed-off-by: liangchenshu liangchenshu@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + fs/proc/Makefile | 1 + fs/proc/base.c | 2 + fs/proc/internal.h | 1 + fs/proc/memig_swap.c | 102 +++++++++++++++++++++++++ fs/proc/task_mmu.c | 51 +++++++++++++ include/linux/swap.h | 4 +- lib/Kconfig | 5 ++ mm/vmscan.c | 57 ++++++++++++++ 10 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 fs/proc/memig_swap.c
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index fc6af3dbf1c8..d526512c1473 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7811,6 +7811,7 @@ CONFIG_ARCH_HAS_KCOV=y # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y CONFIG_MEMIG_SCAN_MODULE=m +CONFIG_MEMIG_SWAP_MODULE=m # CONFIG_MEMTEST is not set # end of Kernel Testing and Coverage
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 6fb2f9270251..35190a4cee4f 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -9047,6 +9047,7 @@ CONFIG_ARCH_HAS_KCOV=y # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y CONFIG_MEMIG_SCAN_MODULE=m +CONFIG_MEMIG_SWAP_MODULE=m # CONFIG_MEMTEST is not set # CONFIG_HYPERV_TESTING is not set # end of Kernel Testing and Coverage diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 50c6de6f4979..e6747114a75b 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -35,3 +35,4 @@ proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o obj-$(CONFIG_MEMIG_SCAN_MODULE) += memig_scan.o +obj-$(CONFIG_MEMIG_SWAP_MODULE) += memig_swap.o diff --git a/fs/proc/base.c b/fs/proc/base.c index ac108995d68b..c5fbe0815614 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3354,6 +3354,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), + REG("swap_pages", S_IWUSR, proc_mm_swap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -3703,6 +3704,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), + REG("swap_pages", S_IWUSR, proc_mm_swap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 09867fbcbafc..0e6bf977ba23 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -306,6 +306,7 @@ extern const 
struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_mm_idle_operations; +extern const struct file_operations proc_mm_swap_operations;
extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/memig_swap.c b/fs/proc/memig_swap.c new file mode 100644 index 000000000000..b24c706c3b2a --- /dev/null +++ b/fs/proc/memig_swap.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/proc_fs.h> +#include <linux/sched/mm.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/mempolicy.h> +#include <linux/uaccess.h> +#include <linux/delay.h> + +static ssize_t swap_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char *p, *data, *data_ptr_res; + unsigned long vaddr; + struct mm_struct *mm = file->private_data; + struct page *page; + LIST_HEAD(pagelist); + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + goto out; + } + + if (count < 0) { + ret = -EOPNOTSUPP; + goto out_mm; + } + + data = memdup_user_nul(buf, count); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto out_mm; + } + + data_ptr_res = data; + while ((p = strsep(&data, "\n")) != NULL) { + if (!*p) + continue; + + ret = kstrtoul(p, 16, &vaddr); + if (ret != 0) + continue; + /*If get page struct failed, ignore it, get next page*/ + page = get_page_from_vaddr(mm, vaddr); + if (!page) + continue; + + add_page_for_swap(page, &pagelist); + } + + if (!list_empty(&pagelist)) + reclaim_pages(&pagelist); + + ret = count; + kfree(data_ptr_res); +out_mm: + mmput(mm); +out: + return ret; +} + +static int swap_pages_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int swap_pages_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + + +extern struct file_operations proc_swap_pages_operations; + +static int swap_pages_entry(void) +{ + proc_swap_pages_operations.owner = THIS_MODULE; + 
proc_swap_pages_operations.write = swap_pages_write; + proc_swap_pages_operations.open = swap_pages_open; + proc_swap_pages_operations.release = swap_pages_release; + + return 0; +} + +static void swap_pages_exit(void) +{ + memset(&proc_swap_pages_operations, 0, + sizeof(proc_swap_pages_operations)); +} + +MODULE_LICENSE("GPL"); +module_init(swap_pages_entry); +module_exit(swap_pages_exit); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8fdd14fb8ff9..f771df409978 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1835,7 +1835,58 @@ const struct file_operations proc_mm_idle_operations = { .release = mm_idle_release, };
+/*swap pages*/ +struct file_operations proc_swap_pages_operations = { +}; +EXPORT_SYMBOL_GPL(proc_swap_pages_operations); + +static ssize_t mm_swap_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (proc_swap_pages_operations.write) + return proc_swap_pages_operations.write(file, buf, count, ppos); + + return -1; +} + +static int mm_swap_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + + if (proc_swap_pages_operations.open) + return proc_swap_pages_operations.open(inode, file); + + return 0; +} + +static int mm_swap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data;
+ if (mm) + mmdrop(mm); + + if (proc_swap_pages_operations.release) + return proc_swap_pages_operations.release(inode, file); + + return 0; +} + +const struct file_operations proc_mm_swap_operations = { + .llseek = mem_lseek, + .write = mm_swap_write, + .open = mm_swap_open, + .release = mm_swap_release, +}; #endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA diff --git a/include/linux/swap.h b/include/linux/swap.h index 3c69cb653cb9..96666ca67a15 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -441,7 +441,9 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); - +extern int add_page_for_swap(struct page *page, struct list_head *pagelist); +extern struct page *get_page_from_vaddr(struct mm_struct *mm, + unsigned long vaddr); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; diff --git a/lib/Kconfig b/lib/Kconfig index d6ab862f7040..b341e880adf3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -734,6 +734,11 @@ config MEMIG_SCAN_MODULE memig page scan feature used to scan the virtual address of the target process
+config MEMIG_SWAP_MODULE + tristate "module: memig page swap for memig support" + help + memig page swap feature + endmenu
config GENERIC_IOREMAP diff --git a/mm/vmscan.c b/mm/vmscan.c index 5bf98d0a22c9..9287be69e468 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,6 +33,7 @@ #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/mempolicy.h> #include <linux/compaction.h> #include <linux/notifier.h> #include <linux/rwsem.h> @@ -2814,6 +2815,7 @@ unsigned long reclaim_pages(struct list_head *folio_list)
return nr_reclaimed; } +EXPORT_SYMBOL_GPL(reclaim_pages);
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc)
@@ -8105,3 +8107,73 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+
+/*
+ * Hand @page over for swap-out by queueing it on @pagelist.
+ *
+ * The caller holds one reference on @page; it is dropped here on every path,
+ * including the -EACCES rejections, so that a skipped page is not left with
+ * an extra reference.
+ *
+ * Returns -EACCES for a page with page_mapcount() > 1 or a hugetlb page, a
+ * non-zero isolate_lru_page() result unchanged, and 0 otherwise.
+ */
+int add_page_for_swap(struct page *page, struct list_head *pagelist)
+{
+	struct page *head;
+	int err;
+
+	/* Skip pages with more than one mapping and hugetlb pages. */
+	if (page_mapcount(page) > 1 || PageHuge(page)) {
+		put_page(page);
+		return -EACCES;
+	}
+
+	head = compound_head(page);
+	err = isolate_lru_page(head);
+	/* Caller's reference is released whether or not isolation worked. */
+	put_page(page);
+	if (err)
+		return err;
+
+	if (PageUnevictable(page))
+		putback_lru_page(page);
+	else
+		list_add_tail(&head->lru, pagelist);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(add_page_for_swap);
+
+/*
+ * Look up the page mapped at @vaddr in @mm, under mmap_lock held for read.
+ *
+ * Returns NULL when no VMA covers @vaddr, the VMA fails vma_migratable(), or
+ * follow_page() yields NULL or an error pointer. Otherwise the page is
+ * returned together with the reference requested through FOLL_GET.
+ */
+struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr)
+{
+	struct page *page;
+	struct vm_area_struct *vma;
+	unsigned int follflags;
+
+	down_read(&mm->mmap_lock);
+
+	vma = find_vma(mm, vaddr);
+	if (!vma || vaddr < vma->vm_start || !vma_migratable(vma)) {
+		up_read(&mm->mmap_lock);
+		return NULL;
+	}
+
+	follflags = FOLL_GET | FOLL_DUMP;
+	page = follow_page(vma, vaddr, follflags);
+	if (IS_ERR(page) || !page) {
+		up_read(&mm->mmap_lock);
+		return NULL;
+	}
+
+	up_read(&mm->mmap_lock);
+	return page;
+}
+EXPORT_SYMBOL_GPL(get_page_from_vaddr);
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q Reference: https://gitee.com/openeuler/kernel/commit/74706f34e144e1bf95914d2e36eb29d40f...
-------------------------------------------------
Fix a compile error that occurs when CONFIG_NUMA is turned off on the Raspberry Pi platform.
compile info: mm/vmscan.c: In function ‘get_page_from_vaddr’: mm/vmscan.c:4350:40: error: implicit declaration of function ‘vma_migratable’; did you mean ‘rq_mergeable’? [-Werror=implicit-function-declaration]
This patch fixes the compilation error introduced by the memig interface in mm/vmscan.c. It removes the dependency on NUMA for CONFIG_MEMIG_SWAP by replacing the vma_migratable() check with a test of vma->vm_flags & VM_LOCKED.
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 4 ++-- arch/x86/configs/openeuler_defconfig | 4 ++-- fs/proc/Makefile | 4 ++-- lib/Kconfig | 4 ++-- mm/vmscan.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d526512c1473..233e736bf6f1 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7810,8 +7810,8 @@ CONFIG_FUNCTION_ERROR_INJECTION=y CONFIG_ARCH_HAS_KCOV=y # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y -CONFIG_MEMIG_SCAN_MODULE=m -CONFIG_MEMIG_SWAP_MODULE=m +CONFIG_MEMIG_SCAN=m +CONFIG_MEMIG_SWAP=m # CONFIG_MEMTEST is not set # end of Kernel Testing and Coverage
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 35190a4cee4f..f2ae112791cb 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -9046,8 +9046,8 @@ CONFIG_ARCH_HAS_KCOV=y # CONFIG_KCOV is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y -CONFIG_MEMIG_SCAN_MODULE=m -CONFIG_MEMIG_SWAP_MODULE=m +CONFIG_MEMIG_SCAN=m +CONFIG_MEMIG_SWAP=m # CONFIG_MEMTEST is not set # CONFIG_HYPERV_TESTING is not set # end of Kernel Testing and Coverage diff --git a/fs/proc/Makefile b/fs/proc/Makefile index e6747114a75b..1da3ddf423c0 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -34,5 +34,5 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o -obj-$(CONFIG_MEMIG_SCAN_MODULE) += memig_scan.o -obj-$(CONFIG_MEMIG_SWAP_MODULE) += memig_swap.o +obj-$(CONFIG_MEMIG_SCAN) += memig_scan.o +obj-$(CONFIG_MEMIG_SWAP) += memig_swap.o diff --git a/lib/Kconfig b/lib/Kconfig index b341e880adf3..041103e5519c 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -728,13 +728,13 @@ config PARMAN config OBJAGG tristate "objagg" if COMPILE_TEST
-config MEMIG_SCAN_MODULE +config MEMIG_SCAN tristate "module: memig page scan for memig support" help memig page scan feature used to scan the virtual address of the target process
-config MEMIG_SWAP_MODULE +config MEMIG_SWAP tristate "module: memig page swap for memig support" help memig page swap feature diff --git a/mm/vmscan.c b/mm/vmscan.c index 9287be69e468..ba95fe8afff7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -8146,7 +8146,7 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) down_read(&mm->mmap_lock);
vma = find_vma(mm, vaddr); - if (!vma || vaddr < vma->vm_start || !vma_migratable(vma)) { + if (!vma || vaddr < vma->vm_start || vma->vm_flags & VM_LOCKED) { up_read(&mm->mmap_lock); return NULL; }
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q Reference: https://gitee.com/openeuler/kernel/commit/aa7f1d222cdab88f12e6d889437fed6571...
-------------------------------------------------
The memory extension feature was originally named memig; after it was open-sourced, the feature name was changed to etmem.
This patch updates the feature name and the file names to match.
The config options and the file names are renamed to etmem at the same time.
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: geruijun geruijun@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 4 ++-- arch/x86/configs/openeuler_defconfig | 4 ++-- fs/proc/Makefile | 4 ++-- fs/proc/{memig_scan.c => etmem_scan.c} | 2 +- fs/proc/{memig_scan.h => etmem_scan.h} | 0 fs/proc/{memig_swap.c => etmem_swap.c} | 0 lib/Kconfig | 12 ++++++------ 7 files changed, 13 insertions(+), 13 deletions(-) rename fs/proc/{memig_scan.c => etmem_scan.c} (99%) rename fs/proc/{memig_scan.h => etmem_scan.h} (100%) rename fs/proc/{memig_swap.c => etmem_swap.c} (100%)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 233e736bf6f1..16cad3fb2744 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7810,8 +7810,8 @@ CONFIG_FUNCTION_ERROR_INJECTION=y CONFIG_ARCH_HAS_KCOV=y # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y -CONFIG_MEMIG_SCAN=m -CONFIG_MEMIG_SWAP=m +CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM_SWAP=m # CONFIG_MEMTEST is not set # end of Kernel Testing and Coverage
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index f2ae112791cb..4f9945502961 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -9046,8 +9046,8 @@ CONFIG_ARCH_HAS_KCOV=y # CONFIG_KCOV is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y -CONFIG_MEMIG_SCAN=m -CONFIG_MEMIG_SWAP=m +CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM_SWAP=m # CONFIG_MEMTEST is not set # CONFIG_HYPERV_TESTING is not set # end of Kernel Testing and Coverage diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 1da3ddf423c0..8704d41dd67c 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -34,5 +34,5 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o -obj-$(CONFIG_MEMIG_SCAN) += memig_scan.o -obj-$(CONFIG_MEMIG_SWAP) += memig_swap.o +obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o +obj-$(CONFIG_ETMEM_SWAP) += etmem_swap.o diff --git a/fs/proc/memig_scan.c b/fs/proc/etmem_scan.c similarity index 99% rename from fs/proc/memig_scan.c rename to fs/proc/etmem_scan.c index 3964e7652127..8acf71c29958 100644 --- a/fs/proc/memig_scan.c +++ b/fs/proc/etmem_scan.c @@ -27,7 +27,7 @@ #include <asm/kvm_arm.h> #include <asm/stage2_pgtable.h> #endif -#include "memig_scan.h" +#include "etmem_scan.h"
#ifdef CONFIG_X86_64 /* diff --git a/fs/proc/memig_scan.h b/fs/proc/etmem_scan.h similarity index 100% rename from fs/proc/memig_scan.h rename to fs/proc/etmem_scan.h diff --git a/fs/proc/memig_swap.c b/fs/proc/etmem_swap.c similarity index 100% rename from fs/proc/memig_swap.c rename to fs/proc/etmem_swap.c diff --git a/lib/Kconfig b/lib/Kconfig index 041103e5519c..053ab2c4366b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -728,16 +728,16 @@ config PARMAN config OBJAGG tristate "objagg" if COMPILE_TEST
-config MEMIG_SCAN - tristate "module: memig page scan for memig support" +config ETMEM_SCAN + tristate "module: etmem page scan for etmem support" help - memig page scan feature + etmem page scan feature used to scan the virtual address of the target process
-config MEMIG_SWAP - tristate "module: memig page swap for memig support" +config ETMEM_SWAP + tristate "module: etmem page swap for etmem support" help - memig page swap feature + etmem page swap feature
endmenu
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q Reference: https://gitee.com/openeuler/kernel/commit/d7b8dcbecdc40d98d20812176f57d7f72f...
-------------------------------------------------
The original etmem feature failed to compile on some architectures, for example powerpc, because no architecture dependency was specified.
This patch moves the ETMEM feature CONFIG options to mm/Kconfig and adds the architecture dependency.
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: jingxiangfeng 00447129 jingxiangfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- lib/Kconfig | 11 ----------- mm/Kconfig | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/lib/Kconfig b/lib/Kconfig index 053ab2c4366b..5c2da561c516 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -728,17 +728,6 @@ config PARMAN config OBJAGG tristate "objagg" if COMPILE_TEST
-config ETMEM_SCAN - tristate "module: etmem page scan for etmem support" - help - etmem page scan feature - used to scan the virtual address of the target process - -config ETMEM_SWAP - tristate "module: etmem page swap for etmem support" - help - etmem page swap feature - endmenu
config GENERIC_IOREMAP diff --git a/mm/Kconfig b/mm/Kconfig index b950407dd87f..ac02ca7bdec0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1215,6 +1215,21 @@ config GMEM help say Y here to enable gmem subsystem
+config ETMEM_SCAN + tristate "module: etmem page scan for etmem support" + depends on MMU + depends on X86 || ARM64 + help + etmem page scan feature + used to scan the virtual address of the target process + +config ETMEM_SWAP + tristate "module: etmem page swap for etmem support" + depends on MMU + depends on X86 || ARM64 + help + etmem page swap feature + source "mm/damon/Kconfig"
endmenu
euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/79246e864484015e1ef3c7ecec2d18a6b2...
-------------------------------------------------
The etmem_scan.ko module is used to scan the process memory.
The specific usage is as follows: The etmem user mode process issues scan commands through /proc/pid/idle_pages, and the etmem_scan module scans based on the issued address information.
Under certain circumstances the scan result may be empty. This is part of the normal logic flow and does not need to be logged through WARN_ONCE().
Therefore, replace WARN_ONCE() with debug_printk() for the "nothing read" case.
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 8acf71c29958..bb81ad9ca175 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -993,7 +993,7 @@ static int mm_idle_walk_range(struct page_idle_ctrl *pic, pic->next_hva, end); ret = 0; } else - WARN_ONCE(1, "nothing read"); + debug_printk("nothing read"); return ret; }
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q Reference: https://gitee.com/openeuler/kernel/commit/59abec29c6ca44301d034590d021d17cdd...
-------------------------------------------------
etmem, the memory vertical expansion technology, uses DRAM and high-performance storage new media to form multi-level memory storage.
The etmem feature was introduced in a previous commit (aa7f1d222cdab88f12e6d889437fed6571dec824), but only the config options for the etmem_swap and etmem_scan modules were added; the config option for the etmem feature itself was not. This commit therefore adds the CONFIG_ETMEM option for the etmem feature.
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/Kconfig | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index ac02ca7bdec0..2b69c1b3d8a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1217,18 +1217,28 @@ config GMEM
config ETMEM_SCAN tristate "module: etmem page scan for etmem support" - depends on MMU - depends on X86 || ARM64 + depends on ETMEM help etmem page scan feature used to scan the virtual address of the target process
config ETMEM_SWAP tristate "module: etmem page swap for etmem support" + depends on ETMEM + help + etmem page swap feature + +config ETMEM + bool "Enable etmem feature" depends on MMU depends on X86 || ARM64 + default n help - etmem page swap feature + etmem is a tiered memory extension technology that uses DRAM and memory + compression/high-performance storage media to form tiered memory storage. + Memory data is tiered, and cold data is migrated from memory media to + high-performance storage media to release memory space and reduce + memory costs.
source "mm/damon/Kconfig"
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/3f29b698f44088182d8306b5e3020cdca9...
--------------------------------
enable CONFIG_ETMEM by default.
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Chao Liu liuchao173@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 16cad3fb2744..9f79befaace5 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7812,6 +7812,7 @@ CONFIG_ARCH_HAS_KCOV=y CONFIG_ARCH_USE_MEMTEST=y CONFIG_ETMEM_SCAN=m CONFIG_ETMEM_SWAP=m +CONFIG_ETMEM=y # CONFIG_MEMTEST is not set # end of Kernel Testing and Coverage
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 4f9945502961..c69c0db09089 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -9048,6 +9048,7 @@ CONFIG_ARCH_HAS_KCOV=y CONFIG_ARCH_USE_MEMTEST=y CONFIG_ETMEM_SCAN=m CONFIG_ETMEM_SWAP=m +CONFIG_ETMEM=y # CONFIG_MEMTEST is not set # CONFIG_HYPERV_TESTING is not set # end of Kernel Testing and Coverage
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/b861c9edbec7478c5dbaafeeb4da0a8fe1...
-------------------------------------------------
support ioctl for etmem scan to set scan flag
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/task_mmu.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f771df409978..38439f46de3a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1828,11 +1828,20 @@ static int mm_idle_release(struct inode *inode, struct file *file) return 0; }
+static long mm_idle_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_page_scan_operations.unlocked_ioctl) + return proc_page_scan_operations.unlocked_ioctl(filp, cmd, arg); + + return 0; +} + const struct file_operations proc_mm_idle_operations = { .llseek = mem_lseek, /* borrow this */ .read = mm_idle_read, .open = mm_idle_open, .release = mm_idle_release, + .unlocked_ioctl = mm_idle_ioctl, };
/*swap pages*/
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/f0f2e730c851242713a3bf1e05a2317067...
-------------------------------------------------
1. Add a hugetlb_entry callback to report hugetlb pages. 2. Try to walk the host page table when an EPT entry is not present. 3. Add SCAN_AS_HUGE to report EPT pages at PMD level, as a host hugetlb page may be split into 4k EPT pages in the VM. 4. Add SCAN_IGN_HOST so that the user can ignore accesses from the host.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 217 ++++++++++++++++++++++++++++++++++++++----- fs/proc/etmem_scan.h | 14 +++ 2 files changed, 207 insertions(+), 24 deletions(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index bb81ad9ca175..8dbf186f8174 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -28,6 +28,7 @@ #include <asm/stage2_pgtable.h> #endif #include "etmem_scan.h" +#include <linux/hugetlb_inline.h>
#ifdef CONFIG_X86_64 /* @@ -289,8 +290,32 @@ static int page_idle_copy_user(struct page_idle_ctrl *pic, }
#ifdef CONFIG_X86_64 +static int vm_walk_host_range(unsigned long long start, + unsigned long end, + struct mm_walk *walk) +{ + int ret; + struct page_idle_ctrl *pic = walk->private; + unsigned long tmp_gpa_to_hva = pic->gpa_to_hva; + + pic->gpa_to_hva = 0; + local_irq_enable(); + down_read(&walk->mm->mmap_lock); + local_irq_disable(); + ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, + walk->ops, walk->private); + up_read(&walk->mm->mmap_lock); + pic->gpa_to_hva = tmp_gpa_to_hva; + if (pic->flags & VM_SCAN_HOST) { + pic->restart_gpa -= tmp_gpa_to_hva; + pic->flags &= ~VM_SCAN_HOST; + } + return ret; +} + static int ept_pte_range(struct page_idle_ctrl *pic, - pmd_t *pmd, unsigned long addr, unsigned long end) + pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) { pte_t *pte; enum ProcIdlePageType page_type; @@ -300,9 +325,10 @@ static int ept_pte_range(struct page_idle_ctrl *pic, do { if (KVM_CHECK_INVALID_SPTE(pte->pte)) { page_type = PTE_IDLE; - } else if (!ept_pte_present(*pte)) - page_type = PTE_HOLE; - else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + } else if (!ept_pte_present(*pte)) { + err = vm_walk_host_range(addr, end, walk); + goto next; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, (unsigned long *) &pte->pte)) page_type = PTE_IDLE; else { @@ -315,6 +341,7 @@ static int ept_pte_range(struct page_idle_ctrl *pic, }
err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); +next: if (err) break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -322,9 +349,30 @@ static int ept_pte_range(struct page_idle_ctrl *pic, return err; }
+static enum ProcIdlePageType ept_huge_accessed(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + int accessed = PMD_IDLE; + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!KVM_CHECK_INVALID_SPTE(pte->pte)) + continue; + if (!ept_pte_present(*pte)) + continue; + if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)&pte->pte)) + continue; + accessed = PMD_ACCESSED; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return accessed; +}
static int ept_pmd_range(struct page_idle_ctrl *pic, - pud_t *pud, unsigned long addr, unsigned long end) + pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) { pmd_t *pmd; unsigned long next; @@ -342,11 +390,15 @@ static int ept_pmd_range(struct page_idle_ctrl *pic, next = pmd_addr_end(addr, end); if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) page_type = PMD_IDLE; - else if (!ept_pmd_present(*pmd)) - page_type = PMD_HOLE; /* likely won't hit here */ - else if (!pmd_large(*pmd)) - page_type = pte_page_type; - else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + else if (!ept_pmd_present(*pmd)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } else if (!pmd_large(*pmd)) { + if (pic->flags & SCAN_AS_HUGE) + page_type = ept_huge_accessed(pmd, addr, next); + else + page_type = pte_page_type; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, (unsigned long *)pmd)) page_type = PMD_IDLE; else { @@ -360,7 +412,9 @@ static int ept_pmd_range(struct page_idle_ctrl *pic, if (page_type != IDLE_PAGE_TYPE_MAX) err = pic_add_page(pic, addr, next, page_type); else - err = ept_pte_range(pic, pmd, addr, next); + err = ept_pte_range(pic, pmd, addr, next, walk); + +next: if (err) break; } while (pmd++, addr = next, addr != end); @@ -370,7 +424,8 @@ static int ept_pmd_range(struct page_idle_ctrl *pic,
static int ept_pud_range(struct page_idle_ctrl *pic, - p4d_t *p4d, unsigned long addr, unsigned long end) + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) { pud_t *pud; unsigned long next; @@ -381,15 +436,16 @@ static int ept_pud_range(struct page_idle_ctrl *pic, next = pud_addr_end(addr, end);
if (!ept_pud_present(*pud)) { - set_restart_gpa(next, "PUD_HOLE"); - continue; + err = vm_walk_host_range(addr, next, walk); + goto next; }
if (pud_large(*pud)) err = pic_add_page(pic, addr, next, PUD_PRESENT); else - err = ept_pmd_range(pic, pud, addr, next); + err = ept_pmd_range(pic, pud, addr, next, walk);
+next: if (err) break; } while (pud++, addr = next, addr != end); @@ -398,7 +454,8 @@ static int ept_pud_range(struct page_idle_ctrl *pic, }
static int ept_p4d_range(struct page_idle_ctrl *pic, - pgd_t *pgd, unsigned long addr, unsigned long end) + pgd_t *pgd, unsigned long addr, unsigned long end, + struct mm_walk *walk) { p4d_t *p4d; unsigned long next; @@ -412,7 +469,7 @@ static int ept_p4d_range(struct page_idle_ctrl *pic, continue; }
- err = ept_pud_range(pic, p4d, addr, next); + err = ept_pud_range(pic, p4d, addr, next, walk); if (err) break; } while (p4d++, addr = next, addr != end); @@ -420,10 +477,10 @@ static int ept_p4d_range(struct page_idle_ctrl *pic, return err; }
- static int ept_page_range(struct page_idle_ctrl *pic, unsigned long addr, - unsigned long end) + unsigned long end, + struct mm_walk *walk) { struct kvm_vcpu *vcpu; struct kvm_mmu *mmu; @@ -460,7 +517,7 @@ static int ept_page_range(struct page_idle_ctrl *pic, continue; }
- err = ept_p4d_range(pic, pgd, addr, next); + err = ept_p4d_range(pic, pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); @@ -692,8 +749,44 @@ static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, return INVALID_PAGE; }
+static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); +static int vm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_hugetlb_entry(pte, hmask, addr, next, walk); +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pmd_entry(pmd, addr, next, walk); +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pud_entry(pud, addr, next, walk); +} + static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct mm_walk *walk) { unsigned long gpa_addr; unsigned long addr_range; @@ -728,7 +821,7 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, #ifdef CONFIG_ARM64 arm_page_range(pic, gpa_addr, gpa_addr + addr_range); #else - ept_page_range(pic, gpa_addr, gpa_addr + addr_range); + ept_page_range(pic, gpa_addr, gpa_addr + addr_range, walk); #endif va_end = pic->gpa_to_hva + gpa_addr + addr_range; } @@ -744,10 +837,14 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, return ret; }
+static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk); static ssize_t vm_idle_read(struct file *file, char *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; + struct mm_walk mm_walk = {}; + struct mm_walk_ops mm_walk_ops = {}; struct page_idle_ctrl *pic; unsigned long hva_start = *ppos; unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); @@ -760,7 +857,16 @@ static ssize_t vm_idle_read(struct file *file, char *buf, setup_page_idle_ctrl(pic, buf, count, file->f_flags); pic->kvm = mm_kvm(mm);
- ret = vm_idle_walk_hva_range(pic, hva_start, hva_end); + mm_walk_ops.pmd_entry = vm_idle_pmd_entry; + mm_walk_ops.pud_entry = vm_idle_pud_entry; + mm_walk_ops.hugetlb_entry = vm_idle_hugetlb_entry; + mm_walk_ops.test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = &mm_walk_ops; + mm_walk.private = pic; + + ret = vm_idle_walk_hva_range(pic, hva_start, hva_end, &mm_walk); if (ret) goto out_kvm;
@@ -863,6 +969,8 @@ static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, do { if (!pte_present(*pte)) page_type = PTE_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PTE_IDLE; else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, (unsigned long *) &pte->pte)) page_type = PTE_IDLE; @@ -878,6 +986,39 @@ static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, return err; }
+static inline unsigned long mask_to_size(unsigned long mask) +{ + return ~mask + 1; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + unsigned long start = addr & hmask; /* hugepage may be splited in vm */ + int ret; + + if (mask_to_size(hmask) == PUD_SIZE) { + page_type = PUD_PRESENT; + goto add_page; + } + + if (!pte_present(*pte)) + page_type = PMD_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, (unsigned long *)pte)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + +add_page: + ret = pic_add_page(pic, start, start + pagetype_size[page_type], page_type); + return ret; +} + static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -907,7 +1048,8 @@ static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, else if (!mm_idle_pmd_large(*pmd)) page_type = pte_page_type; else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, - (unsigned long *)pmd)) + (unsigned long *)pmd) || + pic->flags & SCAN_IGN_HOST) page_type = PMD_IDLE; else page_type = PMD_ACCESSED; @@ -945,6 +1087,8 @@ static int mm_idle_test_walk(unsigned long start, unsigned long end, struct vm_area_struct *vma = walk->vma;
if (vma->vm_file) { + if (is_vm_hugetlb_page(vma)) + return 0; if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) return 0; return 1; @@ -1038,6 +1182,7 @@ static ssize_t mm_idle_read(struct file *file, char *buf,
mm_walk_ops->pmd_entry = mm_idle_pmd_entry; mm_walk_ops->pud_entry = mm_idle_pud_entry; + mm_walk_ops->hugetlb_entry = mm_idle_hugetlb_entry; mm_walk_ops->test_walk = mm_idle_test_walk;
mm_walk.mm = mm; @@ -1057,6 +1202,29 @@ static ssize_t mm_idle_read(struct file *file, char *buf, return ret; }
+static long page_scan_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int flags; + + if (get_user(flags, (unsigned int __user *)argp)) + return -EFAULT; + flags &= ALL_SCAN_FLAGS; + + switch (cmd) { + case IDLE_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case IDLE_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + extern struct file_operations proc_page_scan_operations;
static int page_scan_entry(void) @@ -1065,6 +1233,7 @@ static int page_scan_entry(void) proc_page_scan_operations.read = page_scan_read; proc_page_scan_operations.open = page_scan_open; proc_page_scan_operations.release = page_scan_release; + proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl; return 0; }
diff --git a/fs/proc/etmem_scan.h b/fs/proc/etmem_scan.h index 305739f92eef..93a6e33f2025 100644 --- a/fs/proc/etmem_scan.h +++ b/fs/proc/etmem_scan.h @@ -2,10 +2,24 @@ #ifndef _PAGE_IDLE_H #define _PAGE_IDLE_H
+#include <linux/types.h> + #define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ #define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ #define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */
+/* define to not used file flags */ +#define SCAN_AS_HUGE 0100000000 /* treat normal page as hugepage in vm */ +#define SCAN_IGN_HOST 0200000000 /* ignore host access when scan vm */ +#define VM_SCAN_HOST 0400000000 /* scan and add host page for vm hole(internal) */ + +#define ALL_SCAN_FLAGS (SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | \ + SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST) + +#define IDLE_SCAN_MAGIC 0x66 +#define IDLE_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int) +#define IDLE_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int) + enum ProcIdlePageType { PTE_ACCESSED, /* 4k page */ PMD_ACCESSED, /* 2M page */
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/c3a569dba2e7f4b1eec9323027e2c18cb8...
-------------------------------------------------
Before this patch, etmem_scan fails if the vm and the host have different page table levels. This patch supports scanning a 4 level ept while 5 level paging is enabled in the host.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 56 +++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 19 deletions(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 8dbf186f8174..e08d60c3bc8a 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -454,14 +454,13 @@ static int ept_pud_range(struct page_idle_ctrl *pic, }
static int ept_p4d_range(struct page_idle_ctrl *pic, - pgd_t *pgd, unsigned long addr, unsigned long end, + p4d_t *p4d, unsigned long addr, unsigned long end, struct mm_walk *walk) { - p4d_t *p4d; unsigned long next; int err = 0;
- p4d = p4d_offset(pgd, addr); + p4d += p4d_index(addr); do { next = p4d_addr_end(addr, end); if (!ept_p4d_present(*p4d)) { @@ -477,6 +476,33 @@ static int ept_p4d_range(struct page_idle_ctrl *pic, return err; }
+static int ept_pgd_range(struct page_idle_ctrl *pic, + pgd_t *pgd, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + pgd = pgd_offset_pgd(pgd, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + err = ept_p4d_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + static int ept_page_range(struct page_idle_ctrl *pic, unsigned long addr, unsigned long end, @@ -484,9 +510,7 @@ static int ept_page_range(struct page_idle_ctrl *pic, { struct kvm_vcpu *vcpu; struct kvm_mmu *mmu; - pgd_t *ept_root; - pgd_t *pgd; - unsigned long next; + uint64_t *ept_root; int err = 0;
WARN_ON(addr >= end); @@ -509,18 +533,11 @@ static int ept_page_range(struct page_idle_ctrl *pic,
spin_unlock(&pic->kvm->mmu_lock); local_irq_disable(); - pgd = pgd_offset_pgd(ept_root, addr); - do { - next = pgd_addr_end(addr, end); - if (!ept_pgd_present(*pgd)) { - set_restart_gpa(next, "PGD_HOLE"); - continue; - } - - err = ept_p4d_range(pic, pgd, addr, next, walk); - if (err) - break; - } while (pgd++, addr = next, addr != end); + /* Walk start at p4d when vm has 4 level table pages */ + if (mmu->shadow_root_level != 4) + err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); + else + err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); local_irq_enable(); return err; } @@ -540,7 +557,8 @@ static int ept_idle_supports_cpu(struct kvm *kvm) if (kvm_mmu_ad_disabled(mmu)) { pr_notice("CPU does not support EPT A/D bits tracking\n"); ret = -EINVAL; - } else if (mmu->shadow_root_level != 4 + (!!pgtable_l5_enabled())) { + } else if (mmu->shadow_root_level < 4 || + (mmu->shadow_root_level == 5 && !pgtable_l5_enabled())) { pr_notice("Unsupported EPT level %d\n", mmu->shadow_root_level); ret = -EINVAL; } else
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/16af7936ee7836d37c39457250479845a1...
-------------------------------------------------
Free pic before returning from vm_idle_read in etmem scan.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index e08d60c3bc8a..85524e2802da 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -891,6 +891,7 @@ static ssize_t vm_idle_read(struct file *file, char *buf, ret = pic->bytes_copied; *ppos = pic->next_hva; out_kvm: + kfree(pic); return ret;
}
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/227739107be440efe8c58cda78554648e5...
-------------------------------------------------
The scan/swap modules and etmem access the exported file operations without protection. A kernel crash can be triggered as follows: 1. insert the scan/swap module. 2. etmem checks if the exported file operations are set. 3. remove the scan/swap module. 4. etmem calls the checked file operation. 5. kernel crash happens.
Fix this as follows: The scan/swap modules set and clear the operations with the lock held. Etmem in the kernel calls try_module_get with the lock held. Etmem calls the read/open/release/ioctl callbacks without the lock held, but with the module reference taken.
Another concurrent access situation is that opening idle_pages and swap_pages will succeed without the scan/swap module inserted. If the scan/swap module is inserted after open, subsequent calls of open/read/close will call the exported file operations set by scan/swap. This may also trigger a kernel crash as follows: 1. open idle_pages or swap_pages 2. modprobe the scan/swap module 3. close idle_pages or swap_pages (module_put is called without try_module_get) 4. modprobe -r the scan/swap module finds an invalid module reference count in the trace delete_module syscall->try_stop_module->try_release_module_ref and reports a BUG_ON for ret < 0.
Fix this by only returning a file successfully when the scan/swap module is inserted.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 12 ++++++-- fs/proc/etmem_swap.c | 20 +++++++++----- fs/proc/task_mmu.c | 66 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 85 insertions(+), 13 deletions(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 85524e2802da..981df9076d13 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -1248,18 +1248,26 @@ extern struct file_operations proc_page_scan_operations;
static int page_scan_entry(void) { + proc_page_scan_operations.flock(NULL, 1, NULL); proc_page_scan_operations.owner = THIS_MODULE; proc_page_scan_operations.read = page_scan_read; proc_page_scan_operations.open = page_scan_open; proc_page_scan_operations.release = page_scan_release; proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl; + proc_page_scan_operations.flock(NULL, 0, NULL); + return 0; }
static void page_scan_exit(void) { - memset(&proc_page_scan_operations, 0, - sizeof(proc_page_scan_operations)); + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = NULL; + proc_page_scan_operations.read = NULL; + proc_page_scan_operations.open = NULL; + proc_page_scan_operations.release = NULL; + proc_page_scan_operations.unlocked_ioctl = NULL; + proc_page_scan_operations.flock(NULL, 0, NULL); }
MODULE_LICENSE("GPL"); diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c index b24c706c3b2a..f9f796cfaf97 100644 --- a/fs/proc/etmem_swap.c +++ b/fs/proc/etmem_swap.c @@ -83,18 +83,24 @@ extern struct file_operations proc_swap_pages_operations;
static int swap_pages_entry(void) { - proc_swap_pages_operations.owner = THIS_MODULE; - proc_swap_pages_operations.write = swap_pages_write; - proc_swap_pages_operations.open = swap_pages_open; - proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = THIS_MODULE; + proc_swap_pages_operations.write = swap_pages_write; + proc_swap_pages_operations.open = swap_pages_open; + proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.flock(NULL, 0, NULL);
- return 0; + return 0; }
static void swap_pages_exit(void) { - memset(&proc_swap_pages_operations, 0, - sizeof(proc_swap_pages_operations)); + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = NULL; + proc_swap_pages_operations.write = NULL; + proc_swap_pages_operations.open = NULL; + proc_swap_pages_operations.release = NULL; + proc_swap_pages_operations.flock(NULL, 0, NULL); }
MODULE_LICENSE("GPL"); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 38439f46de3a..1cee2bb0250e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -19,6 +19,7 @@ #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> +#include <linux/module.h>
#include <asm/elf.h> #include <asm/tlb.h> @@ -1771,8 +1772,21 @@ const struct file_operations proc_pagemap_operations = { .release = pagemap_release, };
+static DEFINE_SPINLOCK(scan_lock); + +static int page_scan_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&scan_lock); + else + spin_unlock(&scan_lock); + + return 0; +} + /* will be filled when kvm_ept_idle module loads */ struct file_operations proc_page_scan_operations = { + .flock = page_scan_lock, }; EXPORT_SYMBOL_GPL(proc_page_scan_operations);
@@ -1796,10 +1810,22 @@ static ssize_t mm_idle_read(struct file *file, char __user *buf, static int mm_idle_open(struct inode *inode, struct file *file) { struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1;
if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM;
+ page_scan_lock(NULL, 1, NULL); + module = proc_page_scan_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_scan_lock(NULL, 0, NULL); + if (ret != 0) { + /* no scan ko installed, avoid to return valid file */ + return -ENODEV; + } + mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(mm)) return PTR_ERR(mm); @@ -1815,6 +1841,7 @@ static int mm_idle_open(struct inode *inode, struct file *file) static int mm_idle_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; + int ret = 0;
if (mm) { if (!mm_kvm(mm)) @@ -1823,9 +1850,12 @@ static int mm_idle_release(struct inode *inode, struct file *file) }
if (proc_page_scan_operations.release) - return proc_page_scan_operations.release(inode, file); + ret = proc_page_scan_operations.release(inode, file);
- return 0; + if (proc_page_scan_operations.owner) + module_put(proc_page_scan_operations.owner); + + return ret; }
static long mm_idle_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1844,8 +1874,20 @@ const struct file_operations proc_mm_idle_operations = { .unlocked_ioctl = mm_idle_ioctl, };
+static DEFINE_SPINLOCK(swap_lock); + +static int page_swap_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&swap_lock); + else + spin_unlock(&swap_lock); + + return 0; +} /*swap pages*/ struct file_operations proc_swap_pages_operations = { + .flock = page_swap_lock, }; EXPORT_SYMBOL_GPL(proc_swap_pages_operations);
@@ -1861,10 +1903,22 @@ static ssize_t mm_swap_write(struct file *file, const char __user *buf, static int mm_swap_open(struct inode *inode, struct file *file) { struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1;
if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM;
+ page_swap_lock(NULL, 1, NULL); + module = proc_swap_pages_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_swap_lock(NULL, 0, NULL); + if (ret != 0) { + /* no swap ko installed, avoid to return valid file */ + return -ENODEV; + } + mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(mm)) return PTR_ERR(mm); @@ -1880,14 +1934,18 @@ static int mm_swap_open(struct inode *inode, struct file *file) static int mm_swap_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; + int ret = 0;
if (mm) mmdrop(mm);
if (proc_swap_pages_operations.release) - return proc_swap_pages_operations.release(inode, file); + ret = proc_swap_pages_operations.release(inode, file);
- return 0; + if (proc_swap_pages_operations.owner) + module_put(proc_swap_pages_operations.owner); + + return ret; }
const struct file_operations proc_mm_swap_operations = {
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/0bb20a8fa7e49d95a179337f54e3d23be1...
-------------------------------------------------
A kvm shadow page may be freed while etmem_scan is walking the ept page table. Hold mmu_lock when walking the ept page table to avoid UAF. To avoid holding mmu_lock for too long, the walk_step module parameter is added to control the lock holding time.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 97 ++++++++++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 27 deletions(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 981df9076d13..863bd61529f9 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -84,6 +84,28 @@
#endif
+#define RET_RESCAN_FLAG 0x10000 + +static int set_walk_step(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned int n; + + ret = kstrtouint(val, 0, &n); + if (ret != 0 || n == 0) + return -EINVAL; + + return param_set_uint(val, kp); +} + +static struct kernel_param_ops walk_step_ops = { + .set = set_walk_step, + .get = param_get_uint, +}; + +static unsigned int __read_mostly walk_step = 512; // in PAGE_SIZE +module_param_cb(walk_step, &walk_step_ops, &walk_step, 0644); + static unsigned long pagetype_size[16] = { [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ @@ -249,26 +271,13 @@ static int page_idle_copy_user(struct page_idle_ctrl *pic, unsigned long start, unsigned long end) { int bytes_read; - int lc = 0; /* last copy? */ int ret;
dump_pic(pic);
- /* Break out of loop on no more progress. */ - if (!pic->pie_read) { - lc = 1; - if (start < end) - start = end; - } - - if (start >= end && start > pic->next_hva) { - set_next_hva(start, "TAIL-HOLE"); - pic_report_addr(pic, start); - } - bytes_read = pic->pie_read; if (!bytes_read) - return 1; + return 0;
ret = copy_to_user(pic->buf, pic->kpie, bytes_read); if (ret) @@ -278,8 +287,6 @@ static int page_idle_copy_user(struct page_idle_ctrl *pic, pic->bytes_copied += bytes_read; if (pic->bytes_copied >= pic->buf_size) return PAGE_IDLE_BUF_FULL; - if (lc) - return lc;
ret = init_page_idle_ctrl_buffer(pic); if (ret) @@ -299,17 +306,24 @@ static int vm_walk_host_range(unsigned long long start, unsigned long tmp_gpa_to_hva = pic->gpa_to_hva;
pic->gpa_to_hva = 0; - local_irq_enable(); + spin_unlock_irq(&pic->kvm->mmu_lock); down_read(&walk->mm->mmap_lock); local_irq_disable(); ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, walk->ops, walk->private); + local_irq_enable(); up_read(&walk->mm->mmap_lock); pic->gpa_to_hva = tmp_gpa_to_hva; if (pic->flags & VM_SCAN_HOST) { pic->restart_gpa -= tmp_gpa_to_hva; pic->flags &= ~VM_SCAN_HOST; } + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + + /* ept page table may change after spin_unlock, rescan vm from root ept */ + ret |= RET_RESCAN_FLAG; + return ret; }
@@ -515,30 +529,40 @@ static int ept_page_range(struct page_idle_ctrl *pic,
WARN_ON(addr >= end);
- spin_lock(&pic->kvm->mmu_lock); + spin_lock_irq(&pic->kvm->mmu_lock);
vcpu = kvm_get_vcpu(pic->kvm, 0); if (!vcpu) { - spin_unlock(&pic->kvm->mmu_lock); + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-VCPU"); + spin_unlock_irq(&pic->kvm->mmu_lock); return -EINVAL; }
mmu = kvm_arch_mmu_pointer(vcpu); if (!VALID_PAGE(mmu->root_hpa)) { - spin_unlock(&pic->kvm->mmu_lock); + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-HPA"); + spin_unlock_irq(&pic->kvm->mmu_lock); return -EINVAL; }
ept_root = __va(mmu->root_hpa);
- spin_unlock(&pic->kvm->mmu_lock); - local_irq_disable(); /* Walk start at p4d when vm has 4 level table pages */ if (mmu->shadow_root_level != 4) err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); else err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); - local_irq_enable(); + + /* mmu_lock is unlock in vm_walk_host_range which will unlock mmu_lock + * and RET_RESCAN_FLAG will be set in ret value + */ + if (!(err & RET_RESCAN_FLAG)) + spin_unlock_irq(&pic->kvm->mmu_lock); + else + err &= ~RET_RESCAN_FLAG; + return err; }
@@ -807,6 +831,8 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, struct mm_walk *walk) { unsigned long gpa_addr; + unsigned long gpa_next; + unsigned long gpa_end; unsigned long addr_range; unsigned long va_end; int ret; @@ -836,12 +862,20 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, } } else { pic->gpa_to_hva = start - gpa_addr; + gpa_end = gpa_addr + addr_range; + for (; gpa_addr < gpa_end;) { + gpa_next = min(gpa_end, gpa_addr + walk_step * PAGE_SIZE); #ifdef CONFIG_ARM64 - arm_page_range(pic, gpa_addr, gpa_addr + addr_range); + ret = arm_page_range(pic, gpa_addr, gpa_next); #else - ept_page_range(pic, gpa_addr, gpa_addr + addr_range, walk); + ret = ept_page_range(pic, gpa_addr, gpa_next, walk); #endif - va_end = pic->gpa_to_hva + gpa_addr + addr_range; + gpa_addr = pic->restart_gpa; + + if (ret) + break; + } + va_end = pic->gpa_to_hva + gpa_end; }
start = pic->restart_gpa + pic->gpa_to_hva; @@ -850,6 +884,9 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, break; }
+ if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + if (pic->bytes_copied) ret = 0; return ret; @@ -1050,9 +1087,10 @@ static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, * Skip duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, * walk_page_range() can call on the same PMD twice. */ - if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK)) { + if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK) && (pic->flags & SCAN_HUGE_PAGE)) { debug_printk("ignore duplicate addr %pK %pK\n", addr, pic->last_va); + set_restart_gpa(round_up(next, PMD_SIZE), "DUP_ADDR"); return 0; } pic->last_va = addr; @@ -1144,12 +1182,17 @@ static int mm_idle_walk_range(struct page_idle_ctrl *pic, up_read(&walk->mm->mmap_lock);
WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; start = pic->restart_gpa; ret = page_idle_copy_user(pic, start, end); if (ret) break; }
+ if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + if (pic->bytes_copied) { if (ret != PAGE_IDLE_BUF_FULL && pic->next_hva < end) debug_printk("partial scan: next_hva=%pK end=%pK\n",
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/76cd04fa6fbe1bfca952d5bb6fb8ddd70f...
-------------------------------------------------
Currently, we call cond_resched after scanning a full memslot. If we scan a huge memslot, it will take a long time before cond_resched is called. So call cond_resched after scanning walk_step size of memory.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 863bd61529f9..f3fff53b1411 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -106,6 +106,9 @@ static struct kernel_param_ops walk_step_ops = { static unsigned int __read_mostly walk_step = 512; // in PAGE_SIZE module_param_cb(walk_step, &walk_step_ops, &walk_step, 0644);
+static unsigned int resched_step = 10; +module_param(resched_step, uint, 0644); + static unsigned long pagetype_size[16] = { [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ @@ -836,6 +839,7 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, unsigned long addr_range; unsigned long va_end; int ret; + int steps;
#ifdef CONFIG_X86_64 ret = ept_idle_supports_cpu(pic->kvm); @@ -863,6 +867,7 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, } else { pic->gpa_to_hva = start - gpa_addr; gpa_end = gpa_addr + addr_range; + steps = 0; for (; gpa_addr < gpa_end;) { gpa_next = min(gpa_end, gpa_addr + walk_step * PAGE_SIZE); #ifdef CONFIG_ARM64 @@ -874,6 +879,11 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic,
if (ret) break; + + if (++steps >= resched_step) { + cond_resched(); + steps = 0; + } } va_end = pic->gpa_to_hva + gpa_end; }
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/62620120ef23c8b2dbb69339339b5e4924...
-------------------------------------------------
Add a pte_hole callback to the walk_page_range() operations so that the user can scan pages that have no page table entry.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 45 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index f3fff53b1411..adb932a3c1d6 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -86,6 +86,11 @@
#define RET_RESCAN_FLAG 0x10000
+/* error return IDLE_PAGE_TYPE_MAX or return valid page type */ +enum ProcIdlePageType (*vm_handle_pte_hole)(unsigned long addr, + unsigned long next, int depth, struct mm_walk *walk) = NULL; +EXPORT_SYMBOL_GPL(vm_handle_pte_hole); + static int set_walk_step(const char *val, const struct kernel_param *kp) { int ret; @@ -794,6 +799,11 @@ static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, return INVALID_PAGE; }
+static inline unsigned long mask_to_size(unsigned long mask) +{ + return ~mask + 1; +} + static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk); @@ -802,11 +812,40 @@ static int vm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type;
pic->flags |= VM_SCAN_HOST; + + /* hugetlb page table entry of vm maybe not present while page is resident + * in address_space + */ + if (mask_to_size(hmask) != PUD_SIZE && !pte_present(*pte) && + vm_handle_pte_hole != NULL) { + page_type = vm_handle_pte_hole(addr, next, -1, walk); + if (page_type < IDLE_PAGE_TYPE_MAX) + return pic_add_page(pic, addr, next, page_type); + } + return mm_idle_hugetlb_entry(pte, hmask, addr, next, walk); }
+static int vm_idle_pte_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType pagetype; + + if (vm_handle_pte_hole == NULL) + return 0; + + pagetype = vm_handle_pte_hole(addr, next, depth, walk); + if (pagetype >= IDLE_PAGE_TYPE_MAX) + return 0; + + debug_printk("scan pte hole addr %pK type %d\n", addr, pagetype); + pic->flags |= VM_SCAN_HOST; + return pic_add_page(pic, addr, next, pagetype); +} + static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk); static int vm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -925,6 +964,7 @@ static ssize_t vm_idle_read(struct file *file, char *buf, mm_walk_ops.pmd_entry = vm_idle_pmd_entry; mm_walk_ops.pud_entry = vm_idle_pud_entry; mm_walk_ops.hugetlb_entry = vm_idle_hugetlb_entry; + mm_walk_ops.pte_hole = vm_idle_pte_hole; mm_walk_ops.test_walk = mm_idle_test_walk;
mm_walk.mm = mm; @@ -1052,11 +1092,6 @@ static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, return err; }
-static inline unsigned long mask_to_size(unsigned long mask) -{ - return ~mask + 1; -} - static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk)
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/1c76b8cf0bd5dbcd0ebb576384e2f3af77...
-------------------------------------------------
The dax_kmem driver exports pmem as a NUMA node. This patch records which nodes consist of persistent memory for further use.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/acpi/numa/srat.c | 5 +++++ include/linux/numa.h | 12 ++++++++++++ mm/page_alloc.c | 13 +++++++++++++ 3 files changed, 30 insertions(+)
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 1f4fc5f8a819..3c5eae855e6a 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -278,6 +278,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
node_set(node, numa_nodes_parsed);
+ if (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE) + set_node_type(node, NODE_TYPE_PMEM); + else + set_node_type(node, NODE_TYPE_DRAM); + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1, diff --git a/include/linux/numa.h b/include/linux/numa.h index 59df211d051f..fdcd888f70cd 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -20,6 +20,11 @@ #define __initdata_or_meminfo __initdata #endif
+enum node_type { + NODE_TYPE_DRAM, + NODE_TYPE_PMEM, +}; + #ifdef CONFIG_NUMA #include <linux/printk.h> #include <asm/sparsemem.h> @@ -43,6 +48,8 @@ static inline int phys_to_target_node(u64 start) return 0; } #endif +void set_node_type(int nid, enum node_type type); +enum node_type get_node_type(int nid); #else /* !CONFIG_NUMA */ static inline int numa_map_to_online_node(int node) { @@ -56,6 +63,11 @@ static inline int phys_to_target_node(u64 start) { return 0; } +static inline enum node_type get_node_type(int nid) +{ + return NODE_TYPE_DRAM; +} +static inline void set_node_type(int nid, enum node_type type) {} #endif
#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90762bee9730..8ab70473a2d0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7218,3 +7218,16 @@ bool has_managed_dma(void) return false; } #endif /* CONFIG_ZONE_DMA */ + +#ifdef CONFIG_NUMA +enum node_type nodes_type[MAX_NUMNODES]; + +void set_node_type(int nid, enum node_type type) +{ + nodes_type[nid] = type; +} +enum node_type get_node_type(int nid) +{ + return nodes_type[nid]; +} +#endif
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/80ed6b3203c960b286e1fd5d8db1b39a19...
-------------------------------------------------
Add the /proc/sys/kernel/hugepage_pmem_allocall switch. Set it to 1 to allow all memory in pmem to be allocated for hugepages. Set it to 0 (default) to keep hugepage allocation limited by the zone watermark as usual. Add the /proc/sys/kernel/hugepage_mig_noalloc switch. Set it to 1 to forbid allocating new hugepages during hugepage migration when the hugepages in the destination node run out. Set it to 0 (default) to allow hugepage allocation during hugepage migration as usual.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/hugetlb.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f154019e6b84..9b492bbc9188 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -89,6 +89,9 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
+static int sysctl_hugetlb_mig_noalloc; +static int sysctl_hugetlb_pmem_allocall; + /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); @@ -2218,6 +2221,8 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (get_node_type(node) == NODE_TYPE_PMEM && sysctl_hugetlb_pmem_allocall) + gfp_mask |= __GFP_MEMALLOC; folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) { @@ -2486,7 +2491,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); - if (folio) { + if (folio || sysctl_hugetlb_mig_noalloc) { spin_unlock_irq(&hugetlb_lock); return folio; } @@ -4676,6 +4681,26 @@ static struct ctl_table hugetlb_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, +#ifdef CONFIG_HUGETLBFS + { + .procname = "hugepage_mig_noalloc", + .data = &sysctl_hugetlb_mig_noalloc, + .maxlen = sizeof(sysctl_hugetlb_mig_noalloc), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hugepage_pmem_allocall", + .data = &sysctl_hugetlb_pmem_allocall, + .maxlen = sizeof(sysctl_hugetlb_pmem_allocall), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif { } };
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/50d5bf1b6da9f74bf93f9dec601c09d45d...
-------------------------------------------------
Add the /proc/sys/vm/hugepage_nocache_copy switch. Set it to 1 to copy hugepages with the non-temporal movnt SSE instruction if the CPU supports it. Set it to 0 to copy hugepages as usual.
Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/include/asm/page_64.h | 7 +++ arch/x86/lib/Makefile | 1 + arch/x86/lib/copy_highpages.c | 107 +++++++++++++++++++++++++++++++++ arch/x86/lib/copy_page_64.S | 73 ++++++++++++++++++++++ include/linux/highmem.h | 17 ++++++ mm/util.c | 11 +--- 6 files changed, 207 insertions(+), 9 deletions(-) create mode 100644 arch/x86/lib/copy_highpages.c
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index cc6b8e087192..f869dec42f34 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -60,6 +60,13 @@ static inline void clear_page(void *page)
void copy_page(void *to, void *from);
+void copy_page_nocache(void *to, void *from); +void copy_page_nocache_barrir(void); + +struct folio; +#define __HAVE_ARCH_COPY_HUGEPAGES 1 +void copy_highpages(struct folio *dst, struct folio *src); + #ifdef CONFIG_X86_5LEVEL /* * User space process size. This is the first address outside the user range. diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 01932af64193..f3a8fa45c010 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -73,4 +73,5 @@ endif lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o copy_user_uncached_64.o lib-y += cmpxchg16b_emu.o + lib-y += copy_highpages.o endif diff --git a/arch/x86/lib/copy_highpages.c b/arch/x86/lib/copy_highpages.c new file mode 100644 index 000000000000..d8357a938007 --- /dev/null +++ b/arch/x86/lib/copy_highpages.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * accelerate copying page to pmem with non-temproal stroes + */ +#include <linux/sched.h> +#include <linux/mmzone.h> +#include <linux/highmem.h> +#include <linux/sysctl.h> + +DEFINE_STATIC_KEY_FALSE(hugepage_nocache_copy); +#ifdef CONFIG_SYSCTL +static void set_hugepage_nocache_copy(bool enabled) +{ + if (enabled) + static_branch_enable(&hugepage_nocache_copy); + else + static_branch_disable(&hugepage_nocache_copy); +} + +int sysctl_hugepage_nocache_copy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + state = static_branch_unlikely(&hugepage_nocache_copy); + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_hugepage_nocache_copy(state); + return err; +} + +static struct ctl_table copy_highpages_table[] = { + { + .procname = "hugepage_nocache_copy", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0600, + .proc_handler = sysctl_hugepage_nocache_copy, + .extra1 = 
SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static struct ctl_table copy_highpages_root_table[] = { + { + .procname = "vm", + .mode = 0555, + .child = copy_highpages_table, + }, + {} +}; + +static __init int copy_highpages_init(void) +{ + return register_sysctl_table(copy_highpages_root_table) ? 0 : -ENOMEM; +} +__initcall(copy_highpages_init); +#endif + +static void copy_highpages_nocache(struct folio *dst, struct folio *src) +{ + char *vfrom, *vto; + int i; + int nr = folio_nr_pages(src); + + for (i = 0; i < nr; i++) { + cond_resched(); + vfrom = kmap_atomic(folio_page(src, i)); + vto = kmap_atomic(folio_page(dst, i)); + copy_page_nocache(vto, vfrom); + kunmap_atomic(vto); + kunmap_atomic(vfrom); + } + copy_page_nocache_barrir(); +} + +static void copy_highpages_cache(struct folio *dst, struct folio *src) +{ + long i = 0; + long nr = folio_nr_pages(src); + + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; + cond_resched(); + } +} + +void copy_highpages(struct folio *dst, struct folio *src) +{ + if (static_branch_unlikely(&hugepage_nocache_copy) && + get_node_type(page_to_nid(folio_page(dst, 0))) == NODE_TYPE_PMEM) + return copy_highpages_nocache(dst, src); + + return copy_highpages_cache(dst, src); +} diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 30ea644bf446..c31a9a79b18e 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -87,3 +87,76 @@ SYM_FUNC_START_LOCAL(copy_page_regs) addq $2*8, %rsp RET SYM_FUNC_END(copy_page_regs) + +SYM_FUNC_START(copy_page_nocache) + ALTERNATIVE "jmp copy_page", "", X86_FEATURE_XMM2 + subq $2*8, %rsp + movq %rbx, (%rsp) + movq %r12, 1*8(%rsp) + + movl $(4096/64)-5, %ecx + .p2align 4 +.LoopNT64: + dec %rcx + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + prefetcht0 
5*64(%rsi) + + movnti %rax, 0x8*0(%rdi) + movnti %rbx, 0x8*1(%rdi) + movnti %rdx, 0x8*2(%rdi) + movnti %r8, 0x8*3(%rdi) + movnti %r9, 0x8*4(%rdi) + movnti %r10, 0x8*5(%rdi) + movnti %r11, 0x8*6(%rdi) + movnti %r12, 0x8*7(%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .LoopNT64 + + movl $5, %ecx + .p2align 4 +.LoopNT2: + decl %ecx + + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + movnti %rax, 0x8*0(%rdi) + movnti %rbx, 0x8*1(%rdi) + movnti %rdx, 0x8*2(%rdi) + movnti %r8, 0x8*3(%rdi) + movnti %r9, 0x8*4(%rdi) + movnti %r10, 0x8*5(%rdi) + movnti %r11, 0x8*6(%rdi) + movnti %r12, 0x8*7(%rdi) + + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi + jnz .LoopNT2 + + movq (%rsp), %rbx + movq 1*8(%rsp), %r12 + addq $2*8, %rsp + ret +SYM_FUNC_END(copy_page_nocache) + +SYM_FUNC_START(copy_page_nocache_barrir) + ALTERNATIVE "", "sfence", X86_FEATURE_XMM2 + ret +SYM_FUNC_END(copy_page_nocache_barrir) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 4de1dbcd3ef6..4cf084e6371b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -513,4 +513,21 @@ static inline void put_and_unmap_page(struct page *page, void *addr) put_page(page); }
+#ifndef __HAVE_ARCH_COPY_HUGEPAGES + +static inline void copy_highpages(struct folio *dst, struct folio *src) +{ + long i = 0; + long nr = folio_nr_pages(src); + + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; + cond_resched(); + } +} + +#endif /* __HAVE_ARCH_COPY_HUGEPAGES */ + #endif /* _LINUX_HIGHMEM_H */ diff --git a/mm/util.c b/mm/util.c index dd12b9531ac4..6593ad7b84ee 100644 --- a/mm/util.c +++ b/mm/util.c @@ -23,6 +23,7 @@ #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> +#include <linux/page-flags.h>
#include <linux/uaccess.h>
@@ -792,15 +793,7 @@ EXPORT_SYMBOL(folio_mapping); */ void folio_copy(struct folio *dst, struct folio *src) { - long i = 0; - long nr = folio_nr_pages(src); - - for (;;) { - copy_highpage(folio_page(dst, i), folio_page(src, i)); - if (++i == nr) - break; - cond_resched(); - } + copy_highpages(dst, src); }
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q Reference: https://gitee.com/openeuler/kernel/commit/4232d9005401d839ceeb1a02da34749b6d...
-------------------------------------------------
add CONFIG_ETMEM macro definition for etmem feature.
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/base.c | 4 ++++ fs/proc/internal.h | 2 ++ fs/proc/task_mmu.c | 2 ++ include/linux/swap.h | 4 ++++ mm/vmscan.c | 2 ++ 5 files changed, 14 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c index c5fbe0815614..420e1d572856 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3353,6 +3353,8 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), +#endif +#ifdef CONFIG_ETMEM REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), REG("swap_pages", S_IWUSR, proc_mm_swap_operations), #endif @@ -3703,6 +3705,8 @@ static const struct pid_entry tid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), +#endif +#ifdef CONFIG_ETMEM REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), REG("swap_pages", S_IWUSR, proc_mm_swap_operations), #endif diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0e6bf977ba23..24f74abfcbc4 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -305,8 +305,10 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +#ifdef CONFIG_ETMEM extern const struct file_operations proc_mm_idle_operations; extern const struct file_operations proc_mm_swap_operations; +#endif
extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1cee2bb0250e..721ea2a42dfd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1772,6 +1772,7 @@ const struct file_operations proc_pagemap_operations = { .release = pagemap_release, };
+#ifdef CONFIG_ETMEM static DEFINE_SPINLOCK(scan_lock);
static int page_scan_lock(struct file *file, int is_lock, struct file_lock *flock) @@ -1954,6 +1955,7 @@ const struct file_operations proc_mm_swap_operations = { .open = mm_swap_open, .release = mm_swap_release, }; +#endif #endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA diff --git a/include/linux/swap.h b/include/linux/swap.h index 96666ca67a15..637fe2635a06 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -441,9 +441,13 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); + +#ifdef CONFIG_ETMEM extern int add_page_for_swap(struct page *page, struct list_head *pagelist); extern struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr); +#endif + #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; diff --git a/mm/vmscan.c b/mm/vmscan.c index ba95fe8afff7..d9c7e1672aad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -8108,6 +8108,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) } EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+#ifdef CONFIG_ETMEM int add_page_for_swap(struct page *page, struct list_head *pagelist) { int err = -EBUSY; @@ -8162,3 +8163,4 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) return page; } EXPORT_SYMBOL_GPL(get_page_from_vaddr); +#endif
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/79c68ab3280fab8ace1b10bf8eadce508e...
-------------------------------------------------
etmem, the memory vertical expansion technology, uses DRAM and high-performance storage new media to form multi-level memory storage. By grading the stored data, etmem migrates the classified cold storage data from the storage medium to the high-performance storage medium, so as to achieve the purpose of memory capacity expansion and memory cost reduction.
When the memory expansion function etmem is running, the native swap function of the kernel needs to be disabled in certain scenarios to avoid the impact of kernel swap.
This feature provides the preceding functions.
The /sys/kernel/mm/swap/ directory provides the kernel_swap_enable sys interface to enable or disable the native swap function of the kernel.
The default value of /sys/kernel/mm/swap/kernel_swap_enable is true, that is, kernel swap is enabled by default.
Turn on kernel swap: echo true > /sys/kernel/mm/swap/kernel_swap_enable
Turn off kernel swap: echo false > /sys/kernel/mm/swap/kernel_swap_enable
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/swap.h | 4 ++++ mm/swap_state.c | 37 +++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 26 ++++++++++++++++++++++++++ 3 files changed, 67 insertions(+)
diff --git a/include/linux/swap.h b/include/linux/swap.h index 637fe2635a06..811f7cc05ced 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -722,5 +722,9 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) } #endif
+#ifdef CONFIG_ETMEM +extern bool kernel_swap_enabled(void); +#endif + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index b76a65ac28b3..48a34514b235 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -40,6 +40,9 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; +#ifdef CONFIG_ETMEM +static bool enable_kernel_swap __read_mostly = true; +#endif
#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -322,6 +325,13 @@ static inline bool swap_use_vma_readahead(void) return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); }
+#ifdef CONFIG_ETMEM +bool kernel_swap_enabled(void) +{ + return READ_ONCE(enable_kernel_swap); +} +#endif + /* * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel @@ -869,8 +879,35 @@ static ssize_t vma_ra_enabled_store(struct kobject *kobj, } static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
+#ifdef CONFIG_ETMEM +static ssize_t kernel_swap_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", enable_kernel_swap ? "true" : "false"); +} +static ssize_t kernel_swap_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + WRITE_ONCE(enable_kernel_swap, true); + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + WRITE_ONCE(enable_kernel_swap, false); + else + return -EINVAL; + + return count; +} +static struct kobj_attribute kernel_swap_enable_attr = + __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, + kernel_swap_enable_store); +#endif + static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, +#ifdef CONFIG_ETMEM + &kernel_swap_enable_attr.attr, +#endif NULL, };
diff --git a/mm/vmscan.c b/mm/vmscan.c index d9c7e1672aad..79799c013740 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6983,6 +6983,18 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, return false; }
+#ifdef CONFIG_ETMEM +/* + * Check if original kernel swap is enabled + * turn off kernel swap,but leave page cache reclaim on + */ +static inline void kernel_swap_check(struct scan_control *sc) +{ + if (sc != NULL && !kernel_swap_enabled()) + sc->may_swap = 0; +} +#endif + unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { @@ -6999,6 +7011,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_swap = 1, };
+#ifdef CONFIG_ETMEM + kernel_swap_check(&sc); +#endif /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. @@ -7436,6 +7451,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) sc.may_writepage = !laptop_mode && !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim;
+#ifdef CONFIG_ETMEM + kernel_swap_check(&sc); +#endif + /* * Do some background aging, to give pages a chance to be * referenced before reclaiming. All pages are rotated @@ -7814,6 +7833,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state);
+#ifdef CONFIG_ETMEM + kernel_swap_check(&sc); +#endif + nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
set_task_reclaim_state(current, NULL); @@ -7971,6 +7994,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in cond_resched(); psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); +#ifdef CONFIG_ETMEM + kernel_swap_check(&sc); +#endif /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/abfd8691d951fb037506e1ee52ed39abee...
-------------------------------------------------
etmem, the memory vertical expansion technology, uses DRAM and high-performance storage new media to form multi-level memory storage.
In the current etmem process, memory page swapping is implemented by invoking shrink_page_list. When this interface is invoked for the first time, pages are added to the swap cache and written to disk. A swap cache page is reclaimed only when this interface is invoked for the second time and no process accesses the page. However, in the etmem process, user mode scans the pages that have been accessed, and the migration is not delivered to pages that are not accessed by processes. Therefore, the swap cache may remain occupied indefinitely. To solve this problem, add logic for actively reclaiming the swap cache. When the swap cache occupies a large amount of memory, the system proactively scans the LRU list and reclaims the swap cache to keep its memory usage within the specified range.
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_swap.c | 175 +++++++++++++++++++++++- fs/proc/task_mmu.c | 8 ++ include/linux/list.h | 17 +++ include/linux/swap.h | 35 ++++- mm/vmscan.c | 312 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 541 insertions(+), 6 deletions(-)
diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c index f9f796cfaf97..0e0a5225e301 100644 --- a/fs/proc/etmem_swap.c +++ b/fs/proc/etmem_swap.c @@ -10,6 +10,24 @@ #include <linux/mempolicy.h> #include <linux/uaccess.h> #include <linux/delay.h> +#include <linux/numa.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/mm_inline.h> + +#define RECLAIM_SWAPCACHE_MAGIC 0X77 +#define SET_SWAPCACHE_WMARK _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x02, unsigned int) +#define RECLAIM_SWAPCACHE_ON _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x01, unsigned int) +#define RECLAIM_SWAPCACHE_OFF _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x00, unsigned int) + +#define WATERMARK_MAX 100 +#define SWAP_SCAN_NUM_MAX 32 + +static struct task_struct *reclaim_swapcache_tk; +static bool enable_swapcache_reclaim; +static unsigned long swapcache_watermark[ETMEM_SWAPCACHE_NR_WMARK]; + +static DECLARE_WAIT_QUEUE_HEAD(reclaim_queue);
static ssize_t swap_pages_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -45,7 +63,7 @@ static ssize_t swap_pages_write(struct file *file, const char __user *buf, ret = kstrtoul(p, 16, &vaddr); if (ret != 0) continue; - /*If get page struct failed, ignore it, get next page*/ + /* If get page struct failed, ignore it, get next page */ page = get_page_from_vaddr(mm, vaddr); if (!page) continue; @@ -78,9 +96,153 @@ static int swap_pages_release(struct inode *inode, struct file *file) return 0; }
+/* check if swapcache meet requirements */ +static bool swapcache_balanced(void) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH]; +} + +/* the flag present if swapcache reclaim is started */ +static bool swapcache_reclaim_enabled(void) +{ + return READ_ONCE(enable_swapcache_reclaim); +} + +static void start_swapcache_reclaim(void) +{ + if (swapcache_balanced()) + return; + /* RECLAIM_SWAPCACHE_ON trigger the thread to start running. */ + if (!waitqueue_active(&reclaim_queue)) + return; + + WRITE_ONCE(enable_swapcache_reclaim, true); + wake_up_interruptible(&reclaim_queue); +} + +static void stop_swapcache_reclaim(void) +{ + WRITE_ONCE(enable_swapcache_reclaim, false); +} + +static bool should_goto_sleep(void) +{ + if (swapcache_balanced()) + stop_swapcache_reclaim(); + + if (swapcache_reclaim_enabled()) + return false; + + return true; +} + +static int get_swapcache_watermark(unsigned int ratio) +{ + unsigned int low_watermark; + unsigned int high_watermark; + + low_watermark = ratio & 0xFF; + high_watermark = (ratio >> 8) & 0xFF; + if (low_watermark > WATERMARK_MAX || + high_watermark > WATERMARK_MAX || + low_watermark > high_watermark) + return -EPERM; + + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] = totalram_pages() * + low_watermark / WATERMARK_MAX; + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH] = totalram_pages() * + high_watermark / WATERMARK_MAX; + + return 0; +}
extern struct file_operations proc_swap_pages_operations;
+static void reclaim_swapcache_try_to_sleep(void) +{ + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&reclaim_queue, &wait, TASK_INTERRUPTIBLE); + if (should_goto_sleep()) { + if (!kthread_should_stop()) + schedule(); + } + finish_wait(&reclaim_queue, &wait); +} + +static void etmem_reclaim_swapcache(void) +{ + do_swapcache_reclaim(swapcache_watermark, + ARRAY_SIZE(swapcache_watermark)); + stop_swapcache_reclaim(); +} + +static int reclaim_swapcache_proactive(void *para) +{ + set_freezable(); + + while (1) { + bool ret; + + reclaim_swapcache_try_to_sleep(); + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + if (ret) + continue; + + etmem_reclaim_swapcache(); + } + + return 0; +} + +static int reclaim_swapcache_run(void) +{ + int ret = 0; + + reclaim_swapcache_tk = kthread_run(reclaim_swapcache_proactive, NULL, + "etmem_recalim_swapcache"); + if (IS_ERR(reclaim_swapcache_tk)) { + ret = PTR_ERR(reclaim_swapcache_tk); + reclaim_swapcache_tk = NULL; + } + return ret; +} + +static long swap_page_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int ratio; + + switch (cmd) { + case RECLAIM_SWAPCACHE_ON: + if (swapcache_reclaim_enabled()) + return 0; + start_swapcache_reclaim(); + break; + case RECLAIM_SWAPCACHE_OFF: + stop_swapcache_reclaim(); + break; + case SET_SWAPCACHE_WMARK: + if (get_user(ratio, (unsigned int __user *)argp)) + return -EFAULT; + + if (get_swapcache_watermark(ratio) != 0) + return -EFAULT; + break; + default: + return -EPERM; + } + + return 0; +} + static int swap_pages_entry(void) { proc_swap_pages_operations.flock(NULL, 1, NULL); @@ -88,8 +250,12 @@ static int swap_pages_entry(void) proc_swap_pages_operations.write = swap_pages_write; proc_swap_pages_operations.open = swap_pages_open; proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.unlocked_ioctl = swap_page_ioctl; 
proc_swap_pages_operations.flock(NULL, 0, NULL);
+ enable_swapcache_reclaim = false; + reclaim_swapcache_run(); + return 0; }
@@ -100,7 +266,14 @@ static void swap_pages_exit(void) proc_swap_pages_operations.write = NULL; proc_swap_pages_operations.open = NULL; proc_swap_pages_operations.release = NULL; + proc_swap_pages_operations.unlocked_ioctl = NULL; proc_swap_pages_operations.flock(NULL, 0, NULL); + + if (!IS_ERR(reclaim_swapcache_tk)) { + kthread_stop(reclaim_swapcache_tk); + reclaim_swapcache_tk = NULL; + } + return; }
MODULE_LICENSE("GPL"); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 721ea2a42dfd..f1f3b03e1867 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1949,11 +1949,19 @@ static int mm_swap_release(struct inode *inode, struct file *file) return ret; }
+static long mm_swap_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_swap_pages_operations.unlocked_ioctl) + return proc_swap_pages_operations.unlocked_ioctl(filp, cmd, arg); + return 0; +} + const struct file_operations proc_mm_swap_operations = { .llseek = mem_lseek, .write = mm_swap_write, .open = mm_swap_open, .release = mm_swap_release, + .unlocked_ioctl = mm_swap_ioctl, }; #endif #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/include/linux/list.h b/include/linux/list.h index f10344dbad4d..f946aae6e8ab 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -825,6 +825,23 @@ static inline size_t list_count_nodes(struct list_head *head) !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member))
+/** + * list_for_each_entry_safe_reverse_from - iterate backwards over list from + * current point safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type from current point, safe against + * removal of list entry. + */ +#define list_for_each_entry_safe_reverse_from(pos, n, head, member) \ + for (n = list_prev_entry(pos, member); \ + !list_entry_is_head(pos, head, member); \ + pos = n, n = list_prev_entry(n, member)) + + /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop diff --git a/include/linux/swap.h b/include/linux/swap.h index 811f7cc05ced..45cbd02c909f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -443,9 +443,40 @@ extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio);
#ifdef CONFIG_ETMEM +enum etmem_swapcache_watermark_en { + ETMEM_SWAPCACHE_WMARK_LOW, + ETMEM_SWAPCACHE_WMARK_HIGH, + ETMEM_SWAPCACHE_NR_WMARK +}; + extern int add_page_for_swap(struct page *page, struct list_head *pagelist); extern struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr); +extern int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr); +extern bool kernel_swap_enabled(void); +#else +static inline int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + return 0; +} + +static inline struct page *get_page_from_vaddr(struct mm_struct *mm, + unsigned long vaddr) +{ + return NULL; +} + +static inline int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + return 0; +} + +static inline bool kernel_swap_enabled(void) +{ + return true; +} #endif
#ifdef CONFIG_NUMA @@ -722,9 +753,5 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) } #endif
-#ifdef CONFIG_ETMEM -extern bool kernel_swap_enabled(void); -#endif - #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 79799c013740..aca7b6a11d54 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -8140,7 +8140,7 @@ int add_page_for_swap(struct page *page, struct list_head *pagelist) int err = -EBUSY; struct page *head;
- /*If the page is mapped by more than one process, do not swap it */ + /* If the page is mapped by more than one process, do not swap it */ if (page_mapcount(page) > 1) return -EACCES;
@@ -8189,4 +8189,314 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) return page; } EXPORT_SYMBOL_GPL(get_page_from_vaddr); + +static int add_page_for_reclaim_swapcache(struct page *page, + struct list_head *pagelist, struct lruvec *lruvec, enum lru_list lru) +{ + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EACCES; + + if (PageHuge(page)) + return -EACCES; + + head = compound_head(page); + + switch (__isolate_lru_page_prepare(head, 0)) { + case 0: + if (unlikely(!get_page_unless_zero(page))) + return -1; + + if (!TestClearPageLRU(page)) { + /* + * This page may in other isolation path, + * but we still hold lru_lock. + */ + put_page(page); + return -1; + } + + list_move(&head->lru, pagelist); + update_lru_size(lruvec, lru, page_zonenum(head), -thp_nr_pages(head)); + break; + + case -EBUSY: + return -1; + default: + break; + } + + return 0; +} + +static unsigned long reclaim_swapcache_pages_from_list(int nid, + struct list_head *page_list, unsigned long reclaim_num, bool putback_flag) +{ + struct scan_control sc = { + .may_unmap = 1, + .may_swap = 1, + .may_writepage = 1, + .gfp_mask = GFP_KERNEL, + }; + unsigned long nr_reclaimed = 0; + unsigned long nr_moved = 0; + struct page *page, *next; + LIST_HEAD(swap_pages); + struct pglist_data *pgdat = NULL; + struct reclaim_stat stat; + + pgdat = NODE_DATA(nid); + + if (putback_flag) + goto putback_list; + + if (reclaim_num == 0) + return 0; + + list_for_each_entry_safe(page, next, page_list, lru) { + if (!page_is_file_lru(page) && !__PageMovable(page) + && PageSwapCache(page)) { + ClearPageActive(page); + list_move(&page->lru, &swap_pages); + nr_moved++; + } + + if (nr_moved >= reclaim_num) + break; + } + + /* swap the pages */ + if (pgdat) + nr_reclaimed = shrink_page_list(&swap_pages, + pgdat, + &sc, + &stat, true); + + while (!list_empty(&swap_pages)) { + page = lru_to_page(&swap_pages); + 
list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; + +putback_list: + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; +} + +#define SWAP_SCAN_NUM_MAX 32 + +static bool swapcache_below_watermark(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]; +} + +static unsigned long get_swapcache_reclaim_num(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() > + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ? + (total_swapcache_pages() - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0; +} + +/* + * The main function to reclaim swapcache, the whole reclaim process is + * divided into 3 steps. + * 1. get the total_swapcache_pages num to reclaim. + * 2. scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. + * 3. reclaim the swapcache page until the requirements are meet. + */ +int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + int err = -EINVAL; + unsigned long swapcache_to_reclaim = 0; + unsigned long nr_reclaimed = 0; + unsigned long swapcache_total_reclaimable = 0; + unsigned long reclaim_page_count = 0; + + unsigned long *nr = NULL; + unsigned long *nr_to_reclaim = NULL; + struct list_head *swapcache_list = NULL; + + int nid = 0; + struct lruvec *lruvec = NULL; + struct list_head *src = NULL; + struct page *page = NULL; + struct page *next = NULL; + struct page *pos = NULL; + + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = NULL; + + pg_data_t *pgdat = NULL; + unsigned int scan_count = 0; + int nid_num = 0; + + if (swapcache_watermark == NULL || + watermark_nr < ETMEM_SWAPCACHE_NR_WMARK) + return err; + + /* get the total_swapcache_pages num to reclaim. 
*/ + swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark); + if (swapcache_to_reclaim <= 0) + return err; + + nr = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr == NULL) + return -ENOMEM; + + nr_to_reclaim = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr_to_reclaim == NULL) { + kfree(nr); + return -ENOMEM; + } + + swapcache_list = kcalloc(MAX_NUMNODES, sizeof(struct list_head), GFP_KERNEL); + if (swapcache_list == NULL) { + kfree(nr); + kfree(nr_to_reclaim); + return -ENOMEM; + } + + /* + * scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. + */ + for_each_node_state(nid, N_MEMORY) { + INIT_LIST_HEAD(&swapcache_list[nid_num]); + cond_resched(); + + pgdat = NODE_DATA(nid); + + memcg = mem_cgroup_iter(target_memcg, NULL, NULL); + do { + cond_resched(); + pos = NULL; + lruvec = mem_cgroup_lruvec(memcg, pgdat); + src = &(lruvec->lists[LRU_INACTIVE_ANON]); + spin_lock_irq(&lruvec->lru_lock); + scan_count = 0; + + /* + * Scan the swapcache pages that are not mapped from + * the end of the LRU linked list, scan SWAP_SCAN_NUM_MAX + * pages each time, and record the scan end point page. + */ + + pos = list_last_entry(src, struct page, lru); + spin_unlock_irq(&lruvec->lru_lock); +do_scan: + cond_resched(); + scan_count = 0; + spin_lock_irq(&lruvec->lru_lock); + + /* + * check if pos page is been released or not in LRU list, if true, + * cancel the subsequent page scanning of the current node. 
+ */ + if (!pos || list_entry_is_head(pos, src, lru)) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + if (!PageLRU(pos) || page_lru(pos) != LRU_INACTIVE_ANON) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + page = pos; + pos = NULL; + /* Continue to scan down from the last scan breakpoint */ + list_for_each_entry_safe_reverse_from(page, next, src, lru) { + scan_count++; + pos = next; + if (scan_count >= SWAP_SCAN_NUM_MAX) + break; + + if (!PageSwapCache(page)) + continue; + + if (page_mapped(page)) + continue; + + if (add_page_for_reclaim_swapcache(page, + &swapcache_list[nid_num], + lruvec, LRU_INACTIVE_ANON) != 0) + continue; + + nr[nid_num]++; + swapcache_total_reclaimable++; + } + spin_unlock_irq(&lruvec->lru_lock); + + /* + * Check whether the scanned pages meet + * the reclaim requirements. + */ + if (swapcache_total_reclaimable <= swapcache_to_reclaim || + scan_count >= SWAP_SCAN_NUM_MAX) + goto do_scan; + + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); + + /* Start reclaiming the next memory node. */ + nid_num++; + } + + /* reclaim the swapcache page until the requirements are meet. */ + do { + nid_num = 0; + reclaim_page_count = 0; + + /* start swapcache page reclaim for each node. */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + + nr_to_reclaim[nid_num] = (swapcache_to_reclaim / + (swapcache_total_reclaimable / nr[nid_num])); + reclaim_page_count += reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], + nr_to_reclaim[nid_num], false); + nid_num++; + } + + nr_reclaimed += reclaim_page_count; + + /* + * Check whether the swapcache page reaches the reclaim requirement or + * the number of the swapcache page reclaimd is 0. Stop reclaim. 
+ */ + if (nr_reclaimed >= swapcache_to_reclaim || reclaim_page_count == 0) + goto exit; + } while (!swapcache_below_watermark(swapcache_watermark) || + nr_reclaimed < swapcache_to_reclaim); +exit: + nid_num = 0; + /* + * Repopulate the swapcache pages that are not reclaimd back + * to the LRU linked list. + */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], 0, true); + nid_num++; + } + + kfree(nr); + kfree(nr_to_reclaim); + kfree(swapcache_list); + + return 0; +} +EXPORT_SYMBOL_GPL(do_swapcache_reclaim); #endif
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/4f0eedb8c688ea61bcb129ed78c1784fb9...
-------------------------------------------------
etmem is the memory vertical expansion technology.
The existing memory expansion tool etmem swaps out all pages that can be swapped out for the process by default, unless the page is marked with lock flag.
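This patch adds the ability to swap out only specified pages. A process marks the VMAs that should be swapped out with the VM_SWAPFLAG flag (set through madvise() with MADV_SWAPFLAG and cleared with MADV_SWAPFLAG_REMOVE). When the VMA_SCAN_FLAG scan flag is set, the etmem scan module skips VMAs without VM_SWAPFLAG, so only the marked pages are swapped out.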
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/etmem_scan.c | 11 +++++++++++ fs/proc/etmem_scan.h | 5 ++++- fs/proc/etmem_swap.c | 1 + include/linux/mm.h | 4 ++++ include/uapi/asm-generic/mman-common.h | 4 ++++ mm/madvise.c | 17 ++++++++++++++++- 6 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index adb932a3c1d6..7c1af58bbf21 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -1187,6 +1187,11 @@ static int mm_idle_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; + struct page_idle_ctrl *pic = walk->private; + + /* If the specified page swapout is set, the untagged vma is skipped. */ + if ((pic->flags & VMA_SCAN_FLAG) && !(vma->vm_flags & VM_SWAPFLAG)) + return 1;
if (vma->vm_file) { if (is_vm_hugetlb_page(vma)) @@ -1325,6 +1330,12 @@ static long page_scan_ioctl(struct file *filp, unsigned int cmd, unsigned long a case IDLE_SCAN_REMOVE_FLAGS: filp->f_flags &= ~flags; break; + case VMA_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case VMA_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; default: return -EOPNOTSUPP; } diff --git a/fs/proc/etmem_scan.h b/fs/proc/etmem_scan.h index 93a6e33f2025..e109f7f350e1 100644 --- a/fs/proc/etmem_scan.h +++ b/fs/proc/etmem_scan.h @@ -12,13 +12,16 @@ #define SCAN_AS_HUGE 0100000000 /* treat normal page as hugepage in vm */ #define SCAN_IGN_HOST 0200000000 /* ignore host access when scan vm */ #define VM_SCAN_HOST 0400000000 /* scan and add host page for vm hole(internal) */ +#define VMA_SCAN_FLAG 0x1000 /* scan the specifics vma with flag */
#define ALL_SCAN_FLAGS (SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | \ - SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST) + SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST | VMA_SCAN_FLAG)
#define IDLE_SCAN_MAGIC 0x66 #define IDLE_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int) #define IDLE_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int) +#define VMA_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x2, unsigned int) +#define VMA_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x3, unsigned int)
enum ProcIdlePageType { PTE_ACCESSED, /* 4k page */ diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c index 0e0a5225e301..86f5cf8c90a1 100644 --- a/fs/proc/etmem_swap.c +++ b/fs/proc/etmem_swap.c @@ -63,6 +63,7 @@ static ssize_t swap_pages_write(struct file *file, const char __user *buf, ret = kstrtoul(p, 16, &vaddr); if (ret != 0) continue; + /* If get page struct failed, ignore it, get next page */ page = get_page_from_vaddr(mm, vaddr); if (!page) diff --git a/include/linux/mm.h b/include/linux/mm.h index 50f04282efcb..15eb40f2cd97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -313,6 +313,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
+#ifdef CONFIG_ETMEM +#define VM_SWAPFLAG 0x400000000000000 /* memory swap out flag in vma */ +#endif + #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 9f6ee16d1884..24c4be60f713 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -86,6 +86,10 @@ #define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ #define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */
+#define MADV_ETMEM_BASE 0x1100 +#define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ +#define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) + /* compatibility flags */ #define MAP_FILE 0
diff --git a/mm/madvise.c b/mm/madvise.c index b5ffbaf616f5..250d0e8e9a0d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1072,6 +1072,14 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + new_flags |= VM_SWAPFLAG; + break; + case MADV_SWAPFLAG_REMOVE: + new_flags &= ~VM_SWAPFLAG; + break; +#endif }
anon_name = anon_vma_name(vma); @@ -1174,9 +1182,12 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: +#endif +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + case MADV_SWAPFLAG_REMOVE: #endif return true; - default: return false; } @@ -1368,6 +1379,10 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required + * MADV_SWAPFLAG - Used in the etmem memory extension feature, the process + * specifies the memory swap area by adding a flag to a specific + * vma address. + * MADV_SWAPFLAG_REMOVE - remove the specific vma flag * * return values: * zero - success
euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA Reference: https://gitee.com/openeuler/kernel/commit/905e7debc2732c158b31ff064af1611831...
----------------------------------------------------
In the swapcache recycling process, the number of pages to be reclaimed on each node is obtained as follows:
nr_to_reclaim[nid_num] = (swapcache_to_reclaim / (swapcache_total_reclaimable / nr[nid_num]));
However, nr[nid_num] is the number of reclaimable swapcache pages counted while scanning each node. If the system has multiple nodes and no swapping has occurred on one of them, that node has no swapcache pages, so the value of nr[nid_num] may be 0.
Therefore, a division-by-zero error may occur.
Signed-off-by: liubo liubo254@huawei.com --- mm/vmscan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index aca7b6a11d54..dda21e824349 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -8461,8 +8461,10 @@ int do_swapcache_reclaim(unsigned long *swapcache_watermark, for_each_node_state(nid, N_MEMORY) { cond_resched();
- nr_to_reclaim[nid_num] = (swapcache_to_reclaim / - (swapcache_total_reclaimable / nr[nid_num])); + nr_to_reclaim[nid_num] = (swapcache_total_reclaimable == 0) ? 0 : + ((swapcache_to_reclaim * nr[nid_num]) / + swapcache_total_reclaimable); + reclaim_page_count += reclaim_swapcache_pages_from_list(nid, &swapcache_list[nid_num], nr_to_reclaim[nid_num], false);
euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q CVE: NA
----------------------------------------------------
During the upgrade from Linux 5.10 to Linux 6.4, some structures are changed.
Therefore, the etmem patches need to be adapted to these changes when they are applied to Linux 6.4.
Move some of the etmem feature code to mm/etmem.c to avoid intrusive modification of the original kernel code by the etmem feature.
Signed-off-by: liubo liubo254@huawei.com --- fs/proc/etmem_scan.c | 58 ++++--- include/linux/swap.h | 2 + mm/Makefile | 1 + mm/etmem.c | 383 ++++++++++++++++++++++++++++++++++++++++++ mm/internal.h | 1 - mm/swap_state.c | 34 ---- mm/vmscan.c | 391 +------------------------------------------ 7 files changed, 425 insertions(+), 445 deletions(-) create mode 100644 mm/etmem.c
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index 7c1af58bbf21..06c202dcf1fe 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -43,7 +43,7 @@ #endif
# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) -# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +# define kvm_mmu_ad_disabled(mmu) (mmu->cpu_role.base.ad_disabled) #endif /*CONFIG_X86_64*/
#ifdef CONFIG_ARM64 @@ -314,13 +314,13 @@ static int vm_walk_host_range(unsigned long long start, unsigned long tmp_gpa_to_hva = pic->gpa_to_hva;
pic->gpa_to_hva = 0; - spin_unlock_irq(&pic->kvm->mmu_lock); - down_read(&walk->mm->mmap_lock); + read_unlock(&pic->kvm->mmu_lock); + mmap_read_lock(walk->mm); local_irq_disable(); ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, walk->ops, walk->private); local_irq_enable(); - up_read(&walk->mm->mmap_lock); + mmap_read_unlock(walk->mm); pic->gpa_to_hva = tmp_gpa_to_hva; if (pic->flags & VM_SCAN_HOST) { pic->restart_gpa -= tmp_gpa_to_hva; @@ -537,28 +537,28 @@ static int ept_page_range(struct page_idle_ctrl *pic,
WARN_ON(addr >= end);
- spin_lock_irq(&pic->kvm->mmu_lock); + read_lock(&pic->kvm->mmu_lock);
vcpu = kvm_get_vcpu(pic->kvm, 0); if (!vcpu) { pic->gpa_to_hva = 0; set_restart_gpa(TASK_SIZE, "NO-VCPU"); - spin_unlock_irq(&pic->kvm->mmu_lock); + read_unlock(&pic->kvm->mmu_lock); return -EINVAL; }
mmu = kvm_arch_mmu_pointer(vcpu); - if (!VALID_PAGE(mmu->root_hpa)) { + if (!VALID_PAGE(mmu->root.hpa)) { pic->gpa_to_hva = 0; set_restart_gpa(TASK_SIZE, "NO-HPA"); - spin_unlock_irq(&pic->kvm->mmu_lock); + read_unlock(&pic->kvm->mmu_lock); return -EINVAL; }
- ept_root = __va(mmu->root_hpa); + ept_root = __va(mmu->root.hpa);
/* Walk start at p4d when vm has 4 level table pages */ - if (mmu->shadow_root_level != 4) + if (mmu->root_role.level != 4) err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); else err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); @@ -567,7 +567,7 @@ static int ept_page_range(struct page_idle_ctrl *pic, * and RET_RESCAN_FLAG will be set in ret value */ if (!(err & RET_RESCAN_FLAG)) - spin_unlock_irq(&pic->kvm->mmu_lock); + read_unlock(&pic->kvm->mmu_lock); else err &= ~RET_RESCAN_FLAG;
@@ -584,23 +584,31 @@ static int ept_idle_supports_cpu(struct kvm *kvm) if (!vcpu) return -EINVAL;
- spin_lock(&kvm->mmu_lock); + read_lock(&kvm->mmu_lock); mmu = kvm_arch_mmu_pointer(vcpu); if (kvm_mmu_ad_disabled(mmu)) { pr_notice("CPU does not support EPT A/D bits tracking\n"); ret = -EINVAL; - } else if (mmu->shadow_root_level < 4 || - (mmu->shadow_root_level == 5 && !pgtable_l5_enabled())) { - pr_notice("Unsupported EPT level %d\n", mmu->shadow_root_level); + } else if (mmu->root_role.level < 4 || + (mmu->root_role.level == 5 && !pgtable_l5_enabled())) { + pr_notice("Unsupported EPT level %d\n", mmu->root_role.level); ret = -EINVAL; } else ret = 0; - spin_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock);
return ret; }
#else +static inline phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? boundary : end; +} + static int arm_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -724,13 +732,13 @@ static int arm_page_range(struct page_idle_ctrl *pic,
WARN_ON(addr >= end);
- spin_lock(&pic->kvm->mmu_lock); + read_lock(&pic->kvm->mmu_lock); pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); - spin_unlock(&pic->kvm->mmu_lock); + read_unlock(&pic->kvm->mmu_lock);
local_irq_disable(); do { - next = stage2_pgd_addr_end(kvm, addr, end); + next = stage2_range_addr_end(addr, end); if (!pgd_present(*pgd)) { set_restart_gpa(next, "PGD_HOLE"); continue; @@ -773,11 +781,12 @@ static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, struct kvm_memory_slot *memslot; unsigned long hva_end; gfn_t gfn; + int bkt;
*addr_range = ~0UL; mutex_lock(&kvm->slots_lock); slots = kvm_memslots(pic->kvm); - kvm_for_each_memslot(memslot, slots) { + kvm_for_each_memslot(memslot, bkt, slots) { hva_end = memslot->userspace_addr + (memslot->npages << PAGE_SHIFT);
@@ -1045,9 +1054,9 @@ static int page_scan_release(struct inode *inode, struct file *file) goto out; } #ifdef CONFIG_X86_64 - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); #endif
out: @@ -1217,7 +1226,7 @@ static int mm_idle_walk_range(struct page_idle_ctrl *pic, return ret;
for (; start < end;) { - down_read(&walk->mm->mmap_lock); + mmap_read_lock(walk->mm); vma = find_vma(walk->mm, start); if (vma) { if (end > vma->vm_start) { @@ -1229,8 +1238,7 @@ static int mm_idle_walk_range(struct page_idle_ctrl *pic, set_restart_gpa(vma->vm_start, "VMA-HOLE"); } else set_restart_gpa(TASK_SIZE, "EOF"); - up_read(&walk->mm->mmap_lock); - + mmap_read_unlock(walk->mm); WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) pic->restart_gpa = end; diff --git a/include/linux/swap.h b/include/linux/swap.h index 45cbd02c909f..f620decea34e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -449,6 +449,7 @@ enum etmem_swapcache_watermark_en { ETMEM_SWAPCACHE_NR_WMARK };
+extern struct kobj_attribute kernel_swap_enable_attr; extern int add_page_for_swap(struct page *page, struct list_head *pagelist); extern struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr); @@ -752,6 +753,7 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) return vm_swap_full(); } #endif +extern unsigned long reclaim_pages(struct list_head *folio_list);
#endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/Makefile b/mm/Makefile index 0824907eab98..cc147c0d7ca0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_ETMEM) += etmem.o diff --git a/mm/etmem.c b/mm/etmem.c new file mode 100644 index 000000000000..4187fe7eef0c --- /dev/null +++ b/mm/etmem.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/memcontrol.h> +#include <linux/gfp.h> +#include <linux/mm_inline.h> + +#include "internal.h" + +static bool enable_kernel_swap __read_mostly = true; + +bool kernel_swap_enabled(void) +{ + return READ_ONCE(enable_kernel_swap); +} + +static ssize_t kernel_swap_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", enable_kernel_swap ? 
"true" : "false"); +} +static ssize_t kernel_swap_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + WRITE_ONCE(enable_kernel_swap, true); + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + WRITE_ONCE(enable_kernel_swap, false); + else + return -EINVAL; + + return count; +} + +struct kobj_attribute kernel_swap_enable_attr = + __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, + kernel_swap_enable_store); + +int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + int err = -EBUSY; + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EACCES; + + if (PageHuge(page)) + return -EACCES; + + head = compound_head(page); + if (!folio_isolate_lru(page_folio(head))) { + put_page(page); + return err; + } + put_page(page); + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add_tail(&head->lru, pagelist); + + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(add_page_for_swap); + +struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) +{ + struct page *page; + struct vm_area_struct *vma; + unsigned int follflags; + + mmap_read_lock(mm); + + vma = find_vma(mm, vaddr); + if (!vma || vaddr < vma->vm_start || vma->vm_flags & VM_LOCKED) { + mmap_read_unlock(mm); + return NULL; + } + + follflags = FOLL_GET | FOLL_DUMP | FOLL_FORCE; + page = follow_page(vma, vaddr, follflags); + if (IS_ERR(page) || !page) { + mmap_read_unlock(mm); + return NULL; + } + + mmap_read_unlock(mm); + return page; +} +EXPORT_SYMBOL_GPL(get_page_from_vaddr); + +static int add_page_for_reclaim_swapcache(struct page *page, + struct list_head *pagelist, struct lruvec *lruvec, enum lru_list lru) +{ + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EINVAL; + + if 
(PageHuge(page)) + return -EINVAL; + + head = compound_head(page); + if (!PageLRU(head) || PageUnevictable(head)) + return -EBUSY; + + if (unlikely(!get_page_unless_zero(page))) + return -EBUSY; + + if (!TestClearPageLRU(page)) { + /* + * This page may in other isolation path, + * but we still hold lru_lock. + */ + put_page(page); + return -EBUSY; + } + + list_move(&head->lru, pagelist); + update_lru_size(lruvec, lru, page_zonenum(head), -thp_nr_pages(head)); + + return 0; +} + +static unsigned long reclaim_swapcache_pages_from_list(int nid, + struct list_head *page_list, unsigned long reclaim_num, bool putback_flag) +{ + unsigned long nr_reclaimed = 0; + unsigned long nr_moved = 0; + struct page *page, *next; + LIST_HEAD(swap_pages); + struct pglist_data *pgdat = NULL; + + pgdat = NODE_DATA(nid); + + if (putback_flag) + goto putback_list; + + if (reclaim_num == 0) + return 0; + + list_for_each_entry_safe(page, next, page_list, lru) { + if (!page_is_file_lru(page) && !__PageMovable(page) + && PageSwapCache(page)) { + ClearPageActive(page); + list_move(&page->lru, &swap_pages); + nr_moved++; + } + + if (nr_moved >= reclaim_num) + break; + } + + /* swap the pages */ + if (pgdat) + nr_reclaimed = reclaim_pages(&swap_pages); + + return nr_reclaimed; + +putback_list: + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; +} + +#define SWAP_SCAN_NUM_MAX 32 + +static bool swapcache_below_watermark(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]; +} + +static unsigned long get_swapcache_reclaim_num(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() > + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ? + (total_swapcache_pages() - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0; +} + +/* + * The main function to reclaim swapcache, the whole reclaim process is + * divided into 3 steps. 
+ * 1. get the total_swapcache_pages num to reclaim. + * 2. scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. + * 3. reclaim the swapcache page until the requirements are meet. + */ +int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + int err = -EINVAL; + unsigned long swapcache_to_reclaim = 0; + unsigned long nr_reclaimed = 0; + unsigned long swapcache_total_reclaimable = 0; + unsigned long reclaim_page_count = 0; + + unsigned long *nr = NULL; + unsigned long *nr_to_reclaim = NULL; + struct list_head *swapcache_list = NULL; + + int nid = 0; + struct lruvec *lruvec = NULL; + struct list_head *src = NULL; + struct page *page = NULL; + struct page *next = NULL; + struct page *pos = NULL; + + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = NULL; + + pg_data_t *pgdat = NULL; + unsigned int scan_count = 0; + int nid_num = 0; + + if (swapcache_watermark == NULL || + watermark_nr < ETMEM_SWAPCACHE_NR_WMARK) + return err; + + /* get the total_swapcache_pages num to reclaim. */ + swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark); + if (swapcache_to_reclaim <= 0) + return err; + + nr = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr == NULL) + return -ENOMEM; + + nr_to_reclaim = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr_to_reclaim == NULL) { + kfree(nr); + return -ENOMEM; + } + + swapcache_list = kcalloc(MAX_NUMNODES, sizeof(struct list_head), GFP_KERNEL); + if (swapcache_list == NULL) { + kfree(nr); + kfree(nr_to_reclaim); + return -ENOMEM; + } + + /* + * scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. 
+ */ + for_each_node_state(nid, N_MEMORY) { + INIT_LIST_HEAD(&swapcache_list[nid_num]); + cond_resched(); + + pgdat = NODE_DATA(nid); + + memcg = mem_cgroup_iter(target_memcg, NULL, NULL); + do { + cond_resched(); + pos = NULL; + lruvec = mem_cgroup_lruvec(memcg, pgdat); + src = &(lruvec->lists[LRU_INACTIVE_ANON]); + spin_lock_irq(&lruvec->lru_lock); + scan_count = 0; + + /* + * Scan the swapcache pages that are not mapped from + * the end of the LRU linked list, scan SWAP_SCAN_NUM_MAX + * pages each time, and record the scan end point page. + */ + + pos = list_last_entry(src, struct page, lru); + spin_unlock_irq(&lruvec->lru_lock); +do_scan: + cond_resched(); + scan_count = 0; + spin_lock_irq(&lruvec->lru_lock); + + /* + * check if pos page is been released or not in LRU list, if true, + * cancel the subsequent page scanning of the current node. + */ + if (!pos || list_entry_is_head(pos, src, lru)) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + if (!PageLRU(pos) || folio_lru_list(page_folio(pos)) != LRU_INACTIVE_ANON) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + page = pos; + pos = NULL; + /* Continue to scan down from the last scan breakpoint */ + list_for_each_entry_safe_reverse_from(page, next, src, lru) { + scan_count++; + pos = next; + if (scan_count >= SWAP_SCAN_NUM_MAX) + break; + + if (!PageSwapCache(page)) + continue; + + if (page_mapped(page)) + continue; + + if (add_page_for_reclaim_swapcache(page, + &swapcache_list[nid_num], + lruvec, LRU_INACTIVE_ANON) != 0) + continue; + + nr[nid_num]++; + swapcache_total_reclaimable++; + } + spin_unlock_irq(&lruvec->lru_lock); + + /* + * Check whether the scanned pages meet + * the reclaim requirements. + */ + if (swapcache_total_reclaimable <= swapcache_to_reclaim || + scan_count >= SWAP_SCAN_NUM_MAX) + goto do_scan; + + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); + + /* Start reclaiming the next memory node. 
*/ + nid_num++; + } + + /* reclaim the swapcache page until the requirements are meet. */ + do { + nid_num = 0; + reclaim_page_count = 0; + + /* start swapcache page reclaim for each node. */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + + nr_to_reclaim[nid_num] = (swapcache_total_reclaimable == 0) ? 0 : + ((swapcache_to_reclaim * nr[nid_num]) / + swapcache_total_reclaimable); + + reclaim_page_count += reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], + nr_to_reclaim[nid_num], false); + nid_num++; + } + + nr_reclaimed += reclaim_page_count; + + /* + * Check whether the swapcache page reaches the reclaim requirement or + * the number of the swapcache page reclaimd is 0. Stop reclaim. + */ + if (nr_reclaimed >= swapcache_to_reclaim || reclaim_page_count == 0) + goto exit; + } while (!swapcache_below_watermark(swapcache_watermark) || + nr_reclaimed < swapcache_to_reclaim); +exit: + nid_num = 0; + /* + * Repopulate the swapcache pages that are not reclaimd back + * to the LRU linked list. + */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], 0, true); + nid_num++; + } + + kfree(nr); + kfree(nr_to_reclaim); + kfree(swapcache_list); + + return 0; +} +EXPORT_SYMBOL_GPL(do_swapcache_reclaim); diff --git a/mm/internal.h b/mm/internal.h index 68410c6d97ac..ba568b48072c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -783,7 +783,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long);
extern void set_pageblock_order(void); -unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 48a34514b235..4ce292e2aea3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -40,9 +40,6 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; -#ifdef CONFIG_ETMEM -static bool enable_kernel_swap __read_mostly = true; -#endif
#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -325,13 +322,6 @@ static inline bool swap_use_vma_readahead(void) return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); }
-#ifdef CONFIG_ETMEM -bool kernel_swap_enabled(void) -{ - return READ_ONCE(enable_kernel_swap); -} -#endif - /* * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel @@ -879,30 +869,6 @@ static ssize_t vma_ra_enabled_store(struct kobject *kobj, } static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
-#ifdef CONFIG_ETMEM -static ssize_t kernel_swap_enable_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", enable_kernel_swap ? "true" : "false"); -} -static ssize_t kernel_swap_enable_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - WRITE_ONCE(enable_kernel_swap, true); - else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - WRITE_ONCE(enable_kernel_swap, false); - else - return -EINVAL; - - return count; -} -static struct kobj_attribute kernel_swap_enable_attr = - __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, - kernel_swap_enable_store); -#endif - static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, #ifdef CONFIG_ETMEM diff --git a/mm/vmscan.c b/mm/vmscan.c index dda21e824349..0a8cd99f78a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,7 +33,6 @@ #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> -#include <linux/mempolicy.h> #include <linux/compaction.h> #include <linux/notifier.h> #include <linux/rwsem.h> @@ -6983,17 +6982,17 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, return false; }
-#ifdef CONFIG_ETMEM /* * Check if original kernel swap is enabled * turn off kernel swap,but leave page cache reclaim on */ -static inline void kernel_swap_check(struct scan_control *sc) +static inline void kernel_force_no_swap(struct scan_control *sc) { +#ifdef CONFIG_ETMEM if (sc != NULL && !kernel_swap_enabled()) sc->may_swap = 0; -} #endif +}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) @@ -7011,9 +7010,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_swap = 1, };
-#ifdef CONFIG_ETMEM - kernel_swap_check(&sc); -#endif + kernel_force_no_swap(&sc); /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. @@ -7451,9 +7448,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) sc.may_writepage = !laptop_mode && !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim;
-#ifdef CONFIG_ETMEM - kernel_swap_check(&sc); -#endif + kernel_force_no_swap(&sc);
/* * Do some background aging, to give pages a chance to be @@ -7833,9 +7828,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state);
-#ifdef CONFIG_ETMEM - kernel_swap_check(&sc); -#endif + kernel_force_no_swap(&sc);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -7994,9 +7987,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in cond_resched(); psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); -#ifdef CONFIG_ETMEM - kernel_swap_check(&sc); -#endif /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ @@ -8133,372 +8123,3 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); - -#ifdef CONFIG_ETMEM -int add_page_for_swap(struct page *page, struct list_head *pagelist) -{ - int err = -EBUSY; - struct page *head; - - /* If the page is mapped by more than one process, do not swap it */ - if (page_mapcount(page) > 1) - return -EACCES; - - if (PageHuge(page)) - return -EACCES; - - head = compound_head(page); - err = isolate_lru_page(head); - if (err) { - put_page(page); - return err; - } - put_page(page); - if (PageUnevictable(page)) - putback_lru_page(page); - else - list_add_tail(&head->lru, pagelist); - - err = 0; - return err; -} -EXPORT_SYMBOL_GPL(add_page_for_swap); - -struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) -{ - struct page *page; - struct vm_area_struct *vma; - unsigned int follflags; - - down_read(&mm->mmap_lock); - - vma = find_vma(mm, vaddr); - if (!vma || vaddr < vma->vm_start || vma->vm_flags & VM_LOCKED) { - up_read(&mm->mmap_lock); - return NULL; - } - - follflags = FOLL_GET | FOLL_DUMP; - page = follow_page(vma, vaddr, follflags); - if (IS_ERR(page) || !page) { - up_read(&mm->mmap_lock); - return NULL; - } - - up_read(&mm->mmap_lock); - return page; -} -EXPORT_SYMBOL_GPL(get_page_from_vaddr); - -static int add_page_for_reclaim_swapcache(struct page *page, - struct list_head *pagelist, struct lruvec *lruvec, enum lru_list lru) -{ - struct page *head; - - /* If the page is mapped by more than one process, do not swap it */ - if (page_mapcount(page) > 1) - return -EACCES; - - if (PageHuge(page)) - return -EACCES; - - head = compound_head(page); - - 
switch (__isolate_lru_page_prepare(head, 0)) { - case 0: - if (unlikely(!get_page_unless_zero(page))) - return -1; - - if (!TestClearPageLRU(page)) { - /* - * This page may in other isolation path, - * but we still hold lru_lock. - */ - put_page(page); - return -1; - } - - list_move(&head->lru, pagelist); - update_lru_size(lruvec, lru, page_zonenum(head), -thp_nr_pages(head)); - break; - - case -EBUSY: - return -1; - default: - break; - } - - return 0; -} - -static unsigned long reclaim_swapcache_pages_from_list(int nid, - struct list_head *page_list, unsigned long reclaim_num, bool putback_flag) -{ - struct scan_control sc = { - .may_unmap = 1, - .may_swap = 1, - .may_writepage = 1, - .gfp_mask = GFP_KERNEL, - }; - unsigned long nr_reclaimed = 0; - unsigned long nr_moved = 0; - struct page *page, *next; - LIST_HEAD(swap_pages); - struct pglist_data *pgdat = NULL; - struct reclaim_stat stat; - - pgdat = NODE_DATA(nid); - - if (putback_flag) - goto putback_list; - - if (reclaim_num == 0) - return 0; - - list_for_each_entry_safe(page, next, page_list, lru) { - if (!page_is_file_lru(page) && !__PageMovable(page) - && PageSwapCache(page)) { - ClearPageActive(page); - list_move(&page->lru, &swap_pages); - nr_moved++; - } - - if (nr_moved >= reclaim_num) - break; - } - - /* swap the pages */ - if (pgdat) - nr_reclaimed = shrink_page_list(&swap_pages, - pgdat, - &sc, - &stat, true); - - while (!list_empty(&swap_pages)) { - page = lru_to_page(&swap_pages); - list_del(&page->lru); - putback_lru_page(page); - } - - return nr_reclaimed; - -putback_list: - while (!list_empty(page_list)) { - page = lru_to_page(page_list); - list_del(&page->lru); - putback_lru_page(page); - } - - return nr_reclaimed; -} - -#define SWAP_SCAN_NUM_MAX 32 - -static bool swapcache_below_watermark(unsigned long *swapcache_watermark) -{ - return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]; -} - -static unsigned long get_swapcache_reclaim_num(unsigned long 
*swapcache_watermark) -{ - return total_swapcache_pages() > - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ? - (total_swapcache_pages() - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0; -} - -/* - * The main function to reclaim swapcache, the whole reclaim process is - * divided into 3 steps. - * 1. get the total_swapcache_pages num to reclaim. - * 2. scan the LRU linked list of each memory node to obtain the - * swapcache pages that can be reclaimd. - * 3. reclaim the swapcache page until the requirements are meet. - */ -int do_swapcache_reclaim(unsigned long *swapcache_watermark, - unsigned int watermark_nr) -{ - int err = -EINVAL; - unsigned long swapcache_to_reclaim = 0; - unsigned long nr_reclaimed = 0; - unsigned long swapcache_total_reclaimable = 0; - unsigned long reclaim_page_count = 0; - - unsigned long *nr = NULL; - unsigned long *nr_to_reclaim = NULL; - struct list_head *swapcache_list = NULL; - - int nid = 0; - struct lruvec *lruvec = NULL; - struct list_head *src = NULL; - struct page *page = NULL; - struct page *next = NULL; - struct page *pos = NULL; - - struct mem_cgroup *memcg = NULL; - struct mem_cgroup *target_memcg = NULL; - - pg_data_t *pgdat = NULL; - unsigned int scan_count = 0; - int nid_num = 0; - - if (swapcache_watermark == NULL || - watermark_nr < ETMEM_SWAPCACHE_NR_WMARK) - return err; - - /* get the total_swapcache_pages num to reclaim. 
*/ - swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark); - if (swapcache_to_reclaim <= 0) - return err; - - nr = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); - if (nr == NULL) - return -ENOMEM; - - nr_to_reclaim = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); - if (nr_to_reclaim == NULL) { - kfree(nr); - return -ENOMEM; - } - - swapcache_list = kcalloc(MAX_NUMNODES, sizeof(struct list_head), GFP_KERNEL); - if (swapcache_list == NULL) { - kfree(nr); - kfree(nr_to_reclaim); - return -ENOMEM; - } - - /* - * scan the LRU linked list of each memory node to obtain the - * swapcache pages that can be reclaimd. - */ - for_each_node_state(nid, N_MEMORY) { - INIT_LIST_HEAD(&swapcache_list[nid_num]); - cond_resched(); - - pgdat = NODE_DATA(nid); - - memcg = mem_cgroup_iter(target_memcg, NULL, NULL); - do { - cond_resched(); - pos = NULL; - lruvec = mem_cgroup_lruvec(memcg, pgdat); - src = &(lruvec->lists[LRU_INACTIVE_ANON]); - spin_lock_irq(&lruvec->lru_lock); - scan_count = 0; - - /* - * Scan the swapcache pages that are not mapped from - * the end of the LRU linked list, scan SWAP_SCAN_NUM_MAX - * pages each time, and record the scan end point page. - */ - - pos = list_last_entry(src, struct page, lru); - spin_unlock_irq(&lruvec->lru_lock); -do_scan: - cond_resched(); - scan_count = 0; - spin_lock_irq(&lruvec->lru_lock); - - /* - * check if pos page is been released or not in LRU list, if true, - * cancel the subsequent page scanning of the current node. 
- */ - if (!pos || list_entry_is_head(pos, src, lru)) { - spin_unlock_irq(&lruvec->lru_lock); - continue; - } - - if (!PageLRU(pos) || page_lru(pos) != LRU_INACTIVE_ANON) { - spin_unlock_irq(&lruvec->lru_lock); - continue; - } - - page = pos; - pos = NULL; - /* Continue to scan down from the last scan breakpoint */ - list_for_each_entry_safe_reverse_from(page, next, src, lru) { - scan_count++; - pos = next; - if (scan_count >= SWAP_SCAN_NUM_MAX) - break; - - if (!PageSwapCache(page)) - continue; - - if (page_mapped(page)) - continue; - - if (add_page_for_reclaim_swapcache(page, - &swapcache_list[nid_num], - lruvec, LRU_INACTIVE_ANON) != 0) - continue; - - nr[nid_num]++; - swapcache_total_reclaimable++; - } - spin_unlock_irq(&lruvec->lru_lock); - - /* - * Check whether the scanned pages meet - * the reclaim requirements. - */ - if (swapcache_total_reclaimable <= swapcache_to_reclaim || - scan_count >= SWAP_SCAN_NUM_MAX) - goto do_scan; - - } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); - - /* Start reclaiming the next memory node. */ - nid_num++; - } - - /* reclaim the swapcache page until the requirements are meet. */ - do { - nid_num = 0; - reclaim_page_count = 0; - - /* start swapcache page reclaim for each node. */ - for_each_node_state(nid, N_MEMORY) { - cond_resched(); - - nr_to_reclaim[nid_num] = (swapcache_total_reclaimable == 0) ? 0 : - ((swapcache_to_reclaim * nr[nid_num]) / - swapcache_total_reclaimable); - - reclaim_page_count += reclaim_swapcache_pages_from_list(nid, - &swapcache_list[nid_num], - nr_to_reclaim[nid_num], false); - nid_num++; - } - - nr_reclaimed += reclaim_page_count; - - /* - * Check whether the swapcache page reaches the reclaim requirement or - * the number of the swapcache page reclaimd is 0. Stop reclaim. 
- */ - if (nr_reclaimed >= swapcache_to_reclaim || reclaim_page_count == 0) - goto exit; - } while (!swapcache_below_watermark(swapcache_watermark) || - nr_reclaimed < swapcache_to_reclaim); -exit: - nid_num = 0; - /* - * Repopulate the swapcache pages that are not reclaimd back - * to the LRU linked list. - */ - for_each_node_state(nid, N_MEMORY) { - cond_resched(); - reclaim_swapcache_pages_from_list(nid, - &swapcache_list[nid_num], 0, true); - nid_num++; - } - - kfree(nr); - kfree(nr_to_reclaim); - kfree(swapcache_list); - - return 0; -} -EXPORT_SYMBOL_GPL(do_swapcache_reclaim); -#endif
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/1952 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/F...
Feedback: The patch(es) that you sent to the kernel@openeuler.org mailing list have been successfully converted to a pull request! Pull request link: https://gitee.com/openeuler/kernel/pulls/1952 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/F...