Kemeng Shi (2):
  x86: hugepage: use nt copy hugepage to AEP in x86
  hugepage: add sysctl for hugepage alloc and mig
Yuchen Tang (6):
  etmem: add etmem scan feature to openEuler
  etmem: add etmem swap feature to openEuler
  mm: introduce and export pgdat peer_node
  mm: alloc pages from pmem's peer node
  etmem: add original kernel swap enabled options
  etmem: add swapcache reclaim to etmem
 arch/arm64/configs/openeuler_defconfig | 3 +
 arch/x86/configs/openeuler_defconfig   | 3 +
 arch/x86/include/asm/page_64.h         | 9 +
 arch/x86/lib/Makefile                  | 1 +
 arch/x86/lib/copy_highpages.c          | 107 ++
 arch/x86/lib/copy_page_64.S            | 73 ++
 drivers/acpi/numa/srat.c               | 7 +
 drivers/base/node.c                    | 15 +
 fs/proc/Makefile                       | 3 +
 fs/proc/base.c                         | 8 +
 fs/proc/etmem_proc.c                   | 216 ++++
 fs/proc/etmem_scan.c                   | 1440 ++++++++++++++++++++++++
 fs/proc/etmem_scan.h                   | 149 +++
 fs/proc/etmem_swap.c                   | 283 +++++
 fs/proc/internal.h                     | 4 +
 include/linux/etmem.h                  | 105 ++
 include/linux/highmem.h                | 17 +
 include/linux/mm.h                     | 4 +
 include/linux/mm_types.h               | 6 +
 include/linux/mmzone.h                 | 14 +
 include/linux/swap.h                   | 2 +-
 include/uapi/asm-generic/mman-common.h | 4 +
 mm/Kconfig                             | 33 +
 mm/Makefile                            | 1 +
 mm/etmem.c                             | 475 ++++++++
 mm/hugetlb.c                           | 9 +
 mm/madvise.c                           | 16 +
 mm/mm_init.c                           | 4 +
 mm/pagewalk.c                          | 3 +
 mm/sparse-vmemmap.c                    | 10 +
 mm/swap_state.c                        | 4 +
 mm/util.c                              | 5 +
 mm/vmscan.c                            | 21 +
 virt/kvm/kvm_main.c                    | 6 +
 34 files changed, 3059 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/lib/copy_highpages.c
 create mode 100644 fs/proc/etmem_proc.c
 create mode 100644 fs/proc/etmem_scan.c
 create mode 100644 fs/proc/etmem_scan.h
 create mode 100644 fs/proc/etmem_swap.c
 create mode 100644 include/linux/etmem.h
 create mode 100644 mm/etmem.c
euleros inclusion
category: feature
feature: Classify page hotness through page table scanning
bugzilla: NA
CVE: NA
-------------------------------------------------
This patch proposes the etmem feature to openEuler. etmem periodically scans the vma segments of the target process by walking its page tables and checking the accessed bit of each page, then reports the scan results to user space. This allows pages to be classified by hotness more accurately, so that hot pages can be migrated to the fast memory tier and cold pages to the slow memory tier.
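The results are exposed through a new per-task file, /proc/<pid>/idle_pages: each read(2) returns a byte stream that starts with a PIP_CMD_SET_HVA record (one command byte followed by a big-endian 64-bit virtual address) and then one byte per run, with the page type in the high nibble and the run length in the low nibble. The reader below is only a minimal illustrative sketch and is not part of this patch; the constants mirror the definitions added in fs/proc/etmem_scan.h, and the decoding follows pic_report_addr() and u64_to_u8() in fs/proc/etmem_scan.c.

/* idle_pages_decode.c - illustrative only, not part of this patch */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define PIP_TYPE(a)     (0xf & ((a) >> 4))
#define PIP_SIZE(a)     (0xf & (a))
#define PIP_CMD_SET_HVA 0xa0            /* PIP_COMPOSE(PIP_CMD, 0) */

static const char *const type_name[] = {
	"PTE_ACCESSED", "PMD_ACCESSED", "PUD_PRESENT",
	"PTE_DIRTY_M", "PMD_DIRTY_M",
	"PTE_IDLE", "PMD_IDLE", "PMD_IDLE_PTES",
	"PTE_HOLE", "PMD_HOLE", "PIP_CMD",
};

int main(int argc, char **argv)
{
	uint8_t buf[8192];
	char path[64];
	ssize_t n, i;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/idle_pages", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* the file offset selects the virtual address where scanning starts */
	n = read(fd, buf, sizeof(buf));
	for (i = 0; i < n; i++) {
		if (buf[i] == PIP_CMD_SET_HVA && i + 8 < n) {
			uint64_t va = 0;
			int j;

			for (j = 0; j < 8; j++)	/* big-endian, see u64_to_u8() */
				va = (va << 8) | buf[i + 1 + j];
			printf("region start: 0x%llx\n", (unsigned long long)va);
			i += 8;
			continue;
		}
		printf("  %-14s x %u\n",
		       type_name[PIP_TYPE(buf[i])], PIP_SIZE(buf[i]));
	}
	close(fd);
	return 0;
}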
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: yanxiaodan <yanxiaodan@huawei.com>
Signed-off-by: Feilong Lin <linfeilong@huawei.com>
Signed-off-by: geruijun <geruijun@huawei.com>
Signed-off-by: liubo <liubo254@huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Yuchen Tang <tangyuchen5@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig | 2 +
 arch/x86/configs/openeuler_defconfig   | 2 +
 fs/proc/Makefile                       | 2 +
 fs/proc/base.c                         | 6 +
 fs/proc/etmem_proc.c                   | 122 ++
 fs/proc/etmem_scan.c                   | 1440 ++++++++++++++++++++++++
 fs/proc/etmem_scan.h                   | 149 +++
 fs/proc/internal.h                     | 3 +
 include/linux/etmem.h                  | 27 +
 include/linux/mm.h                     | 4 +
 include/linux/mm_types.h               | 6 +
 include/uapi/asm-generic/mman-common.h | 4 +
 mm/Kconfig                             | 23 +
 mm/madvise.c                           | 16 +
 mm/pagewalk.c                          | 3 +
 virt/kvm/kvm_main.c                    | 6 +
 16 files changed, 1815 insertions(+)
 create mode 100644 fs/proc/etmem_proc.c
 create mode 100644 fs/proc/etmem_scan.c
 create mode 100644 fs/proc/etmem_scan.h
 create mode 100644 include/linux/etmem.h
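Scan behaviour is tuned per open file descriptor through ioctls on the same idle_pages file (see page_scan_ioctl() and the flag definitions in fs/proc/etmem_scan.h). The sketch below is purely illustrative and not part of this patch; it only mirrors constants introduced by the patch.

/* idle_pages_flags.c - illustrative only, not part of this patch */
#define _GNU_SOURCE                     /* for O_NOATIME */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* constants mirrored from fs/proc/etmem_scan.h */
#define IDLE_SCAN_MAGIC        0x66
#define IDLE_SCAN_ADD_FLAGS    _IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int)
#define IDLE_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int)

#define SCAN_HUGE_PAGE  O_NONBLOCK  /* only report huge pages */
#define SCAN_DIRTY_PAGE O_NOATIME   /* also report pte/pmd dirty bits (VM scan) */

int main(int argc, char **argv)
{
	unsigned int flags = SCAN_DIRTY_PAGE;
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/idle_pages", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* page_scan_ioctl() fetches the flag word with get_user(),
	 * so a pointer to the value is passed as the ioctl argument.
	 */
	if (ioctl(fd, IDLE_SCAN_ADD_FLAGS, &flags))
		perror("ioctl(IDLE_SCAN_ADD_FLAGS)");

	/* ... subsequent read(2) calls now also report dirty pages ... */

	close(fd);
	return 0;
}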
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9d2f717c1f7c..01a5c61cebbb 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1071,6 +1071,8 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM=y CONFIG_CMA=y # CONFIG_CMA_DEBUG is not set # CONFIG_CMA_DEBUGFS is not set diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index b40100e16683..dafa73172937 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1093,6 +1093,8 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM=y # CONFIG_CMA is not set CONFIG_MEM_SOFT_DIRTY=y CONFIG_GENERIC_EARLY_IOREMAP=y diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8ba..b9a7bc7d8a75 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -34,3 +34,5 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o +obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o +proc-${CONFIG_ETMEM} += etmem_proc.o diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..e634c61ba1ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3285,6 +3285,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif +#ifdef CONFIG_ETMEM + REG("idle_pages", 0600, proc_mm_idle_operations), +#endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif @@ -3634,6 +3637,9 @@ static const struct pid_entry tid_base_stuff[] = { REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif +#ifdef CONFIG_ETMEM + REG("idle_pages", 0600, proc_mm_idle_operations), +#endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif diff --git a/fs/proc/etmem_proc.c b/fs/proc/etmem_proc.c new file mode 100644 index 000000000000..edacb9260345 --- /dev/null +++ b/fs/proc/etmem_proc.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/seq_file.h> +#include <linux/pagemap.h> +#include <linux/mempolicy.h> +#include <linux/swap.h> +#include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> +#include <linux/uaccess.h> +#include <linux/pkeys.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/etmem.h> + +#include "internal.h" + +static DEFINE_SPINLOCK(scan_lock); + +static int page_scan_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&scan_lock); + else + spin_unlock(&scan_lock); + + return 0; +} + +/* will be filled when kvm_ept_idle module loads */ +struct file_operations proc_page_scan_operations = { + .flock = page_scan_lock, +}; +EXPORT_SYMBOL_GPL(proc_page_scan_operations); + +static ssize_t mm_idle_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + return ret; + } + if (proc_page_scan_operations.read) + 
ret = proc_page_scan_operations.read(file, buf, count, ppos); + + mmput(mm); + return ret; +} + +static int mm_idle_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + page_scan_lock(NULL, 1, NULL); + module = proc_page_scan_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_scan_lock(NULL, 0, NULL); + if (ret != 0) { + /* no scan ko installed, avoid to return valid file */ + return -ENODEV; + } + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) { + module_put(module); + return PTR_ERR(mm); + } + + file->private_data = mm; + + if (proc_page_scan_operations.open) + ret = proc_page_scan_operations.open(inode, file); + + if (ret != 0) + module_put(module); + + return ret; +} + +static int mm_idle_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (mm) { + if (!mm_kvm(mm)) + flush_tlb_mm(mm); + mmdrop(mm); + } + + if (proc_page_scan_operations.release) + ret = proc_page_scan_operations.release(inode, file); + + if (proc_page_scan_operations.owner) + module_put(proc_page_scan_operations.owner); + + return ret; +} + +static long mm_idle_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_page_scan_operations.unlocked_ioctl) + return proc_page_scan_operations.unlocked_ioctl(filp, cmd, arg); + + return 0; +} + +const struct file_operations proc_mm_idle_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = mm_idle_read, + .open = mm_idle_open, + .release = mm_idle_release, + .unlocked_ioctl = mm_idle_ioctl, +}; diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c new file mode 100644 index 000000000000..1e9677df9401 --- /dev/null +++ b/fs/proc/etmem_scan.c @@ -0,0 +1,1440 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/pagemap.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include <linux/uaccess.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/bitmap.h> +#include <linux/sched/mm.h> +#include <linux/version.h> +#include <linux/module.h> +#include <linux/io.h> +#include <linux/pagewalk.h> +#include <linux/uaccess.h> +#include <linux/align.h> +#include <asm/cacheflush.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <linux/huge_mm.h> +#include <linux/etmem.h> +#ifdef CONFIG_ARM64 +#include <asm/pgtable-types.h> +#include <asm/memory.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_pgtable.h> +#include <asm/stage2_pgtable.h> +#endif +#include "etmem_scan.h" +#include <linux/hugetlb_inline.h> + +#ifdef CONFIG_X86_64 +/* + * Fallback to false for kernel doens't support KVM_INVALID_SPTE + * ept_idle can sitll work in this situation but the scan accuracy may drop, + * depends on the access frequences of the workload. 
+ */ +#ifdef KVM_INVALID_SPTE +#define KVM_CHECK_INVALID_SPTE(val) ((val) == KVM_INVALID_SPTE) +#else +#define KVM_CHECK_INVALID_SPTE(val) (0) +#endif + +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +#endif /*CONFIG_X86_64*/ + +#ifdef CONFIG_ARM64 +#define if_pmd_thp_or_huge(pmd) (if_pmd_huge(pmd) || pmd_trans_huge(pmd)) +#endif /* CONFIG_ARM64 */ + +#ifdef DEBUG + +#define debug_printk trace_printk + +#define set_restart_gpa(val, note) ({ \ + unsigned long old_val = pic->restart_gpa; \ + pic->restart_gpa = (val); \ + trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ + (val), (pic->restart_gpa - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#define set_next_hva(val, note) ({ \ + unsigned long old_val = pic->next_hva; \ + pic->next_hva = (val); \ + trace_printk(" next_hva=%lx %luK %s %s %d\n", \ + (val), (pic->next_hva - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#else + +#define debug_printk(...) + +#define set_restart_gpa(val, note) ({ \ + pic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note) ({ \ + pic->next_hva = (val); \ +}) + +#endif + +#define RET_RESCAN_FLAG 0x10000 + +/* error return IDLE_PAGE_TYPE_MAX or return valid page type */ +enum ProcIdlePageType (*vm_handle_pte_hole)(unsigned long addr, + unsigned long next, int depth, struct mm_walk *walk) = NULL; +EXPORT_SYMBOL_GPL(vm_handle_pte_hole); + +static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, + phys_addr_t size) +{ + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? boundary : end; +} + +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + + return __stage2_range_addr_end(addr, end, size); +} + + +static int set_walk_step(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned int n; + + ret = kstrtouint(val, 0, &n); + if (ret != 0 || n == 0) + return -EINVAL; + + return param_set_uint(val, kp); +} + +static struct kernel_param_ops walk_step_ops = { + .set = set_walk_step, + .get = param_get_uint, +}; + +static unsigned int __read_mostly walk_step = 512; // in PAGE_SIZE +module_param_cb(walk_step, &walk_step_ops, &walk_step, 0644); + +static unsigned int resched_step = 10; +module_param(resched_step, uint, 0644); + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY_M] = PAGE_SIZE, + [PMD_DIRTY_M] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_pic(struct page_idle_ctrl *pic) +{ + debug_printk("page_idle_ctrl: pie_read=%d pie_read_max=%d", + pic->pie_read, + pic->pie_read_max); + debug_printk(" buf_size=%d bytes_copied=%d next_hva=%pK", + pic->buf_size, + pic->bytes_copied, + pic->next_hva); + debug_printk(" restart_gpa=%pK pa_to_hva=%pK\n", + pic->restart_gpa, + pic->gpa_to_hva); +} + +#ifdef CONFIG_ARM64 +static int if_pmd_huge(pmd_t pmd) +{ + return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); +} + +static int if_pud_huge(pud_t pud) +{ 
+#ifndef __PAGETABLE_PMD_FOLDED + return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT); +#else + return 0; +#endif +} +#endif + +static void pic_report_addr(struct page_idle_ctrl *pic, unsigned long addr) +{ + unsigned long hva; + + pic->kpie[pic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &pic->kpie[pic->pie_read]); + pic->pie_read += sizeof(uint64_t); + dump_pic(pic); +} + +static int pic_add_page(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + unsigned long page_size = pagetype_size[page_type]; + + dump_pic(pic); + + /* align kernel/user vision of cursor position */ + next = round_up(next, page_size); + + if (!pic->pie_read || + addr + pic->gpa_to_hva != pic->next_hva) { + /* merge hole */ + if (page_type == PTE_HOLE || + page_type == PMD_HOLE) { + set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); + return 0; + } + + if (addr + pic->gpa_to_hva < pic->next_hva) { + debug_printk("page_idle: addr moves backwards\n"); + WARN_ONCE(1, "page_idle: addr moves backwards"); + } + + if (pic->pie_read + sizeof(uint64_t) + 2 >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + + pic_report_addr(pic, round_down(addr, page_size) + + pic->gpa_to_hva); + } else { + if (PIP_TYPE(pic->kpie[pic->pie_read - 1]) == page_type && + PIP_SIZE(pic->kpie[pic->pie_read - 1]) < 0xF) { + set_next_hva(next + pic->gpa_to_hva, "IN-PLACE INC"); + set_restart_gpa(next, "IN-PLACE INC"); + pic->kpie[pic->pie_read - 1]++; + WARN_ONCE(page_size < next-addr, "next-addr too large"); + return 0; + } + if (pic->pie_read >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + } + + set_next_hva(next + pic->gpa_to_hva, "NEW-ITEM"); + set_restart_gpa(next, "NEW-ITEM"); + pic->kpie[pic->pie_read] = PIP_COMPOSE(page_type, 1); + pic->pie_read++; + + return 0; +} + +static int init_page_idle_ctrl_buffer(struct page_idle_ctrl *pic) +{ + pic->pie_read = 0; + pic->pie_read_max = min(PAGE_IDLE_KBUF_SIZE, + pic->buf_size - pic->bytes_copied); + /* reserve space for PIP_CMD_SET_HVA in the end */ + pic->pie_read_max -= sizeof(uint64_t) + 1; + + /* + * Align with PAGE_IDLE_KBUF_FULL + * logic in pic_add_page(), to avoid pic->pie_read = 0 when + * PAGE_IDLE_KBUF_FULL happened. 
+ */ + if (pic->pie_read_max <= sizeof(uint64_t) + 2) + return PAGE_IDLE_KBUF_FULL; + + memset(pic->kpie, 0, sizeof(pic->kpie)); + return 0; +} + +static void setup_page_idle_ctrl(struct page_idle_ctrl *pic, void *buf, + int buf_size, unsigned int flags) +{ + pic->buf = buf; + pic->buf_size = buf_size; + pic->bytes_copied = 0; + pic->next_hva = 0; + pic->gpa_to_hva = 0; + pic->restart_gpa = 0; + pic->last_va = 0; + pic->flags = flags; +} + +static int page_idle_copy_user(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end) +{ + int bytes_read; + int ret; + + dump_pic(pic); + + bytes_read = pic->pie_read; + if (!bytes_read) + return 0; + + ret = copy_to_user(pic->buf, pic->kpie, bytes_read); + if (ret) + return -EFAULT; + + pic->buf += bytes_read; + pic->bytes_copied += bytes_read; + if (pic->bytes_copied >= pic->buf_size) + return PAGE_IDLE_BUF_FULL; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + cond_resched(); + return 0; +} + +#ifdef CONFIG_X86_64 +static int vm_walk_host_range(unsigned long long start, + unsigned long end, + struct mm_walk *walk) +{ + int ret; + struct page_idle_ctrl *pic = walk->private; + unsigned long tmp_gpa_to_hva = pic->gpa_to_hva; + + pic->gpa_to_hva = 0; + +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else + spin_unlock_irq(&pic->kvm->mmu_lock); +#endif + down_read(&walk->mm->mmap_lock); + local_irq_disable(); + ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, + walk->ops, walk->private); + local_irq_enable(); + up_read(&walk->mm->mmap_lock); + pic->gpa_to_hva = tmp_gpa_to_hva; + if (pic->flags & VM_SCAN_HOST) { + pic->restart_gpa -= tmp_gpa_to_hva; + pic->flags &= ~VM_SCAN_HOST; + } + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + + /* ept page table may change after spin_unlock, rescan vm from root ept */ + ret |= RET_RESCAN_FLAG; + + return ret; +} + +static int ept_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (KVM_CHECK_INVALID_SPTE(pte->pte)) { + page_type = PTE_IDLE; + } else if (!ept_pte_present(*pte)) { + err = vm_walk_host_range(addr, end, walk); + goto next; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + if (pic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) &pte->pte)) + page_type = PTE_DIRTY_M; + } + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); +next: + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static enum ProcIdlePageType ept_huge_accessed(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + int accessed = PMD_IDLE; + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!KVM_CHECK_INVALID_SPTE(pte->pte)) + continue; + if (!ept_pte_present(*pte)) + continue; + if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)&pte->pte)) + continue; + accessed = PMD_ACCESSED; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return accessed; +} + +static int ept_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int 
err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) + page_type = PMD_IDLE; + else if (!ept_pmd_present(*pmd)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } else if (!pmd_large(*pmd)) { + if (pic->flags & SCAN_AS_HUGE) + page_type = ept_huge_accessed(pmd, addr, next); + else + page_type = pte_page_type; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else { + page_type = PMD_ACCESSED; + if ((pic->flags & SCAN_DIRTY_PAGE) && + test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) pmd)) + page_type = PMD_DIRTY_M; + } + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = ept_pte_range(pic, pmd, addr, next, walk); + +next: + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + + +static int ept_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (!ept_pud_present(*pud)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } + + if (pud_large(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = ept_pmd_range(pic, pud, addr, next, walk); + +next: + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int ept_p4d_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + unsigned long next; + int err = 0; + + p4d += p4d_index(addr); + do { + next = p4d_addr_end(addr, end); + if (!ept_p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = ept_pud_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int ept_pgd_range(struct page_idle_ctrl *pic, + pgd_t *pgd, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + pgd = pgd_offset_pgd(pgd, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + err = ept_p4d_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + +static int ept_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + uint64_t *ept_root; + int err = 0; + + WARN_ON(addr >= end); + +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock_irq(&pic->kvm->mmu_lock); +#else + spin_lock_irq(&pic->kvm->mmu_lock); +#endif + + vcpu = kvm_get_vcpu(pic->kvm, 0); + if (!vcpu) { + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-VCPU"); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else + spin_unlock_irq(&pic->kvm->mmu_lock); +#endif + return -EINVAL; + } + + mmu = kvm_arch_mmu_pointer(vcpu); + if (!VALID_PAGE(mmu->root_hpa)) { + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-HPA"); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else + spin_unlock_irq(&pic->kvm->mmu_lock); +#endif + return -EINVAL; + } + + ept_root = 
__va(mmu->root_hpa); + + /* Walk start at p4d when vm has 4 level table pages */ + if (mmu->shadow_root_level != 4) + err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); + else + err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); + + /* mmu_lock is unlock in vm_walk_host_range which will unlock mmu_lock + * and RET_RESCAN_FLAG will be set in ret value + */ + if (!(err & RET_RESCAN_FLAG)) +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else + spin_unlock_irq(&pic->kvm->mmu_lock); +#endif + else + err &= ~RET_RESCAN_FLAG; + + return err; +} + +static int ept_idle_supports_cpu(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + int ret; + + vcpu = kvm_get_vcpu(kvm, 0); + if (!vcpu) + return -EINVAL; + +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&kvm->mmu_lock); +#else + spin_lock(&kvm->mmu_lock); +#endif + mmu = kvm_arch_mmu_pointer(vcpu); + if (kvm_mmu_ad_disabled(mmu)) { + pr_notice("CPU does not support EPT A/D bits tracking\n"); + ret = -EINVAL; + } else if (mmu->shadow_root_level < 4 || + (mmu->shadow_root_level == 5 && !pgtable_l5_enabled())) { + pr_notice("Unsupported EPT level %d\n", mmu->root_role.level); + ret = -EINVAL; + } else + ret = 0; +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&kvm->mmu_lock); +#else + spin_unlock(&kvm->mmu_lock); +#endif + + return ret; +} + +#else +static int arm_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else + page_type = PTE_ACCESSED; + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static int arm_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!if_pmd_thp_or_huge(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = arm_pte_range(pic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int arm_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (if_pud_huge(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = arm_pmd_range(pic, pud, addr, next); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int arm_p4d_range(struct page_idle_ctrl *pic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = 
p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = arm_pud_range(pic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int arm_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + struct kvm *kvm = pic->kvm; + int err = 0; + + WARN_ON(addr >= end); + +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&pic->kvm->mmu_lock); +#else + spin_lock(&pic->kvm->mmu_lock); +#endif + pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&pic->kvm->mmu_lock); +#else + spin_unlock(&pic->kvm->mmu_lock); +#endif + + local_irq_disable(); + do { + next = stage2_range_addr_end(addr, end); + if (!pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = arm_p4d_range(pic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + local_irq_enable(); + return err; +} +#endif + +/* + * Depending on whether hva falls in a memslot: + * + * 1) found => return gpa and remaining memslot size in *addr_range + * + * |<----- addr_range --------->| + * [ mem slot ] + * ^hva + * + * 2) not found => return hole size in *addr_range + * + * |<----- addr_range --------->| + * [first mem slot above hva ] + * ^hva + * + * If hva is above all mem slots, *addr_range will be ~0UL. + * We can finish read(2). + */ +static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, + unsigned long hva, + unsigned long *addr_range) +{ + struct kvm *kvm = pic->kvm; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + unsigned long hva_end; + int bkt; + gfn_t gfn; + + *addr_range = ~0UL; + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(pic->kvm); + kvm_for_each_memslot(memslot, bkt, slots) { + hva_end = memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT); + + if (hva >= memslot->userspace_addr && hva < hva_end) { + gpa_t gpa; + + gfn = hva_to_gfn_memslot(hva, memslot); + *addr_range = hva_end - hva; + gpa = gfn_to_gpa(gfn); + mutex_unlock(&kvm->slots_lock); + return gpa; + } + + if (memslot->userspace_addr > hva) + *addr_range = min(*addr_range, + memslot->userspace_addr - hva); + } + mutex_unlock(&kvm->slots_lock); + return INVALID_PAGE; +} + +static inline unsigned long mask_to_size(unsigned long mask) +{ + return ~mask + 1; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); +static int vm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + + pic->flags |= VM_SCAN_HOST; + + /* hugetlb page table entry of vm maybe not present while page is resident + * in address_space + */ + if (mask_to_size(hmask) != PUD_SIZE && !pte_present(*pte) && + vm_handle_pte_hole != NULL) { + page_type = vm_handle_pte_hole(addr, next, -1, walk); + if (page_type < IDLE_PAGE_TYPE_MAX) + return pic_add_page(pic, addr, next, page_type); + } + + return mm_idle_hugetlb_entry(pte, hmask, addr, next, walk); +} + +static int vm_idle_pte_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType pagetype; + + if (vm_handle_pte_hole == NULL) + return 0; + + pagetype = vm_handle_pte_hole(addr, 
next, depth, walk); + if (pagetype >= IDLE_PAGE_TYPE_MAX) + return 0; + + debug_printk("scan pte hole addr %pK type %d\n", addr, pagetype); + pic->flags |= VM_SCAN_HOST; + return pic_add_page(pic, addr, next, pagetype); +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pmd_entry(pmd, addr, next, walk); +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pud_entry(pud, addr, next, walk); +} + +static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + unsigned long gpa_addr; + unsigned long gpa_next; + unsigned long gpa_end; + unsigned long addr_range; + unsigned long va_end; + int ret; + int steps; + +#ifdef CONFIG_X86_64 + ret = ept_idle_supports_cpu(pic->kvm); + if (ret) + return ret; +#endif + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + gpa_addr = vm_idle_find_gpa(pic, start, &addr_range); + + if (gpa_addr == INVALID_PAGE) { + pic->gpa_to_hva = 0; + if (addr_range == ~0UL) { + set_restart_gpa(TASK_SIZE, "EOF"); + va_end = end; + } else { + start += addr_range; + set_restart_gpa(start, "OUT-OF-SLOT"); + va_end = start; + } + } else { + pic->gpa_to_hva = start - gpa_addr; + gpa_end = gpa_addr + addr_range; + steps = 0; + for (; gpa_addr < gpa_end;) { + gpa_next = min(gpa_end, gpa_addr + walk_step * PAGE_SIZE); +#ifdef CONFIG_ARM64 + ret = arm_page_range(pic, gpa_addr, gpa_next); +#else + ret = ept_page_range(pic, gpa_addr, gpa_next, walk); +#endif + gpa_addr = pic->restart_gpa; + + if (ret) + break; + + if (++steps >= resched_step) { + cond_resched(); + steps = 0; + } + } + va_end = pic->gpa_to_hva + gpa_end; + } + + start = pic->restart_gpa + pic->gpa_to_hva; + ret = page_idle_copy_user(pic, start, va_end); + if (ret) + break; + } + + if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + + if (pic->bytes_copied) + ret = 0; + return ret; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk); +static ssize_t vm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk mm_walk = {}; + struct mm_walk_ops mm_walk_ops = {}; + struct page_idle_ctrl *pic; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + int ret; + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + pic->kvm = mm_kvm(mm); + + mm_walk_ops.pmd_entry = vm_idle_pmd_entry; + mm_walk_ops.pud_entry = vm_idle_pud_entry; + mm_walk_ops.hugetlb_entry = vm_idle_hugetlb_entry; + mm_walk_ops.pte_hole = vm_idle_pte_hole; + mm_walk_ops.test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = &mm_walk_ops; + mm_walk.private = pic; + + ret = vm_idle_walk_hva_range(pic, hva_start, hva_end, &mm_walk); + if (ret) + goto out_kvm; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_kvm: + kfree(pic); + return ret; + +} + 
+static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + +static ssize_t page_scan_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + + if ((hva_start >= TASK_SIZE) || (hva_end >= TASK_SIZE)) { + debug_printk("page_idle_read past TASK_SIZE: %pK %pK %lx\n", + hva_start, hva_end, TASK_SIZE); + return 0; + } + if (hva_end <= hva_start) { + debug_printk("page_idle_read past EOF: %pK %pK\n", + hva_start, hva_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("page_idle_read unaligned ppos: %pK\n", + hva_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("page_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + if (!mm_kvm(mm)) + return mm_idle_read(file, buf, count, ppos); + + return vm_idle_read(file, buf, count, ppos); +} + +static int page_scan_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int page_scan_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + struct kvm *kvm; + int ret = 0; + + if (!mm) { + ret = -EBADF; + goto out; + } + + kvm = mm_kvm(mm); + if (!kvm) { + ret = -EINVAL; + goto out; + } +#ifdef CONFIG_X86_64 +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&kvm->mmu_lock); +#else + spin_lock(&kvm->mmu_lock); +#endif + + kvm_flush_remote_tlbs(kvm); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&kvm->mmu_lock); +#else + spin_unlock(&kvm->mmu_lock); +#endif +#endif + +out: + module_put(THIS_MODULE); + return ret; +} + +static int mm_idle_pmd_large(pmd_t pmd) +{ +#ifdef CONFIG_ARM64 + return if_pmd_thp_or_huge(pmd); +#else + return pmd_large(pmd); +#endif +} + +static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, + unsigned long addr, unsigned long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PTE_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + unsigned long start = addr & hmask; /* hugepage may be splited in vm */ + int ret; + + if (mask_to_size(hmask) == PUD_SIZE) { + page_type = PUD_PRESENT; + goto add_page; + } + + if (!pte_present(*pte)) + page_type = PMD_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, (unsigned long *)pte)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + +add_page: + ret = pic_add_page(pic, start, start + pagetype_size[page_type], page_type); + return ret; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* + * Skip 
duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, + * walk_page_range() can call on the same PMD twice. + */ + if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK) && (pic->flags & SCAN_HUGE_PAGE)) { + debug_printk("ignore duplicate addr %pK %pK\n", + addr, pic->last_va); + set_restart_gpa(round_up(next, PMD_SIZE), "DUP_ADDR"); + return 0; + } + pic->last_va = addr; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!mm_idle_pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd) || + pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = mm_idle_pte_range(pic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + if ((addr & PUD_MASK) != (pic->last_va & PUD_MASK)) { + pic_add_page(pic, addr, next, PUD_PRESENT); + pic->last_va = addr; + } + spin_unlock(ptl); + return 1; + } + + return 0; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct page_idle_ctrl *pic = walk->private; + + /* If the specified page swapout is set, the untagged vma is skipped. */ + if ((pic->flags & VMA_SCAN_FLAG) && !(vma->vm_flags & VM_SWAPFLAG)) + return 1; + + if (vma->vm_file) { + if (is_vm_hugetlb_page(vma)) + return 0; + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct page_idle_ctrl *pic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret = 0; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + down_read(&walk->mm->mmap_lock); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = walk_page_range(walk->mm, start, end, + walk->ops, walk->private); + local_irq_enable(); + } else + set_restart_gpa(vma->vm_start, "VMA-HOLE"); + } else + set_restart_gpa(TASK_SIZE, "EOF"); + up_read(&walk->mm->mmap_lock); + + WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + start = pic->restart_gpa; + ret = page_idle_copy_user(pic, start, end); + if (ret) + break; + } + + if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + + if (pic->bytes_copied) { + if (ret != PAGE_IDLE_BUF_FULL && pic->next_hva < end) + debug_printk("partial scan: next_hva=%pK end=%pK\n", + pic->next_hva, end); + ret = 0; + } else + debug_printk("nothing read"); + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk_ops *mm_walk_ops = NULL; + struct mm_walk mm_walk = {}; + struct page_idle_ctrl *pic; + unsigned long va_start = *ppos; + unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (va_end <= va_start) { + debug_printk("%s past EOF: %pK %pK\n", + __func__, va_start, va_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) 
{ + debug_printk("%s unaligned ppos: %pK\n", + __func__, va_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("%s small count: %lx\n", + __func__, (unsigned long)count); + return -EINVAL; + } + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + mm_walk_ops = kzalloc(sizeof(struct mm_walk_ops), GFP_KERNEL); + if (!mm_walk_ops) { + kfree(pic); + return -ENOMEM; + } + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + + mm_walk_ops->pmd_entry = mm_idle_pmd_entry; + mm_walk_ops->pud_entry = mm_idle_pud_entry; + mm_walk_ops->hugetlb_entry = mm_idle_hugetlb_entry; + mm_walk_ops->test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = mm_walk_ops; + mm_walk.private = pic; + mm_walk.pgd = NULL; + mm_walk.no_vma = false; + ret = mm_idle_walk_range(pic, va_start, va_end, &mm_walk); + if (ret) + goto out_free; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_free: + kfree(pic); + kfree(mm_walk_ops); + return ret; +} + +static long page_scan_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int flags; + + if (get_user(flags, (unsigned int __user *)argp)) + return -EFAULT; + flags &= ALL_SCAN_FLAGS; + + switch (cmd) { + case IDLE_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case IDLE_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + case VMA_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case VMA_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +extern struct file_operations proc_page_scan_operations; + +static int page_scan_entry(void) +{ + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = THIS_MODULE; + proc_page_scan_operations.read = page_scan_read; + proc_page_scan_operations.open = page_scan_open; + proc_page_scan_operations.release = page_scan_release; + proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl; + proc_page_scan_operations.flock(NULL, 0, NULL); + + return 0; +} + +static void page_scan_exit(void) +{ + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = NULL; + proc_page_scan_operations.read = NULL; + proc_page_scan_operations.open = NULL; + proc_page_scan_operations.release = NULL; + proc_page_scan_operations.unlocked_ioctl = NULL; + proc_page_scan_operations.flock(NULL, 0, NULL); +} + +MODULE_LICENSE("GPL"); +module_init(page_scan_entry); +module_exit(page_scan_exit); diff --git a/fs/proc/etmem_scan.h b/fs/proc/etmem_scan.h new file mode 100644 index 000000000000..e109f7f350e1 --- /dev/null +++ b/fs/proc/etmem_scan.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PAGE_IDLE_H +#define _PAGE_IDLE_H + +#include <linux/types.h> + +#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ +#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ +#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ + +/* define to not used file flags */ +#define SCAN_AS_HUGE 0100000000 /* treat normal page as hugepage in vm */ +#define SCAN_IGN_HOST 0200000000 /* ignore host access when scan vm */ +#define VM_SCAN_HOST 0400000000 /* scan and add host page for vm hole(internal) */ +#define VMA_SCAN_FLAG 0x1000 /* scan the specifics vma with flag */ + +#define ALL_SCAN_FLAGS (SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | \ + SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST | VMA_SCAN_FLAG) + +#define IDLE_SCAN_MAGIC 0x66 +#define IDLE_SCAN_ADD_FLAGS 
_IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int) +#define IDLE_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int) +#define VMA_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x2, unsigned int) +#define VMA_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x3, unsigned int) + +enum ProcIdlePageType { + PTE_ACCESSED, /* 4k page */ + PMD_ACCESSED, /* 2M page */ + PUD_PRESENT, /* 1G page */ + + PTE_DIRTY_M, + PMD_DIRTY_M, + + PTE_IDLE, + PMD_IDLE, + PMD_IDLE_PTES, /* all PTE idle */ + + PTE_HOLE, + PMD_HOLE, + + PIP_CMD, + + IDLE_PAGE_TYPE_MAX +}; + +#define PIP_TYPE(a) (0xf & (a >> 4)) +#define PIP_SIZE(a) (0xf & a) +#define PIP_COMPOSE(type, nr) ((type << 4) | nr) + +#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0) + +#ifndef INVALID_PAGE +#define INVALID_PAGE ~0UL +#endif + +#ifdef CONFIG_ARM64 +#define _PAGE_MM_BIT_ACCESSED 10 +#else +#define _PAGE_MM_BIT_ACCESSED _PAGE_BIT_ACCESSED +#endif + +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_EPT_ACCESSED 8 +#define _PAGE_BIT_EPT_DIRTY 9 +#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED) +#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY) + +#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7)) + +static inline int ept_pte_present(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pmd_present(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pud_present(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_p4d_present(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pgd_present(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pte_accessed(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pmd_accessed(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pud_accessed(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_p4d_accessed(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pgd_accessed(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_ACCESSED; +} +#endif + +extern struct file_operations proc_page_scan_operations; + +#define PAGE_IDLE_KBUF_FULL 1 +#define PAGE_IDLE_BUF_FULL 2 +#define PAGE_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3) + +#define PAGE_IDLE_KBUF_SIZE 8000 + +struct page_idle_ctrl { + struct mm_struct *mm; + struct kvm *kvm; + + uint8_t kpie[PAGE_IDLE_KBUF_SIZE]; + int pie_read; + int pie_read_max; + + void __user *buf; + int buf_size; + int bytes_copied; + + unsigned long next_hva; /* GPA for EPT; VA for PT */ + unsigned long gpa_to_hva; + unsigned long restart_gpa; + unsigned long last_va; + + unsigned int flags; +}; + +#endif diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff5..a9615455b709 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -303,6 +303,9 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +#ifdef CONFIG_ETMEM +extern const struct file_operations proc_mm_idle_operations; +#endif
extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/include/linux/etmem.h b/include/linux/etmem.h new file mode 100644 index 000000000000..e8a2585f3891 --- /dev/null +++ b/include/linux/etmem.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_ETMEM_H_ +#define __MM_ETMEM_H_ + +#include <linux/list.h> +#include <asm/page.h> +#include <linux/mmzone.h> +#include <linux/memcontrol.h> +#include <linux/page-flags.h> + +#ifdef CONFIG_ETMEM + +#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return mm->kvm; +} +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return NULL; +} +#endif + + +#endif /* #ifdef CONFIG_ETMEM */ +#endif /* define __MM_ETMEM_H_ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8cf86b56aba5..37b12832e7de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -308,6 +308,10 @@ extern unsigned int kobjsize(const void *objp); # define VM_SOFTDIRTY 0 #endif
+#ifdef CONFIG_ETMEM +#define VM_SWAPFLAG 0x400000000000000 /* memory swap out flag in vma */ +#endif + #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 582aa5e44a5a..9226a236b2cb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -931,6 +931,12 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN */ +#ifdef CONFIG_ETMEM +#if IS_ENABLED(CONFIG_KVM) && !defined(__GENKSYMS__) + struct kvm *kvm; +#endif +#endif + } __randomize_layout;
/* diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432..14e5498efd7a 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,10 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_ETMEM_BASE 0x1100 +#define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ +#define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) + /* compatibility flags */ #define MAP_FILE 0
diff --git a/mm/Kconfig b/mm/Kconfig index ece4f2847e2b..919f451d683d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -903,6 +903,29 @@ config USE_PERCPU_NUMA_NODE_ID config HAVE_SETUP_PER_CPU_AREA bool
+config ETMEM_SCAN + tristate "module: etmem page scan for etmem support" + depends on ETMEM + help + etmem scan is a critical part of the etmem feature. + A kernel module, etmem_scan.ko periodically scans the appointed vma + segments of the target process, perform page table walk accordingly, + check and clear the access bit of each page before finally report + scan results back to user space. + etmem scan also supports virtual machines. + +config ETMEM + bool "Enable etmem feature" + depends on MMU + depends on X86 || ARM64 + default n + help + etmem is a tiered memory extension technology that uses DRAM and memory + compression/high-performance storage media to form tiered memory storage. + Memory data is tiered, and cold data is migrated from memory media to + high-performance storage media to release memory space and reduce + memory costs. + config CMA bool "Contiguous Memory Allocator" depends on MMU diff --git a/mm/madvise.c b/mm/madvise.c index 4dded5d27e7e..726668a521dc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1092,6 +1092,14 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + new_flags |= VM_SWAPFLAG; + break; + case MADV_SWAPFLAG_REMOVE: + new_flags &= ~VM_SWAPFLAG; + break; +#endif case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); } @@ -1196,6 +1204,10 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: +#endif +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + case MADV_SWAPFLAG_REMOVE: #endif return true;
@@ -1386,6 +1398,10 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. + * MADV_SWAPFLAG - Used in the etmem memory extension feature, the process + * specifies the memory swap area by adding a flag to a specific + * vma address. + * MADV_SWAPFLAG_REMOVE - remove the specific vma flag * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b7d7e4fcfad7..2ac04c1dbb09 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -525,6 +525,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } while (start = next, start < end); return err; } +#ifdef CONFIG_ETMEM +EXPORT_SYMBOL_GPL(walk_page_range); +#endif
/** * walk_page_range_novma - walk a range of pagetables not backed by a vma diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 486800a7024b..30fb090a61d7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1305,6 +1305,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); +#if IS_ENABLED(CONFIG_KVM) + mm->kvm = NULL; +#endif kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); @@ -5098,6 +5101,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) goto put_kvm; }
+#if IS_ENABLED(CONFIG_KVM) + kvm->mm->kvm = kvm; +#endif /* * Don't call kvm_put_kvm anymore at this point; file->f_op is * already set, with ->release() being kvm_vm_release(). In error
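Together with the VMA_SCAN_FLAG ioctl, the new MADV_SWAPFLAG/MADV_SWAPFLAG_REMOVE advice values let a process restrict the scan to explicitly tagged regions: mm_idle_test_walk() skips every vma without VM_SWAPFLAG when VMA_SCAN_FLAG is set. A hypothetical caller, sketched here only for illustration with the constants copied from the uapi change above, might do:

/* etmem_tag_vma.c - illustrative only, not part of this patch */
#include <stdio.h>
#include <sys/mman.h>

/* values mirrored from include/uapi/asm-generic/mman-common.h in this patch */
#define MADV_ETMEM_BASE      0x1100
#define MADV_SWAPFLAG        MADV_ETMEM_BASE     /* mark the vma for etmem swap-out */
#define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) /* clear the mark again */

int main(void)
{
	size_t len = 64 << 20;                       /* 64 MiB working set */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Tag the vma: when user space sets VMA_SCAN_FLAG on idle_pages,
	 * mm_idle_test_walk() skips every vma without VM_SWAPFLAG, so only
	 * this region is scanned and considered for swap-out.
	 */
	if (madvise(buf, len, MADV_SWAPFLAG))
		perror("madvise(MADV_SWAPFLAG)");

	/* ... run the workload, let etmem scan and reclaim the cold part ... */

	if (madvise(buf, len, MADV_SWAPFLAG_REMOVE))
		perror("madvise(MADV_SWAPFLAG_REMOVE)");

	munmap(buf, len);
	return 0;
}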
euleros inclusion
category: feature
feature: Add pages to swapcache and swap them out proactively
bugzilla: NA
CVE: NA
-------------------------------------------------
This patch proposes the etmem swap feature. etmem swap adds the target pages to the swap cache so that they are further reclaimed by kswapd and end up dwelling in swap space.
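The new /proc/<pid>/swap_pages file accepts newline-separated virtual addresses in hexadecimal; swap_pages_write() parses them with kstrtoul(..., 16, ...), looks up the backing pages and hands them to reclaim_pages(). As a minimal sketch (illustration only, not part of this patch), a user-space tool could feed it the cold addresses found by the idle_pages scan like this; each write is parsed independently, so addresses can also be streamed in batches.

/* swap_pages_feed.c - illustrative only, not part of this patch */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64], line[32];
	int fd, i;

	if (argc < 3) {
		fprintf(stderr, "usage: %s <pid> <hex-vaddr>...\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/swap_pages", argv[1]);
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* one hexadecimal virtual address per line, e.g. "7f93a1c00000" */
	for (i = 2; i < argc; i++) {
		int len = snprintf(line, sizeof(line), "%s\n", argv[i]);

		if (write(fd, line, len) != len)
			perror("write");
	}

	close(fd);
	return 0;
}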
Signed-off-by: yanxiaodan <yanxiaodan@huawei.com>
Signed-off-by: linmiaohe <linmiaohe@huawei.com>
Signed-off-by: louhongxiang <louhongxiang@huawei.com>
Signed-off-by: liubo <liubo254@huawei.com>
Signed-off-by: geruijun <geruijun@huawei.com>
Signed-off-by: liangchenshu <liangchenshu@huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Yuchen Tang <tangyuchen5@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 fs/proc/Makefile                       | 5 +-
 fs/proc/base.c                         | 2 +
 fs/proc/etmem_proc.c                   | 94 +++++++++++++++++++++
 fs/proc/etmem_swap.c                   | 109 +++++++++++++++++++++++++
 fs/proc/internal.h                     | 1 +
 include/linux/etmem.h                  | 13 +++
 include/linux/swap.h                   | 2 +-
 mm/Kconfig                             | 10 +++
 mm/Makefile                            | 1 +
 mm/etmem.c                             | 64 +++++++++++++++
 mm/vmscan.c                            | 4 +
 13 files changed, 304 insertions(+), 3 deletions(-)
 create mode 100644 fs/proc/etmem_swap.c
 create mode 100644 mm/etmem.c
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 01a5c61cebbb..cf8568ef5387 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1072,6 +1072,7 @@ CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM_SWAP=m CONFIG_ETMEM=y CONFIG_CMA=y # CONFIG_CMA_DEBUG is not set diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index dafa73172937..e59ebb6230bf 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1094,6 +1094,7 @@ CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM_SWAP=m CONFIG_ETMEM=y # CONFIG_CMA is not set CONFIG_MEM_SOFT_DIRTY=y diff --git a/fs/proc/Makefile b/fs/proc/Makefile index b9a7bc7d8a75..fe283f354d61 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -34,5 +34,6 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o -obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o -proc-${CONFIG_ETMEM} += etmem_proc.o +obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o +obj-$(CONFIG_ETMEM_SWAP) += etmem_swap.o +proc-${CONFIG_ETMEM} += etmem_proc.o diff --git a/fs/proc/base.c b/fs/proc/base.c index e634c61ba1ac..a99d89881c06 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3287,6 +3287,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_ETMEM REG("idle_pages", 0600, proc_mm_idle_operations), + REG("swap_pages", 0600, proc_mm_swap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -3639,6 +3640,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_ETMEM REG("idle_pages", 0600, proc_mm_idle_operations), + REG("swap_pages", 0600, proc_mm_swap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/etmem_proc.c b/fs/proc/etmem_proc.c index edacb9260345..2e6712cc43b2 100644 --- a/fs/proc/etmem_proc.c +++ b/fs/proc/etmem_proc.c @@ -120,3 +120,97 @@ const struct file_operations proc_mm_idle_operations = { .release = mm_idle_release, .unlocked_ioctl = mm_idle_ioctl, }; + +static DEFINE_SPINLOCK(swap_lock); + +static int page_swap_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&swap_lock); + else + spin_unlock(&swap_lock); + + return 0; +} +/*swap pages*/ +struct file_operations proc_swap_pages_operations = { + .flock = page_swap_lock, +}; +EXPORT_SYMBOL_GPL(proc_swap_pages_operations); + +static ssize_t mm_swap_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (proc_swap_pages_operations.write) + return proc_swap_pages_operations.write(file, buf, count, ppos); + + return -1; +} + +static int mm_swap_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + page_swap_lock(NULL, 1, NULL); + module = proc_swap_pages_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_swap_lock(NULL, 0, NULL); + if (ret != 0) { + /* no swap ko installed, avoid to return valid file */ + return 
-ENODEV; + } + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) { + module_put(module); + return PTR_ERR(mm); + } + + file->private_data = mm; + + if (proc_swap_pages_operations.open) + ret = proc_swap_pages_operations.open(inode, file); + + if (ret != 0) + module_put(module); + + return ret; +} + +static int mm_swap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (mm) + mmdrop(mm); + + if (proc_swap_pages_operations.release) + ret = proc_swap_pages_operations.release(inode, file); + + if (proc_swap_pages_operations.owner) + module_put(proc_swap_pages_operations.owner); + + return ret; +} + +static long mm_swap_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_swap_pages_operations.unlocked_ioctl) + return proc_swap_pages_operations.unlocked_ioctl(filp, cmd, arg); + return 0; +} + +const struct file_operations proc_mm_swap_operations = { + .llseek = mem_lseek, + .write = mm_swap_write, + .open = mm_swap_open, + .release = mm_swap_release, + .unlocked_ioctl = mm_swap_ioctl, +}; diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c new file mode 100644 index 000000000000..4aad6b9db9a6 --- /dev/null +++ b/fs/proc/etmem_swap.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/proc_fs.h> +#include <linux/sched/mm.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/mempolicy.h> +#include <linux/uaccess.h> +#include <linux/delay.h> +#include <linux/etmem.h> + +static ssize_t swap_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char *p, *data, *data_ptr_res; + unsigned long vaddr; + struct mm_struct *mm = file->private_data; + struct page *page; + LIST_HEAD(pagelist); + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + goto out; + } + + if (count < 0) { + ret = -EOPNOTSUPP; + goto out_mm; + } + + data = memdup_user_nul(buf, count); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto out_mm; + } + + data_ptr_res = data; + while ((p = strsep(&data, "\n")) != NULL) { + if (!*p) + continue; + + ret = kstrtoul(p, 16, &vaddr); + if (ret != 0) + continue; + + /* If get page struct failed, ignore it, get next page */ + page = get_page_from_vaddr(mm, vaddr); + if (!page) + continue; + + add_page_for_swap(page, &pagelist); + } + + if (!list_empty(&pagelist)) + reclaim_pages(&pagelist); + + ret = count; + kfree(data_ptr_res); +out_mm: + mmput(mm); +out: + return ret; +} + +static int swap_pages_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int swap_pages_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + +extern struct file_operations proc_swap_pages_operations; + +static int swap_pages_entry(void) +{ + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = THIS_MODULE; + proc_swap_pages_operations.write = swap_pages_write; + proc_swap_pages_operations.open = swap_pages_open; + proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.flock(NULL, 0, NULL); + + return 0; +} + +static void swap_pages_exit(void) +{ + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = NULL; + proc_swap_pages_operations.write = NULL; + proc_swap_pages_operations.open = NULL; + 
proc_swap_pages_operations.release = NULL; + proc_swap_pages_operations.flock(NULL, 0, NULL); +} + +MODULE_LICENSE("GPL"); +module_init(swap_pages_entry); +module_exit(swap_pages_exit); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a9615455b709..be6d5dfc330c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -305,6 +305,7 @@ extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; #ifdef CONFIG_ETMEM extern const struct file_operations proc_mm_idle_operations; +extern const struct file_operations proc_mm_swap_operations; #endif
extern unsigned long task_vsize(struct mm_struct *); diff --git a/include/linux/etmem.h b/include/linux/etmem.h index e8a2585f3891..5ebd1c3274b7 100644 --- a/include/linux/etmem.h +++ b/include/linux/etmem.h @@ -22,6 +22,19 @@ static inline struct kvm *mm_kvm(struct mm_struct *mm) } #endif
+extern int add_page_for_swap(struct page *page, struct list_head *pagelist); +extern struct page *get_page_from_vaddr(struct mm_struct *mm, + unsigned long vaddr); +#else /* !CONFIG_ETMEM */ +static inline int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + return 0; +}
+static inline struct page *get_page_from_vaddr(struct mm_struct *mm, + unsigned long vaddr) +{ + return NULL; +} #endif /* #ifdef CONFIG_ETMEM */ #endif /* define __MM_ETMEM_H_ */ diff --git a/include/linux/swap.h b/include/linux/swap.h index f6dd6575b905..a85c983db414 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -686,6 +686,6 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) return vm_swap_full(); } #endif - +extern unsigned long reclaim_pages(struct list_head *folio_list); #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 919f451d683d..2efe034efa31 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -914,6 +914,16 @@ config ETMEM_SCAN scan results back to user space. etmem scan also supports virtual machines.
+config ETMEM_SWAP + tristate "module: etmem page swap for etmem support" + depends on ETMEM + help + etmem swap is a critical component of the etmem feature. + When using etmem slide engine, etmem_swap.ko will add appointed pages + (ideally all of which are all rarely used, "cold" pages) to swapcache + proactively, which will later be reclaimed and added to swap space, + making room for more frequently used, "hot" pages. + config ETMEM bool "Enable etmem feature" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index ec65984e2ade..db9c2806ae2d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_ETMEM) += etmem.o diff --git a/mm/etmem.c b/mm/etmem.c new file mode 100644 index 000000000000..9a89bfcc1058 --- /dev/null +++ b/mm/etmem.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/memcontrol.h> +#include <linux/gfp.h> +#include <linux/mm_inline.h> +#include <linux/sysctl.h> +#include <linux/etmem.h> +#include "internal.h" + +int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + int err = -EBUSY; + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EACCES; + + if (PageHuge(page)) + return -EACCES; + + head = compound_head(page); + if (!folio_isolate_lru(page_folio(head))) { + put_page(page); + return err; + } + put_page(page); + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add_tail(&head->lru, pagelist); + + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(add_page_for_swap); + +struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) +{ + struct page *page; + struct vm_area_struct *vma; + unsigned int follflags; + + mmap_read_lock(mm); + + vma = find_vma(mm, vaddr); + if (!vma || vaddr < vma->vm_start || vma->vm_flags & VM_LOCKED) { + mmap_read_unlock(mm); + return NULL; + } + + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, vaddr, follflags); + if (IS_ERR(page) || !page) { + mmap_read_unlock(mm); + return NULL; + } + + mmap_read_unlock(mm); + return page; +} +EXPORT_SYMBOL_GPL(get_page_from_vaddr); diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f13394b112e..3f70e680d144 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -65,6 +65,7 @@ #include <linux/balloon_compaction.h> #include <linux/sched/sysctl.h>
+#include <linux/etmem.h> #include "internal.h" #include "swap.h"
@@ -2840,6 +2841,9 @@ unsigned long reclaim_pages(struct list_head *folio_list)
return nr_reclaimed; } +#ifdef CONFIG_ETMEM +EXPORT_SYMBOL_GPL(reclaim_pages); +#endif
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc)
euleros inclusion category: feature feature: Add pages to swapcache and swap them out proactively bugzilla: NA CVE: NA
-----------------------------------------------------
Each CPU socket can have one DRAM node and one PMEM node; we call them "peer nodes". Migration between DRAM and PMEM will by default happen between peer nodes.
This is a temporary solution: with multiple memory tiers, a node can have both promotion and demotion targets instead of a single peer node, and user space may also be able to infer promotion/demotion targets from future HMAT information.
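For illustration only, a minimal user-space sketch (in C) of how a tiering agent could consume the read-only peer_node attribute this series adds under the standard node sysfs directory; the read_peer_node() helper below is a hypothetical name, not part of the patches:

#include <stdio.h>

/* Hypothetical helper: read /sys/devices/system/node/node<nid>/peer_node,
 * the read-only attribute added by this series. Returns the peer node id
 * or -1 on error. */
static int read_peer_node(int nid)
{
	char path[96];
	FILE *f;
	int peer = -1;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/node/node%d/peer_node", nid);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%d", &peer) != 1)
		peer = -1;
	fclose(f);
	return peer;
}

int main(void)
{
	/* e.g. node 0's peer is node 2 with the SLIT shown in the diff below */
	printf("peer of node 0: %d\n", read_peer_node(0));
	return 0;
}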
Signed-off-by: Fan Du fan.du@intel.com Signed-off-by: Fengguang Wu fengguang.wu@intel.com Signed-off-by: Shijie Luo luoshijie1@huawei.com Signed-off-by: liubo liubo254@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Signed-off-by: Yuchen Tang tangyuchen5@huawei.com --- drivers/base/node.c | 7 +++++++ include/linux/etmem.h | 1 + include/linux/mmzone.h | 14 ++++++++++++++ mm/etmem.c | 28 ++++++++++++++++++++++++++++ mm/mm_init.c | 4 ++++ 5 files changed, 54 insertions(+)
diff --git a/drivers/base/node.c b/drivers/base/node.c index 493d533f8375..f4a7d5590c0c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -569,11 +569,18 @@ static ssize_t node_read_distance(struct device *dev, } static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
+#ifdef CONFIG_ETMEM +static DEVICE_ATTR_RO(peer_node); +#endif + static struct attribute *node_dev_attrs[] = { &dev_attr_meminfo.attr, &dev_attr_numastat.attr, &dev_attr_distance.attr, &dev_attr_vmstat.attr, +#ifdef CONFIG_ETMEM + &dev_attr_peer_node.attr, +#endif NULL };
diff --git a/include/linux/etmem.h b/include/linux/etmem.h index 5ebd1c3274b7..fc19cdc0d567 100644 --- a/include/linux/etmem.h +++ b/include/linux/etmem.h @@ -9,6 +9,7 @@ #include <linux/page-flags.h>
#ifdef CONFIG_ETMEM +int find_best_peer_node(int nid);
#if IS_ENABLED(CONFIG_KVM) static inline struct kvm *mm_kvm(struct mm_struct *mm) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 57086c57b8e4..c5c9b9c60ea5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1410,6 +1410,20 @@ typedef struct pglist_data {
CACHELINE_PADDING(_pad2_);
+#ifdef CONFIG_ETMEM + /* + * Points to the nearest node in terms of latency + * E.g. peer of node 0 is node 2 per SLIT + * node distances: + * node 0 1 2 3 + * 0: 10 21 17 28 + * 1: 21 10 28 17 + * 2: 17 28 10 28 + * 3: 28 17 28 10 + */ + int peer_node; + int notused; +#endif /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; diff --git a/mm/etmem.c b/mm/etmem.c index 9a89bfcc1058..fe5ca89a4e76 100644 --- a/mm/etmem.c +++ b/mm/etmem.c @@ -9,6 +9,34 @@ #include <linux/etmem.h> #include "internal.h"
+/* + * Return the nearest peer node in terms of *locality* + * E.g. peer of node 0 is node 2 per SLIT + * node distances: + * node 0 1 2 3 + * 0: 10 21 17 28 + * 1: 21 10 28 17 + * 2: 17 28 10 28 + * 3: 28 17 28 10 + */ +int find_best_peer_node(int nid) +{ + int n, val; + int min_val = INT_MAX; + int peer = NUMA_NO_NODE; + + for_each_online_node(n) { + if (n == nid) + continue; + val = node_distance(nid, n); + if (val < min_val) { + min_val = val; + peer = n; + } + } + return peer; +} + int add_page_for_swap(struct page *page, struct list_head *pagelist) { int err = -EBUSY; diff --git a/mm/mm_init.c b/mm/mm_init.c index fed4370b02e1..1a4efa3e76cd 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -29,6 +29,7 @@ #include "internal.h" #include "slab.h" #include "shuffle.h" +#include <linux/etmem.h>
#include <asm/setup.h>
@@ -1901,6 +1902,9 @@ void __init free_area_init(unsigned long *max_zone_pfn) if (pgdat->node_present_pages) node_set_state(nid, N_MEMORY); check_for_memory(pgdat); +#ifdef CONFIG_ETMEM + pgdat->peer_node = find_best_peer_node(nid); +#endif }
memmap_init();
euleros inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
We distinguish PMEM nodes from normal DRAM nodes and allocate pages from a PMEM node's peer (DRAM) node when building the vmemmap.
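Note that, per the mm/etmem.c hunk below, the redirection only takes effect when the new vm sysctl is enabled (root-only, mode 0600), e.g.:

echo 1 > /proc/sys/vm/vmemmap_block_from_dram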
Signed-off-by: Shijie Luo luoshijie1@huawei.com Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Signed-off-by: Yuchen Tang tangyuchen5@huawei.com --- drivers/acpi/numa/srat.c | 7 +++++++ drivers/base/node.c | 8 ++++++++ include/linux/etmem.h | 24 ++++++++++++++++++++++++ mm/etmem.c | 27 +++++++++++++++++++++++++++ mm/sparse-vmemmap.c | 10 ++++++++++ 5 files changed, 76 insertions(+)
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 12f330b0eac0..59e9532c0f1a 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -17,6 +17,7 @@ #include <linux/numa.h> #include <linux/nodemask.h> #include <linux/topology.h> +#include <linux/etmem.h>
static nodemask_t nodes_found_map = NODE_MASK_NONE;
@@ -277,6 +278,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) }
node_set(node, numa_nodes_parsed); +#ifdef CONFIG_ETMEM + if (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE) + set_node_type(node, NODE_TYPE_PMEM); + else + set_node_type(node, NODE_TYPE_DRAM); +#endif
pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", node, pxm, diff --git a/drivers/base/node.c b/drivers/base/node.c index f4a7d5590c0c..69f8b93fc171 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -570,6 +570,14 @@ static ssize_t node_read_distance(struct device *dev, static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
#ifdef CONFIG_ETMEM +static ssize_t peer_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int nid = dev->id; + struct pglist_data *pgdat = NODE_DATA(nid); + + return sysfs_emit(buf, "%d\n", pgdat->peer_node); +} static DEVICE_ATTR_RO(peer_node); #endif
diff --git a/include/linux/etmem.h b/include/linux/etmem.h index fc19cdc0d567..8f6d7b15d6ea 100644 --- a/include/linux/etmem.h +++ b/include/linux/etmem.h @@ -23,6 +23,30 @@ static inline struct kvm *mm_kvm(struct mm_struct *mm) } #endif
+enum node_type { + NODE_TYPE_DRAM, + NODE_TYPE_PMEM, +}; + +#ifdef CONFIG_NUMA +extern enum node_type nodes_type[MAX_NUMNODES]; +static inline void set_node_type(int nid, enum node_type type) +{ + nodes_type[nid] = type; +} +static inline enum node_type get_node_type(int nid) +{ + return nodes_type[nid]; +} +#else +static inline void set_node_type(int nid, enum node_type type) {} +static inline enum node_type get_node_type(int nid) +{ + return NODE_TYPE_DRAM; +} +#endif + +extern unsigned int sysctl_vmemmap_block_from_dram; extern int add_page_for_swap(struct page *page, struct list_head *pagelist); extern struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr); diff --git a/mm/etmem.c b/mm/etmem.c index fe5ca89a4e76..e2ba83583246 100644 --- a/mm/etmem.c +++ b/mm/etmem.c @@ -9,6 +9,11 @@ #include <linux/etmem.h> #include "internal.h"
+unsigned int sysctl_vmemmap_block_from_dram; +#ifdef CONFIG_NUMA +enum node_type nodes_type[MAX_NUMNODES]; +#endif + /* * Return the nearest peer node in terms of *locality* * E.g. peer of node 0 is node 2 per SLIT @@ -37,6 +42,28 @@ int find_best_peer_node(int nid) return peer; }
+#ifdef CONFIG_SYSCTL +static struct ctl_table vm_vmemmap_table[] = { + { + .procname = "vmemmap_block_from_dram", + .data = &sysctl_vmemmap_block_from_dram, + .maxlen = sizeof(sysctl_vmemmap_block_from_dram), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init vmemmap_sysctl_init(void) +{ + register_sysctl_init("vm", vm_vmemmap_table); + return 0; +} +late_initcall(vmemmap_sysctl_init); +#endif + int add_page_for_swap(struct page *page, struct list_head *pagelist) { int err = -EBUSY; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a2cbe44c48e1..3b954ebdb362 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -30,6 +30,7 @@
#include <asm/dma.h> #include <asm/pgalloc.h> +#include <linux/etmem.h>
/* * Allocate a block of memory to be used to back the virtual memory map @@ -55,6 +56,15 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) static bool warned; struct page *page;
+#ifdef CONFIG_ETMEM + /* if node is pmem node, we should alloc_pages from + * it's peer node, because pmem node is not online + * at this time. + */ + if (get_node_type(node) == NODE_TYPE_PMEM && sysctl_vmemmap_block_from_dram) + node = NODE_DATA(node)->peer_node; +#endif + page = alloc_pages_node(node, gfp_mask, order); if (page) return page_address(page);
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4QVXW CVE: NA
-------------------------------------------------
etmem, the memory vertical expansion technology, combines DRAM with new high-performance storage media to form a multi-level memory store. By grading the stored data by hotness, etmem migrates the data classified as cold from the memory medium (DRAM) to the high-performance storage medium, so as to expand effective memory capacity and reduce memory cost.
When the etmem memory expansion feature is running, the kernel's native swap needs to be disabled in certain scenarios so that kernel swap does not interfere with etmem.
This patch provides a switch for that purpose.
The /sys/kernel/mm/swap/ directory provides the kernel_swap_enable sys interface to enable or disable the native swap function of the kernel.
The default value of /sys/kernel/mm/swap/kernel_swap_enable is true, that is, kernel swap is enabled by default.
Turn on kernel swap: echo true > /sys/kernel/mm/swap/kernel_swap_enable
Turn off kernel swap: echo false > /sys/kernel/mm/swap/kernel_swap_enable
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Yuchen Tang tangyuchen5@huawei.com --- include/linux/etmem.h | 7 +++++++ mm/etmem.c | 30 ++++++++++++++++++++++++++++++ mm/swap_state.c | 4 ++++ mm/vmscan.c | 17 +++++++++++++++++ 4 files changed, 58 insertions(+)
diff --git a/include/linux/etmem.h b/include/linux/etmem.h index 8f6d7b15d6ea..668b19147ca9 100644 --- a/include/linux/etmem.h +++ b/include/linux/etmem.h @@ -46,10 +46,12 @@ static inline enum node_type get_node_type(int nid) } #endif
+extern struct kobj_attribute kernel_swap_enable_attr; extern unsigned int sysctl_vmemmap_block_from_dram; extern int add_page_for_swap(struct page *page, struct list_head *pagelist); extern struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr); +extern bool kernel_swap_enabled(void); #else /* !CONFIG_ETMEM */ static inline int add_page_for_swap(struct page *page, struct list_head *pagelist) { @@ -61,5 +63,10 @@ static inline struct page *get_page_from_vaddr(struct mm_struct *mm, { return NULL; } + +static inline bool kernel_swap_enabled(void) +{ + return true; +} #endif /* #ifdef CONFIG_ETMEM */ #endif /* define __MM_ETMEM_H_ */ diff --git a/mm/etmem.c b/mm/etmem.c index e2ba83583246..b5c5cf7768c6 100644 --- a/mm/etmem.c +++ b/mm/etmem.c @@ -9,6 +9,7 @@ #include <linux/etmem.h> #include "internal.h"
+static bool enable_kernel_swap __read_mostly = true; unsigned int sysctl_vmemmap_block_from_dram; #ifdef CONFIG_NUMA enum node_type nodes_type[MAX_NUMNODES]; @@ -42,6 +43,35 @@ int find_best_peer_node(int nid) return peer; }
+bool kernel_swap_enabled(void) +{ + return READ_ONCE(enable_kernel_swap); +} + +static ssize_t kernel_swap_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", enable_kernel_swap ? "true" : "false"); +} + +static ssize_t kernel_swap_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + WRITE_ONCE(enable_kernel_swap, true); + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + WRITE_ONCE(enable_kernel_swap, false); + else + return -EINVAL; + + return count; +} + +struct kobj_attribute kernel_swap_enable_attr = + __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, + kernel_swap_enable_store); + #ifdef CONFIG_SYSCTL static struct ctl_table vm_vmemmap_table[] = { { diff --git a/mm/swap_state.c b/mm/swap_state.c index b3b14bd0dd64..ddb3a65e5c6e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,6 +21,7 @@ #include <linux/swap_slots.h> #include <linux/huge_mm.h> #include <linux/shmem_fs.h> +#include <linux/etmem.h> #include "internal.h" #include "swap.h"
@@ -881,6 +882,9 @@ static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, +#ifdef CONFIG_ETMEM + &kernel_swap_enable_attr.attr, +#endif NULL, };
diff --git a/mm/vmscan.c b/mm/vmscan.c index 3f70e680d144..03b93ce6b536 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7042,6 +7042,18 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, return false; }
+/* + * Check if original kernel swap is enabled + * turn off kernel swap,but leave page cache reclaim on + */ +static inline void kernel_force_no_swap(struct scan_control *sc) +{ +#ifdef CONFIG_ETMEM + if (sc != NULL && !kernel_swap_enabled()) + sc->may_swap = 0; +#endif +} + unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { @@ -7058,6 +7070,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_swap = 1, };
+ kernel_force_no_swap(&sc); /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. @@ -7496,6 +7509,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) sc.may_writepage = !laptop_mode && !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim;
+ kernel_force_no_swap(&sc); + /* * Do some background aging, to give pages a chance to be * referenced before reclaiming. All pages are rotated @@ -7874,6 +7889,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state);
+ kernel_force_no_swap(&sc); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
set_task_reclaim_state(current, NULL);
euleros inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
etmem, the memory vertical expansion technology, swaps cold data out of DRAM to expand effective memory capacity.
In the current etmem process, memory page swapping is implemented by invoking shrink_page_list. When this interface is invoked for the first time, pages are added to the swap cache and written to disk. A swap-cache page is reclaimed only when the interface is invoked a second time and no process accesses the page any more. However, in the etmem process, user mode scans pages that have been accessed, and migration is not delivered for pages that are not accessed by processes, so the swap cache may stay occupied indefinitely. To solve this problem, add logic for proactively reclaiming the swap cache: when the swap cache occupies a large amount of memory, the system proactively scans the LRU lists and reclaims swap-cache pages to keep memory usage within the specified range.
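For illustration, a rough sketch of how a user-space agent (which needs CAP_SYS_ADMIN to open the file) could drive this interface; the ioctl command values are copied from the fs/proc/etmem_swap.c hunk below, and since the series adds no uapi header for them, duplicating them here is only an assumption about how callers would use it:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

/* Mirror of the definitions in fs/proc/etmem_swap.c (no uapi header yet). */
#define RECLAIM_SWAPCACHE_MAGIC	0x77
#define SET_SWAPCACHE_WMARK	_IOW(RECLAIM_SWAPCACHE_MAGIC, 0x02, unsigned int)
#define RECLAIM_SWAPCACHE_ON	_IOW(RECLAIM_SWAPCACHE_MAGIC, 0x01, unsigned int)
#define RECLAIM_SWAPCACHE_OFF	_IOW(RECLAIM_SWAPCACHE_MAGIC, 0x00, unsigned int)

int main(int argc, char **argv)
{
	char path[64];
	unsigned int wmark;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/proc/%s/swap_pages", argv[1]);
	fd = open(path, O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Low watermark 10%, high watermark 20% of total RAM, packed as
	 * (high << 8) | low to match get_swapcache_watermark(). */
	wmark = (20 << 8) | 10;
	if (ioctl(fd, SET_SWAPCACHE_WMARK, &wmark) < 0)
		perror("SET_SWAPCACHE_WMARK");

	/* Kick off proactive swapcache reclaim in the background kthread. */
	if (ioctl(fd, RECLAIM_SWAPCACHE_ON, 0) < 0)
		perror("RECLAIM_SWAPCACHE_ON");

	close(fd);
	return 0;
}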
Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Yuchen Tang tangyuchen5@huawei.com --- fs/proc/etmem_swap.c | 174 +++++++++++++++++++++++++ include/linux/etmem.h | 30 +++++ mm/etmem.c | 289 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 493 insertions(+)
diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c index 4aad6b9db9a6..d6461e48cd3a 100644 --- a/fs/proc/etmem_swap.c +++ b/fs/proc/etmem_swap.c @@ -10,8 +10,26 @@ #include <linux/mempolicy.h> #include <linux/uaccess.h> #include <linux/delay.h> +#include <linux/numa.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/mm_inline.h> #include <linux/etmem.h>
+#define RECLAIM_SWAPCACHE_MAGIC 0X77 +#define SET_SWAPCACHE_WMARK _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x02, unsigned int) +#define RECLAIM_SWAPCACHE_ON _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x01, unsigned int) +#define RECLAIM_SWAPCACHE_OFF _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x00, unsigned int) + +#define WATERMARK_MAX 100 +#define SWAP_SCAN_NUM_MAX 32 + +static struct task_struct *reclaim_swapcache_tk; +static bool enable_swapcache_reclaim; +static unsigned long swapcache_watermark[ETMEM_SWAPCACHE_NR_WMARK]; + +static DECLARE_WAIT_QUEUE_HEAD(reclaim_queue); + static ssize_t swap_pages_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -80,8 +98,153 @@ static int swap_pages_release(struct inode *inode, struct file *file) return 0; }
+/* check if swapcache meet requirements */ +static bool swapcache_balanced(void) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH]; +} + +/* the flag present if swapcache reclaim is started */ +static bool swapcache_reclaim_enabled(void) +{ + return READ_ONCE(enable_swapcache_reclaim); +} + +static void start_swapcache_reclaim(void) +{ + if (swapcache_balanced()) + return; + /* RECLAIM_SWAPCACHE_ON trigger the thread to start running. */ + if (!waitqueue_active(&reclaim_queue)) + return; + + WRITE_ONCE(enable_swapcache_reclaim, true); + wake_up_interruptible(&reclaim_queue); +} + +static void stop_swapcache_reclaim(void) +{ + WRITE_ONCE(enable_swapcache_reclaim, false); +} + +static bool should_goto_sleep(void) +{ + if (swapcache_balanced()) + stop_swapcache_reclaim(); + + if (swapcache_reclaim_enabled()) + return false; + + return true; +} + +static int get_swapcache_watermark(unsigned int ratio) +{ + unsigned int low_watermark; + unsigned int high_watermark; + + low_watermark = ratio & 0xFF; + high_watermark = (ratio >> 8) & 0xFF; + if (low_watermark > WATERMARK_MAX || + high_watermark > WATERMARK_MAX || + low_watermark > high_watermark) + return -EPERM; + + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] = totalram_pages() * + low_watermark / WATERMARK_MAX; + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH] = totalram_pages() * + high_watermark / WATERMARK_MAX; + + return 0; +} + extern struct file_operations proc_swap_pages_operations;
+static void reclaim_swapcache_try_to_sleep(void) +{ + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&reclaim_queue, &wait, TASK_INTERRUPTIBLE); + if (should_goto_sleep()) { + if (!kthread_should_stop()) + schedule(); + } + finish_wait(&reclaim_queue, &wait); +} + +static void etmem_reclaim_swapcache(void) +{ + do_swapcache_reclaim(swapcache_watermark, + ARRAY_SIZE(swapcache_watermark)); + stop_swapcache_reclaim(); +} + +static int reclaim_swapcache_proactive(void *para) +{ + set_freezable(); + + while (1) { + bool ret; + + reclaim_swapcache_try_to_sleep(); + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + if (ret) + continue; + + etmem_reclaim_swapcache(); + } + + return 0; +} + +static int reclaim_swapcache_run(void) +{ + int ret = 0; + + reclaim_swapcache_tk = kthread_run(reclaim_swapcache_proactive, NULL, + "etmem_recalim_swapcache"); + if (IS_ERR(reclaim_swapcache_tk)) { + ret = PTR_ERR(reclaim_swapcache_tk); + reclaim_swapcache_tk = NULL; + } + return ret; +} + +static long swap_page_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int ratio; + + switch (cmd) { + case RECLAIM_SWAPCACHE_ON: + if (swapcache_reclaim_enabled()) + return 0; + start_swapcache_reclaim(); + break; + case RECLAIM_SWAPCACHE_OFF: + stop_swapcache_reclaim(); + break; + case SET_SWAPCACHE_WMARK: + if (get_user(ratio, (unsigned int __user *)argp)) + return -EFAULT; + + if (get_swapcache_watermark(ratio) != 0) + return -EFAULT; + break; + default: + return -EPERM; + } + + return 0; +} + static int swap_pages_entry(void) { proc_swap_pages_operations.flock(NULL, 1, NULL); @@ -89,8 +252,12 @@ static int swap_pages_entry(void) proc_swap_pages_operations.write = swap_pages_write; proc_swap_pages_operations.open = swap_pages_open; proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.unlocked_ioctl = swap_page_ioctl; proc_swap_pages_operations.flock(NULL, 0, NULL);
+ enable_swapcache_reclaim = false; + reclaim_swapcache_run(); + return 0; }
@@ -101,7 +268,14 @@ static void swap_pages_exit(void) proc_swap_pages_operations.write = NULL; proc_swap_pages_operations.open = NULL; proc_swap_pages_operations.release = NULL; + proc_swap_pages_operations.unlocked_ioctl = NULL; proc_swap_pages_operations.flock(NULL, 0, NULL); + + if (!IS_ERR(reclaim_swapcache_tk)) { + kthread_stop(reclaim_swapcache_tk); + reclaim_swapcache_tk = NULL; + } + return; }
MODULE_LICENSE("GPL"); diff --git a/include/linux/etmem.h b/include/linux/etmem.h index 668b19147ca9..a274785216d1 100644 --- a/include/linux/etmem.h +++ b/include/linux/etmem.h @@ -9,6 +9,22 @@ #include <linux/page-flags.h>
#ifdef CONFIG_ETMEM +/** + * list_for_each_entry_safe_reverse_from + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type from current point, safe against + * removal of list entry. + */ +#define list_for_each_entry_safe_reverse_from(pos, n, head, member) \ + for (n = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_prev_entry(n, member)) + + int find_best_peer_node(int nid);
#if IS_ENABLED(CONFIG_KVM) @@ -46,12 +62,20 @@ static inline enum node_type get_node_type(int nid) } #endif
+enum etmem_swapcache_watermark_en { + ETMEM_SWAPCACHE_WMARK_LOW, + ETMEM_SWAPCACHE_WMARK_HIGH, + ETMEM_SWAPCACHE_NR_WMARK +}; + extern struct kobj_attribute kernel_swap_enable_attr; extern unsigned int sysctl_vmemmap_block_from_dram; extern int add_page_for_swap(struct page *page, struct list_head *pagelist); extern struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr); extern bool kernel_swap_enabled(void); +extern int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr); #else /* !CONFIG_ETMEM */ static inline int add_page_for_swap(struct page *page, struct list_head *pagelist) { @@ -68,5 +92,11 @@ static inline bool kernel_swap_enabled(void) { return true; } + +static inline int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + return 0; +} #endif /* #ifdef CONFIG_ETMEM */ #endif /* define __MM_ETMEM_H_ */ diff --git a/mm/etmem.c b/mm/etmem.c index b5c5cf7768c6..90abf8ac4596 100644 --- a/mm/etmem.c +++ b/mm/etmem.c @@ -147,3 +147,292 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) return page; } EXPORT_SYMBOL_GPL(get_page_from_vaddr); + +static int add_page_for_reclaim_swapcache(struct page *page, + struct list_head *pagelist, struct lruvec *lruvec, enum lru_list lru) +{ + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EINVAL; + + if (PageHuge(page)) + return -EINVAL; + + head = compound_head(page); + if (!PageLRU(head) || PageUnevictable(head)) + return -EBUSY; + + if (unlikely(!get_page_unless_zero(page))) + return -EBUSY; + + if (!TestClearPageLRU(page)) { + /* + * This page may in other isolation path, + * but we still hold lru_lock. + */ + put_page(page); + return -EBUSY; + } + + list_move(&head->lru, pagelist); + update_lru_size(lruvec, lru, page_zonenum(head), -thp_nr_pages(head)); + + return 0; +} + +static unsigned long reclaim_swapcache_pages_from_list(int nid, + struct list_head *page_list, unsigned long reclaim_num, bool putback_flag) +{ + unsigned long nr_reclaimed = 0; + unsigned long nr_moved = 0; + struct page *page, *next; + LIST_HEAD(swap_pages); + struct pglist_data *pgdat = NULL; + + pgdat = NODE_DATA(nid); + + if (putback_flag) + goto putback_list; + + if (reclaim_num == 0) + return 0; + + list_for_each_entry_safe(page, next, page_list, lru) { + if (!page_is_file_lru(page) && !__PageMovable(page) + && PageSwapCache(page)) { + ClearPageActive(page); + list_move(&page->lru, &swap_pages); + nr_moved++; + } + + if (nr_moved >= reclaim_num) + break; + } + + /* swap the pages */ + if (pgdat) + nr_reclaimed = reclaim_pages(&swap_pages); + + return nr_reclaimed; + +putback_list: + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; +} + +#define SWAP_SCAN_NUM_MAX 32 + +static bool swapcache_below_watermark(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]; +} + +static unsigned long get_swapcache_reclaim_num(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() > + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ? + (total_swapcache_pages() - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0; +} + +/* + * The main function to reclaim swapcache, the whole reclaim process is + * divided into 3 steps. + * 1. get the total_swapcache_pages num to reclaim. + * 2. 
scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. + * 3. reclaim the swapcache page until the requirements are meet. + */ +int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + int err = -EINVAL; + unsigned long swapcache_to_reclaim = 0; + unsigned long nr_reclaimed = 0; + unsigned long swapcache_total_reclaimable = 0; + unsigned long reclaim_page_count = 0; + + unsigned long *nr = NULL; + unsigned long *nr_to_reclaim = NULL; + struct list_head *swapcache_list = NULL; + + int nid = 0; + struct lruvec *lruvec = NULL; + struct list_head *src = NULL; + struct page *page = NULL; + struct page *next = NULL; + struct page *pos = NULL; + + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = NULL; + + pg_data_t *pgdat = NULL; + unsigned int scan_count = 0; + int nid_num = 0; + + if (swapcache_watermark == NULL || + watermark_nr < ETMEM_SWAPCACHE_NR_WMARK) + return err; + + /* get the total_swapcache_pages num to reclaim. */ + swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark); + if (swapcache_to_reclaim <= 0) + return err; + + nr = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr == NULL) + return -ENOMEM; + + nr_to_reclaim = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr_to_reclaim == NULL) { + kfree(nr); + return -ENOMEM; + } + + swapcache_list = kcalloc(MAX_NUMNODES, sizeof(struct list_head), GFP_KERNEL); + if (swapcache_list == NULL) { + kfree(nr); + kfree(nr_to_reclaim); + return -ENOMEM; + } + + /* + * scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimd. + */ + for_each_node_state(nid, N_MEMORY) { + INIT_LIST_HEAD(&swapcache_list[nid_num]); + cond_resched(); + + pgdat = NODE_DATA(nid); + + memcg = mem_cgroup_iter(target_memcg, NULL, NULL); + do { + cond_resched(); + pos = NULL; + lruvec = mem_cgroup_lruvec(memcg, pgdat); + src = &(lruvec->lists[LRU_INACTIVE_ANON]); + spin_lock_irq(&lruvec->lru_lock); + scan_count = 0; + + /* + * Scan the swapcache pages that are not mapped from + * the end of the LRU linked list, scan SWAP_SCAN_NUM_MAX + * pages each time, and record the scan end point page. + */ + + pos = list_last_entry(src, struct page, lru); + spin_unlock_irq(&lruvec->lru_lock); +do_scan: + cond_resched(); + scan_count = 0; + spin_lock_irq(&lruvec->lru_lock); + + /* + * check if pos page is been released or not in LRU list, if true, + * cancel the subsequent page scanning of the current node. + */ + if (!pos || list_entry_is_head(pos, src, lru)) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + if (!PageLRU(pos) || folio_lru_list(page_folio(pos)) != LRU_INACTIVE_ANON) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + page = pos; + pos = NULL; + /* Continue to scan down from the last scan breakpoint */ + list_for_each_entry_safe_reverse_from(page, next, src, lru) { + scan_count++; + pos = next; + if (scan_count >= SWAP_SCAN_NUM_MAX) + break; + + if (!PageSwapCache(page)) + continue; + + if (page_mapped(page)) + continue; + + if (add_page_for_reclaim_swapcache(page, + &swapcache_list[nid_num], + lruvec, LRU_INACTIVE_ANON) != 0) + continue; + + nr[nid_num]++; + swapcache_total_reclaimable++; + } + spin_unlock_irq(&lruvec->lru_lock); + + /* + * Check whether the scanned pages meet + * the reclaim requirements. 
+ */ + if (swapcache_total_reclaimable <= swapcache_to_reclaim || + scan_count >= SWAP_SCAN_NUM_MAX) + goto do_scan; + + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); + + /* Start reclaiming the next memory node. */ + nid_num++; + } + + /* reclaim the swapcache page until the requirements are meet. */ + do { + nid_num = 0; + reclaim_page_count = 0; + + /* start swapcache page reclaim for each node. */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + + nr_to_reclaim[nid_num] = (swapcache_total_reclaimable == 0) ? 0 : + ((swapcache_to_reclaim * nr[nid_num]) / + swapcache_total_reclaimable); + + reclaim_page_count += reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], + nr_to_reclaim[nid_num], false); + nid_num++; + } + + nr_reclaimed += reclaim_page_count; + + /* + * Check whether the swapcache page reaches the reclaim requirement or + * the number of the swapcache page reclaimd is 0. Stop reclaim. + */ + if (nr_reclaimed >= swapcache_to_reclaim || reclaim_page_count == 0) + goto exit; + } while (!swapcache_below_watermark(swapcache_watermark) || + nr_reclaimed < swapcache_to_reclaim); +exit: + nid_num = 0; + /* + * Repopulate the swapcache pages that are not reclaimd back + * to the LRU linked list. + */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], 0, true); + nid_num++; + } + + kfree(nr); + kfree(nr_to_reclaim); + kfree(swapcache_list); + + return 0; +} +EXPORT_SYMBOL_GPL(do_swapcache_reclaim);
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature feature: etmem bugzilla: https://gitee.com/openeuler/kernel/issues/I4OODH?from=project-issue CVE: NA
-------------------------------------------------
Add a /proc/sys/vm/hugepage_nocache_copy switch. Set it to 1 to copy hugepages with non-temporal (movnt) store instructions when the CPU supports them; set it to 0 to copy hugepages as usual.
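For example, once this lands the copy path can be toggled at runtime (root only; the sysctl is registered with mode 0600 in the diff below):

echo 1 > /proc/sys/vm/hugepage_nocache_copy
echo 0 > /proc/sys/vm/hugepage_nocache_copy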
Signed-off-by: Yuchen Tang tangyuchen5@huawei.com Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/include/asm/page_64.h | 9 +++ arch/x86/lib/Makefile | 1 + arch/x86/lib/copy_highpages.c | 107 +++++++++++++++++++++++++++++++++ arch/x86/lib/copy_page_64.S | 73 ++++++++++++++++++++++ include/linux/highmem.h | 17 ++++++ mm/util.c | 5 ++ 6 files changed, 212 insertions(+) create mode 100644 arch/x86/lib/copy_highpages.c
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index cc6b8e087192..115e32f4ef59 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -60,6 +60,15 @@ static inline void clear_page(void *page)
void copy_page(void *to, void *from);
+#ifdef CONFIG_ETMEM +void copy_page_nocache(void *to, void *from); +void copy_page_nocache_barrir(void); + +struct folio; +#define __HAVE_ARCH_COPY_HUGEPAGES 1 +void copy_highpages(struct folio *dst, struct folio *src); +#endif + #ifdef CONFIG_X86_5LEVEL /* * User space process size. This is the first address outside the user range. diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index ea3a28e7b613..d735ab09ad4d 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -74,4 +74,5 @@ endif lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o copy_user_uncached_64.o lib-y += cmpxchg16b_emu.o + lib-(CONFIG_ETMEM) += copy_highpages.o endif diff --git a/arch/x86/lib/copy_highpages.c b/arch/x86/lib/copy_highpages.c new file mode 100644 index 000000000000..1e3795e83586 --- /dev/null +++ b/arch/x86/lib/copy_highpages.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * accelerate copying page to pmem with non-temproal stroes + */ +#include <linux/sched.h> +#include <linux/mmzone.h> +#include <linux/highmem.h> +#include <linux/sysctl.h> + +DEFINE_STATIC_KEY_FALSE(hugepage_nocache_copy); +#ifdef CONFIG_SYSCTL +static void set_hugepage_nocache_copy(bool enabled) +{ + if (enabled) + static_branch_enable(&hugepage_nocache_copy); + else + static_branch_disable(&hugepage_nocache_copy); +} + +int sysctl_hugepage_nocache_copy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + state = static_branch_unlikely(&hugepage_nocache_copy); + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_hugepage_nocache_copy(state); + return err; +} + +static struct ctl_table copy_highpages_table[] = { + { + .procname = "hugepage_nocache_copy", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0600, + .proc_handler = sysctl_hugepage_nocache_copy, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static struct ctl_table copy_highpages_root_table[] = { + { + .procname = "vm", + .mode = 0555, + .child = copy_highpages_table, + }, + {} +}; + +static __init int copy_highpages_init(void) +{ + return register_sysctl_table(copy_highpages_root_table) ? 
0 : -ENOMEM; +} +late_initcall(copy_highpages_init); +#endif + +static void copy_highpages_nocache(struct folio *dst, struct folio *src) +{ + char *vfrom, *vto; + int i; + int nr = folio_nr_pages(src); + + for (i = 0; i < nr; i++) { + cond_resched(); + vfrom = kmap_atomic(folio_page(src, i)); + vto = kmap_atomic(folio_page(dst, i)); + copy_page_nocache(vto, vfrom); + kunmap_atomic(vto); + kunmap_atomic(vfrom); + } + copy_page_nocache_barrir(); +} + +static void copy_highpages_cache(struct folio *dst, struct folio *src) +{ + long i = 0; + long nr = folio_nr_pages(src); + + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; + cond_resched(); + } +} + +void copy_highpages(struct folio *dst, struct folio *src) +{ + if (static_branch_unlikely(&hugepage_nocache_copy) && + get_node_type(page_to_nid(folio_page(dst, 0))) == NODE_TYPE_PMEM) + return copy_highpages_nocache(dst, src); + + return copy_highpages_cache(dst, src); +} diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 30ea644bf446..8c80f4111dc2 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -87,3 +87,76 @@ SYM_FUNC_START_LOCAL(copy_page_regs) addq $2*8, %rsp RET SYM_FUNC_END(copy_page_regs) + +SYM_FUNC_START(copy_page_nocache) + ALTERNATIVE "jmp copy_page", "", X86_FEATURE_XMM2 + subq $2*8, %rsp + movq %rbx, (%rsp) + movq %r12, 1*8(%rsp) + + movl $(4096/64)-5, %ecx + .p2align 4 +.LoopNT64: + dec %rcx + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movnti %rax, 0x8*0(%rdi) + movnti %rbx, 0x8*1(%rdi) + movnti %rdx, 0x8*2(%rdi) + movnti %r8, 0x8*3(%rdi) + movnti %r9, 0x8*4(%rdi) + movnti %r10, 0x8*5(%rdi) + movnti %r11, 0x8*6(%rdi) + movnti %r12, 0x8*7(%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .LoopNT64 + + movl $5, %ecx + .p2align 4 +.LoopNT2: + decl %ecx + + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + movnti %rax, 0x8*0(%rdi) + movnti %rbx, 0x8*1(%rdi) + movnti %rdx, 0x8*2(%rdi) + movnti %r8, 0x8*3(%rdi) + movnti %r9, 0x8*4(%rdi) + movnti %r10, 0x8*5(%rdi) + movnti %r11, 0x8*6(%rdi) + movnti %r12, 0x8*7(%rdi) + + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi + jnz .LoopNT2 + + movq (%rsp), %rbx + movq 1*8(%rsp), %r12 + addq $2*8, %rsp + ret +SYM_FUNC_END(copy_page_nocache) + +SYM_FUNC_START(copy_page_nocache_barrir) + ALTERNATIVE "", "sfence", X86_FEATURE_XMM2 + ret +SYM_FUNC_END(copy_page_nocache_barrir) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index c1a0d23c61ea..fe5b75dd7647 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -565,4 +565,21 @@ static inline void unmap_and_put_page(struct page *page, void *addr) put_page(page); }
+#ifndef __HAVE_ARCH_COPY_HUGEPAGES + +static inline void copy_highpages(struct folio *dst, struct folio *src) +{ + long i = 0; + long nr = folio_nr_pages(src); + + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; + cond_resched(); + } +} + +#endif /* __HAVE_ARCH_COPY_HUGEPAGES */ + #endif /* _LINUX_HIGHMEM_H */ diff --git a/mm/util.c b/mm/util.c index be798981acc7..881d2453dbfb 100644 --- a/mm/util.c +++ b/mm/util.c @@ -23,6 +23,7 @@ #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> +#include <linux/page-flags.h>
#include <linux/uaccess.h>
@@ -799,6 +800,9 @@ EXPORT_SYMBOL(folio_mapping); */ void folio_copy(struct folio *dst, struct folio *src) { +#ifdef CONFIG_ETMEM + copy_highpages(dst, src); +#else long i = 0; long nr = folio_nr_pages(src);
@@ -808,6 +812,7 @@ void folio_copy(struct folio *dst, struct folio *src) break; cond_resched(); } +#endif }
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature feature: etmem bugzilla: https://gitee.com/openeuler/kernel/issues/I4OODH?from=project-issue CVE: NA
-------------------------------------------------
Add a /proc/sys/vm/hugepage_pmem_allocall switch. Set it to 1 to allow all memory on a PMEM node to be allocated for hugepages; set it to 0 (default) to keep hugepage allocation limited by the zone watermarks as usual. Add a /proc/sys/vm/hugepage_mig_noalloc switch. Set it to 1 to forbid allocating new hugepages during hugepage migration when hugepages on the destination node run out; set it to 0 (default) to allow hugepage allocation during migration as usual.
Signed-off-by: Yuchen Tang tangyuchen5@huawei.com Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/etmem.h | 3 +++ include/linux/highmem.h | 4 ++-- mm/etmem.c | 37 +++++++++++++++++++++++++++++++++++++ mm/hugetlb.c | 9 +++++++++ 4 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/include/linux/etmem.h b/include/linux/etmem.h index a274785216d1..a51df74c28e1 100644 --- a/include/linux/etmem.h +++ b/include/linux/etmem.h @@ -39,6 +39,9 @@ static inline struct kvm *mm_kvm(struct mm_struct *mm) } #endif
+extern int sysctl_hugetlb_mig_noalloc; +extern int sysctl_hugetlb_pmem_allocall; + enum node_type { NODE_TYPE_DRAM, NODE_TYPE_PMEM, diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fe5b75dd7647..ea88d7239195 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -565,6 +565,7 @@ static inline void unmap_and_put_page(struct page *page, void *addr) put_page(page); }
+#ifdef CONFIG_ETMEM #ifndef __HAVE_ARCH_COPY_HUGEPAGES
static inline void copy_highpages(struct folio *dst, struct folio *src) @@ -579,7 +580,6 @@ static inline void copy_highpages(struct folio *dst, struct folio *src) cond_resched(); } } - #endif /* __HAVE_ARCH_COPY_HUGEPAGES */ - +#endif /* CONFIG_ETMEM */ #endif /* _LINUX_HIGHMEM_H */ diff --git a/mm/etmem.c b/mm/etmem.c index 90abf8ac4596..37e389acb463 100644 --- a/mm/etmem.c +++ b/mm/etmem.c @@ -9,6 +9,10 @@ #include <linux/etmem.h> #include "internal.h"
+#ifdef CONFIG_ETMEM +int sysctl_hugetlb_mig_noalloc; +int sysctl_hugetlb_pmem_allocall; +#endif static bool enable_kernel_swap __read_mostly = true; unsigned int sysctl_vmemmap_block_from_dram; #ifdef CONFIG_NUMA @@ -94,6 +98,39 @@ static int __init vmemmap_sysctl_init(void) late_initcall(vmemmap_sysctl_init); #endif
+ +#ifdef CONFIG_HUGETLBFS +static struct ctl_table hugetlb_pmem_table[] = { + { + .procname = "hugepage_mig_noalloc", + .data = &sysctl_hugetlb_mig_noalloc, + .maxlen = sizeof(sysctl_hugetlb_mig_noalloc), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hugepage_pmem_allocall", + .data = &sysctl_hugetlb_pmem_allocall, + .maxlen = sizeof(sysctl_hugetlb_pmem_allocall), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init hugetlb_pmem_init(void) +{ + register_sysctl_init("kernel", hugetlb_pmem_table); + return 0; +} +late_initcall(hugetlb_pmem_init); +#endif + + int add_page_for_swap(struct page *page, struct list_head *pagelist) { int err = -EBUSY; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1a4d388b6a3b..1b714e3ea680 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,6 +35,7 @@ #include <linux/delayacct.h> #include <linux/memory.h> #include <linux/mm_inline.h> +#include <linux/etmem.h>
#include <asm/page.h> #include <asm/pgalloc.h> @@ -2233,6 +2234,10 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { +#ifdef CONFIG_ETMEM + if (get_node_type(node) == NODE_TYPE_PMEM && sysctl_hugetlb_pmem_allocall) + gfp_mask |= __GFP_MEMALLOC; +#endif folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) { @@ -2501,7 +2506,11 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); +#ifdef CONFIG_ETMEM + if (folio || sysctl_hugetlb_mig_noalloc) { +#else if (folio) { +#endif spin_unlock_irq(&hugetlb_lock); return folio; }
On 2024/1/2 19:40, Yuchen Tang wrote:
From: Kemeng Shi shikemeng@huawei.com
euleros inclusion category: feature feature: etmem bugzilla: https://gitee.com/openeuler/kernel/issues/I4OODH?from=project-issue CVE: NA
Add a /proc/sys/vm/hugepage_pmem_allocall switch. Set it to 1 to allow all memory on a PMEM node to be allocated for hugepages; set it to 0 (default) to keep hugepage allocation limited by the zone watermarks as usual. Add a /proc/sys/vm/hugepage_mig_noalloc switch. Set it to 1 to forbid allocating new hugepages during hugepage migration when hugepages on the destination node run out; set it to 0 (default) to allow hugepage allocation during migration as usual.
Signed-off-by: Yuchen Tang tangyuchen5@huawei.com Signed-off-by: Kemeng Shi shikemeng@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
include/linux/etmem.h | 3 +++ include/linux/highmem.h | 4 ++-- mm/etmem.c | 37 +++++++++++++++++++++++++++++++++++++ mm/hugetlb.c | 9 +++++++++ 4 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/include/linux/etmem.h b/include/linux/etmem.h index a274785216d1..a51df74c28e1 100644 --- a/include/linux/etmem.h +++ b/include/linux/etmem.h @@ -39,6 +39,9 @@ static inline struct kvm *mm_kvm(struct mm_struct *mm) } #endif
+extern int sysctl_hugetlb_mig_noalloc; +extern int sysctl_hugetlb_pmem_allocall;
Here you also need to provide definitions for the case where etmem is not enabled:
#define sysctl_hugetlb_mig_noalloc 0
#define sysctl_hugetlb_pmem_allocall 0
Also, please reconsider the naming; the current names are hard to understand.
enum node_type { NODE_TYPE_DRAM, NODE_TYPE_PMEM, diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fe5b75dd7647..ea88d7239195 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -565,6 +565,7 @@ static inline void unmap_and_put_page(struct page *page, void *addr) put_page(page); }
+#ifdef CONFIG_ETMEM #ifndef __HAVE_ARCH_COPY_HUGEPAGES
static inline void copy_highpages(struct folio *dst, struct folio *src) @@ -579,7 +580,6 @@ static inline void copy_highpages(struct folio *dst, struct folio *src) cond_resched(); } }
- #endif /* __HAVE_ARCH_COPY_HUGEPAGES */
+#endif /* CONFIG_ETMEM */ #endif /* _LINUX_HIGHMEM_H */ diff --git a/mm/etmem.c b/mm/etmem.c index 90abf8ac4596..37e389acb463 100644 --- a/mm/etmem.c +++ b/mm/etmem.c @@ -9,6 +9,10 @@ #include <linux/etmem.h> #include "internal.h"
+#ifdef CONFIG_ETMEM +int sysctl_hugetlb_mig_noalloc; +int sysctl_hugetlb_pmem_allocall; +#endif static bool enable_kernel_swap __read_mostly = true; unsigned int sysctl_vmemmap_block_from_dram; #ifdef CONFIG_NUMA @@ -94,6 +98,39 @@ static int __init vmemmap_sysctl_init(void) late_initcall(vmemmap_sysctl_init); #endif
+#ifdef CONFIG_HUGETLBFS +static struct ctl_table hugetlb_pmem_table[] = {
- {
.procname = "hugepage_mig_noalloc",
.data = &sysctl_hugetlb_mig_noalloc,
.maxlen = sizeof(sysctl_hugetlb_mig_noalloc),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
- },
- {
.procname = "hugepage_pmem_allocall",
.data = &sysctl_hugetlb_pmem_allocall,
.maxlen = sizeof(sysctl_hugetlb_pmem_allocall),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
- },
- { }
+};
+static int __init hugetlb_pmem_init(void) +{
- register_sysctl_init("kernel", hugetlb_pmem_table);
- return 0;
+} +late_initcall(hugetlb_pmem_init); +#endif
- int add_page_for_swap(struct page *page, struct list_head *pagelist) { int err = -EBUSY;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1a4d388b6a3b..1b714e3ea680 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,6 +35,7 @@ #include <linux/delayacct.h> #include <linux/memory.h> #include <linux/mm_inline.h> +#include <linux/etmem.h>
The code below has the same problem:
#include <asm/page.h> #include <asm/pgalloc.h> @@ -2233,6 +2234,10 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { +#ifdef CONFIG_ETMEM
if (get_node_type(node) == NODE_TYPE_PMEM && sysctl_hugetlb_pmem_allocall)
gfp_mask |= __GFP_MEMALLOC;
+#endif
Please define a predicate helper for this in your header file; one possible shape is sketched after this review.
folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) {
@@ -2501,7 +2506,11 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask);
+#ifdef CONFIG_ETMEM
There is no need to add the etmem config check here; instead, the header should define the knob as 0 when etmem is not enabled.
if (folio || sysctl_hugetlb_mig_noalloc) {
+#else if (folio) { +#endif spin_unlock_irq(&hugetlb_lock); return folio; }
Feedback: The patch(es) you sent to kernel@openeuler.org failed to be converted to a PR! Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/N... Failure reason: applying the patch(es) failed, Patch failed at 0001 etmem: add etmem scan feature to openEuler Suggested solution: please check whether the failed patch(es) apply on top of the newest code in the expected branch